blob: 80b15d2f020b7f4d1744a4fc1fde636b42db120e [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/autofill_data_util.h"
#include <algorithm>
#include <vector>
#include "base/i18n/char_iterator.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/field_types.h"
#include "components/grit/components_scaled_resources.h"
#include "components/strings/grit/components_strings.h"
#include "third_party/icu/source/common/unicode/uscript.h"
#include "third_party/re2/src/re2/re2.h"
namespace autofill {
namespace data_util {
namespace {
// Mappings from Chrome card types to Payment Request API basic card payment
// spec types and icons. Note that "generic" is not in the spec.
// https://w3c.github.io/webpayments-methods-card/#method-id
//
// This table is searched linearly by GetPaymentRequestData() (matching on
// |card_type|) and by GetCardTypeForBasicCardPaymentType() (matching on
// |basic_card_payment_type|).
// NOTE(review): each entry appears to be {card type, basic card payment type,
// IDR_ icon resource id, IDS_ string resource id}; confirm the field order
// against the PaymentRequestData declaration in the header.
const PaymentRequestData kPaymentRequestData[]{
{"americanExpressCC", "amex", IDR_AUTOFILL_PR_AMEX, IDS_AUTOFILL_CC_AMEX},
{"dinersCC", "diners", IDR_AUTOFILL_PR_DINERS, IDS_AUTOFILL_CC_DINERS},
{"discoverCC", "discover", IDR_AUTOFILL_PR_DISCOVER,
IDS_AUTOFILL_CC_DISCOVER},
{"jcbCC", "jcb", IDR_AUTOFILL_PR_JCB, IDS_AUTOFILL_CC_JCB},
{"masterCardCC", "mastercard", IDR_AUTOFILL_PR_MASTERCARD,
IDS_AUTOFILL_CC_MASTERCARD},
{"mirCC", "mir", IDR_AUTOFILL_PR_MIR, IDS_AUTOFILL_CC_MIR},
{"unionPayCC", "unionpay", IDR_AUTOFILL_PR_UNIONPAY,
IDS_AUTOFILL_CC_UNION_PAY},
{"visaCC", "visa", IDR_AUTOFILL_PR_VISA, IDS_AUTOFILL_CC_VISA},
};
// Fallback entry for cards that match nothing in |kPaymentRequestData|:
// GetPaymentRequestData() returns it when no card type matches, and
// GetCardTypeForBasicCardPaymentType() returns its card type when no basic
// card payment type matches.
const PaymentRequestData kGenericPaymentRequestData = {
"genericCC", "generic", IDR_AUTOFILL_PR_GENERIC, IDS_AUTOFILL_CC_GENERIC};
// Lookup tables used when splitting a full name into its parts. All entries
// are lower-case ASCII and are matched through ContainsString(), which trims
// periods from both ends of the token being looked up before comparing.

// Titles and ranks that StripPrefixes() removes from the front of a name.
const char* const name_prefixes[] = {
"1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt",
"captain", "col", "cpt", "dr", "gen", "general", "lcdr",
"lt", "ltc", "ltg", "ltjg", "maj", "major", "mg",
"mr", "mrs", "ms", "pastor", "prof", "rep", "reverend",
"rev", "sen", "st"};
// Generational and degree suffixes that StripSuffixes() removes from the end
// of a name. Interior periods are kept here (e.g. "ph.d") because only the
// periods at the ends of a token are trimmed before the comparison.
const char* const name_suffixes[] = {"b.a", "ba", "d.d.s", "dds", "i", "ii",
"iii", "iv", "ix", "jr", "m.a", "m.d",
"ma", "md", "ms", "ph.d", "phd", "sr",
"v", "vi", "vii", "viii", "x"};
// Particles that SplitName() folds into the family name when they directly
// precede the last token (e.g. "van", "de").
const char* const family_name_prefixes[] = {"d'", "de", "del", "der", "di",
"la", "le", "mc", "san", "st",
"ter", "van", "von"};
// The common and non-ambiguous CJK surnames (last names) that have more than
// one character.
// SplitCJKName() passes this table to StartsWithAny(), which converts each
// entry from UTF-8 to UTF-16 before comparing it with the start of the name.
const char* common_cjk_multi_char_surnames[] = {
// Korean, taken from the list of surnames:
// https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
"남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",
// Chinese, taken from the top 10 Chinese 2-character surnames:
// https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
// Simplified Chinese (mostly mainland China)
"欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",
// Traditional Chinese (mostly Taiwan)
"張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"
};
// All Korean surnames that have more than one character, even the
// rare/ambiguous ones.
// SplitCJKName() uses this table instead of |common_cjk_multi_char_surnames|
// when an unspaced, all-Hangul name is longer than 3 UTF-16 code units.
const char* korean_multi_char_surnames[] = {
"강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",
"소봉", "어금", "장곡", "제갈", "황목", "황보"
};
// Returns true if |set| contains |element|, modulo a final period.
bool ContainsString(const char* const set[],
size_t set_size,
base::StringPiece16 element) {
if (!base::IsStringASCII(element))
return false;
base::StringPiece16 trimmed_element =
base::TrimString(element, base::ASCIIToUTF16("."), base::TRIM_ALL);
for (size_t i = 0; i < set_size; ++i) {
if (base::LowerCaseEqualsASCII(trimmed_element, set[i]))
return true;
}
return false;
}
// Drops every leading token of |name_tokens| that appears in |name_prefixes|,
// stopping at the first token that does not. If every token is a prefix the
// vector ends up empty.
void StripPrefixes(std::vector<base::StringPiece16>* name_tokens) {
  const auto first_kept = std::find_if_not(
      name_tokens->begin(), name_tokens->end(),
      [](base::StringPiece16 token) {
        return ContainsString(name_prefixes, arraysize(name_prefixes), token);
      });
  name_tokens->erase(name_tokens->begin(), first_kept);
}
// Pops tokens off the end of |name_tokens| for as long as the last token
// appears in |name_suffixes|. If every token is a suffix the vector ends up
// empty.
void StripSuffixes(std::vector<base::StringPiece16>* name_tokens) {
  while (!name_tokens->empty() &&
         ContainsString(name_suffixes, arraysize(name_suffixes),
                        name_tokens->back())) {
    name_tokens->pop_back();
  }
}
// Checks, in table order, whether |name| begins with one of the
// |prefix_count| UTF-8 strings in |prefixes| (case-sensitive). Returns the
// length of the first matching prefix, measured in UTF-16 code units, or 0
// when none matches.
size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,
                     size_t prefix_count) {
  // Reused across iterations so each conversion can recycle the storage.
  base::string16 utf16_prefix;
  const char** const prefixes_end = prefixes + prefix_count;
  for (const char** current = prefixes; current != prefixes_end; ++current) {
    utf16_prefix.clear();
    base::UTF8ToUTF16(*current, strlen(*current), &utf16_prefix);
    if (base::StartsWith(name, utf16_prefix, base::CompareCase::SENSITIVE))
      return utf16_prefix.size();
  }
  return 0;
}
// Tells whether the Unicode script of |c| is one of the scripts used to write
// Chinese, Japanese or Korean. The ICU error code is not inspected.
bool IsCJKCharacter(UChar32 c) {
  UErrorCode status = U_ZERO_ERROR;
  const auto script = uscript_getScript(c, &status);
  return script == USCRIPT_HAN ||       // CJK logographs, used by all 3 (but
                                        // rarely for Korean)
         script == USCRIPT_HANGUL ||    // Korean alphabet
         script == USCRIPT_KATAKANA ||  // A Japanese syllabary
         script == USCRIPT_HIRAGANA ||  // A Japanese syllabary
         script == USCRIPT_BOPOMOFO;    // Chinese semisyllabary, rarely used
}
// Tells whether the Unicode script of |c| is Hangul (the Korean alphabet).
// The ICU error code is not inspected.
bool IsHangulCharacter(UChar32 c) {
  UErrorCode status = U_ZERO_ERROR;
  const auto script = uscript_getScript(c, &status);
  return script == USCRIPT_HANGUL;
}
// Tells whether |name| consists solely of Hangul characters and whitespace,
// i.e. looks like a Korean name. An empty |name| also yields true, so callers
// are expected to have vetted |name| with |IsCJKName()| beforehand.
bool IsHangulName(base::StringPiece16 name) {
  base::i18n::UTF16CharIterator cursor(name.data(), name.length());
  while (!cursor.end()) {
    const UChar32 code_point = cursor.get();
    const bool allowed =
        IsHangulCharacter(code_point) || base::IsUnicodeWhitespace(code_point);
    if (!allowed)
      return false;
    cursor.Advance();
  }
  return true;
}
// Splits a Chinese, Japanese or Korean name, given as |name_tokens|, into a
// family name and a given name stored in |parts|. Returns true on success.
// Returns false, leaving |parts| untouched, unless there are exactly one or
// two tokens.
//
// CJK convention puts the surname (last name) first and the given name
// second, normally with no space in between, although some people type one to
// disambiguate. CJK names (almost) never have a middle name.
bool SplitCJKName(const std::vector<base::StringPiece16>& name_tokens,
                  NameParts* parts) {
  const size_t token_count = name_tokens.size();

  if (token_count == 2) {
    // The user separated the two parts already: family first, given second.
    parts->family = name_tokens[0].as_string();
    parts->given = name_tokens[1].as_string();
    return true;
  }

  // Zero tokens, or more than two: we don't know what to do.
  if (token_count != 1)
    return false;

  // A single unspaced token, so the split point has to be inferred. Most
  // Chinese and Korean surnames are one character long, but a few have two;
  // look the start of the name up in a list of known multi-character surnames
  // and fall back to one character otherwise.
  //
  // TODO(crbug.com/89111): Japanese names with no space will be mis-split,
  // since we don't have a list of Japanese last names. In the Han alphabet,
  // it might also be difficult for us to differentiate between Chinese &
  // Japanese names.
  const base::StringPiece16& full_name = name_tokens.front();

  // 4-character Korean names are more likely to be 2/2 than 1/3, so those are
  // checked against the full Korean list rather than only the common
  // surnames.
  const bool use_full_korean_list =
      IsHangulName(full_name) && full_name.size() > 3;
  const size_t known_surname_length =
      use_full_korean_list
          ? StartsWithAny(full_name, korean_multi_char_surnames,
                          arraysize(korean_multi_char_surnames))
          : StartsWithAny(full_name, common_cjk_multi_char_surnames,
                          arraysize(common_cjk_multi_char_surnames));
  const size_t surname_length = std::max<size_t>(1, known_surname_length);

  parts->family = full_name.substr(0, surname_length).as_string();
  parts->given = full_name.substr(surname_length).as_string();
  return true;
}
} // namespace
bool IsCJKName(base::StringPiece16 name) {
  // A name counts as CJK when it contains nothing but CJK characters,
  // whitespace and "middle dot" separators, and the CJK characters form one
  // or two contiguous runs (words).
  //
  // Chinese and Japanese names are usually spelled out using the Han characters
  // (logographs), which constitute the "CJK Unified Ideographs" block in
  // Unicode, also referred to as Unihan. Korean names are usually spelled out
  // in the Korean alphabet (Hangul), although they do have a Han equivalent as
  // well.
  //
  // The middle dot is used as a separator for foreign names in Japanese.
  static const base::char16 kKatakanaMiddleDot = u'\u30FB';
  // A (common?) typo for 'KATAKANA MIDDLE DOT' (U+30FB).
  static const base::char16 kMiddleDot = u'\u00B7';

  size_t cjk_run_count = 0;
  bool in_cjk_run = false;
  base::i18n::UTF16CharIterator cursor(name.data(), name.length());
  while (!cursor.end()) {
    const UChar32 code_point = cursor.get();
    if (IsCJKCharacter(code_point)) {
      // Entering a run of CJK characters starts a new word.
      if (!in_cjk_run)
        ++cjk_run_count;
      in_cjk_run = true;
    } else {
      const bool is_separator = base::IsUnicodeWhitespace(code_point) ||
                                code_point == kKatakanaMiddleDot ||
                                code_point == kMiddleDot;
      // Anything that is neither CJK nor a separator disqualifies the name.
      if (!is_separator)
        return false;
      in_cjk_run = false;
    }
    cursor.Advance();
  }
  return cjk_run_count > 0 && cjk_run_count < 3;
}
// Splits the full |name| into given, middle and family parts. Known prefixes
// are removed first. A CJK name is split by SplitCJKName(). Otherwise known
// suffixes are removed, the last token (plus any family-name particles
// directly before it) becomes the family name, the token before that becomes
// the middle name if at least two tokens remain, and the rest is the given
// name.
NameParts SplitName(base::StringPiece16 name) {
  static const base::char16 kWordSeparators[] = {
      u' ',       // ASCII space.
      u',',       // ASCII comma.
      u'\u3000',  // 'IDEOGRAPHIC SPACE' (U+3000).
      u'\u30FB',  // 'KATAKANA MIDDLE DOT' (U+30FB).
      u'\u00B7',  // 'MIDDLE DOT' (U+00B7).
      u'\0'       // End of string.
  };
  std::vector<base::StringPiece16> tokens = base::SplitStringPiece(
      name, kWordSeparators, base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
  StripPrefixes(&tokens);

  NameParts result;

  // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have
  // the given name before the surname, and should be treated as special cases
  // too.
  // CJK names get their own splitting rules.
  if (IsCJKName(name) && SplitCJKName(tokens, &result))
    return result;

  // Suffixes are only stripped from names of three or more tokens, so that
  // "Ma" is not taken for a suffix in "John Ma".
  if (tokens.size() > 2)
    StripSuffixes(&tokens);

  if (tokens.empty()) {
    // Nothing usable is left; treat the whole input as a given name.
    result.given = name.as_string();
    return result;
  }

  if (tokens.size() == 1) {
    // A lone token is assumed to be the given name.
    result.given = tokens.front().as_string();
    return result;
  }

  // Two or more tokens. The family name is the last token, extended backwards
  // over any directly preceding family-name prefixes ("van", "de", ...).
  size_t family_begin = tokens.size() - 1;
  while (family_begin > 0 &&
         ContainsString(family_name_prefixes, arraysize(family_name_prefixes),
                        tokens[family_begin - 1])) {
    --family_begin;
  }

  const base::string16 space = base::ASCIIToUTF16(" ");
  const std::vector<base::StringPiece16> family_tokens(
      tokens.begin() + family_begin, tokens.end());
  result.family = base::JoinString(family_tokens, space);
  tokens.erase(tokens.begin() + family_begin, tokens.end());

  // With at least two tokens left, the last of them is the middle name.
  if (tokens.size() >= 2) {
    result.middle = tokens.back().as_string();
    tokens.pop_back();
  }

  // Whatever remains is the given name.
  result.given = base::JoinString(tokens, space);
  return result;
}
// Builds a full name from its parts. When there is no middle name and both
// |given| and |family| are CJK names, the result is "LastFirst" with no
// separator. Otherwise the non-empty parts are joined with single spaces in
// "First Middle Last" order.
base::string16 JoinNameParts(base::StringPiece16 given,
                             base::StringPiece16 middle,
                             base::StringPiece16 family) {
  // LastFirst. IsCJKName() only accepts names with at least one CJK
  // character, so both parts are non-empty here.
  if (middle.empty() && IsCJKName(given) && IsCJKName(family))
    return family.as_string() + given.as_string();

  // First Middle Last, skipping whatever is missing.
  std::vector<base::StringPiece16> ordered_parts;
  for (const base::StringPiece16& part : {given, middle, family}) {
    if (!part.empty())
      ordered_parts.push_back(part);
  }
  return base::JoinString(ordered_parts, base::ASCIIToUTF16(" "));
}
// Tells whether |full_name| is exactly equal to one of the usual ways of
// writing out the name stored in |profile|. The candidate layouts are listed
// in the table below. Missing parts are not collapsed, so an empty middle
// name still contributes its surrounding separators.
bool ProfileMatchesFullName(base::StringPiece16 full_name,
                            const autofill::AutofillProfile& profile) {
  const base::string16 space = base::ASCIIToUTF16(" ");
  const base::string16 period_space = base::ASCIIToUTF16(". ");

  const base::string16 first = profile.GetRawInfo(autofill::NAME_FIRST);
  const base::string16 middle = profile.GetRawInfo(autofill::NAME_MIDDLE);
  const base::string16 initial =
      profile.GetRawInfo(autofill::NAME_MIDDLE_INITIAL);
  const base::string16 last = profile.GetRawInfo(autofill::NAME_LAST);

  const base::string16 candidates[] = {
      first + space + last,                          // First Last
      first + space + middle + space + last,         // First Middle Last
      first + space + initial + space + last,        // First M Last
      first + space + initial + period_space + last, // First M. Last
      last + space + first,                          // Last First
      last + first,                                  // LastFirst
  };

  for (const base::string16& candidate : candidates) {
    if (full_name.compare(candidate) == 0)
      return true;
  }
  return false;
}
// Looks up the |kPaymentRequestData| entry whose |card_type| equals |type|.
// Returns |kGenericPaymentRequestData| when no entry matches.
const PaymentRequestData& GetPaymentRequestData(const std::string& type) {
  const auto* const table_end = std::end(kPaymentRequestData);
  const auto* const match = std::find_if(
      std::begin(kPaymentRequestData), table_end,
      [&type](const PaymentRequestData& entry) {
        return type == entry.card_type;
      });
  return match != table_end ? *match : kGenericPaymentRequestData;
}
const char* GetCardTypeForBasicCardPaymentType(
const std::string& basic_card_payment_type) {
for (const PaymentRequestData& data : kPaymentRequestData) {
if (basic_card_payment_type == data.basic_card_payment_type) {
return data.card_type;
}
}
return kGenericPaymentRequestData.card_type;
}
// Returns true if |country_code| is exactly two upper-case ASCII letters
// (the shape of an ISO 3166-1 alpha-2 code, e.g. "US"). This only checks the
// format; it does not check that the code is assigned to a country.
bool IsValidCountryCode(const std::string& country_code) {
  if (country_code.size() != 2)
    return false;

  // A direct range check accepts exactly what the pattern "^[A-Z]{2}$" did,
  // without compiling a regular expression on every call. Bytes outside
  // 'A'..'Z' (including non-ASCII bytes, whatever the signedness of char) are
  // rejected.
  const auto is_ascii_upper = [](char c) { return c >= 'A' && c <= 'Z'; };
  return is_ascii_upper(country_code[0]) && is_ascii_upper(country_code[1]);
}
// UTF-16 convenience overload: converts |country_code| to UTF-8 and applies
// the std::string overload's check to the result.
bool IsValidCountryCode(const base::string16& country_code) {
  const std::string utf8_country_code = base::UTF16ToUTF8(country_code);
  return IsValidCountryCode(utf8_country_code);
}
} // namespace data_util
} // namespace autofill