blob: b104744610720d9412ee73faedc907aed1066531 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This file defines a helper class for selecting a supported language from a
// set of candidates. It is used to get localized strings that are directly
// embedded into the executable / library instead of stored in external
// .pak files.
#include "base/win/embedded_i18n/language_selector.h"
#include <algorithm>
#include <functional>
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/string16.h"
#include "base/strings/string_util.h"
#include "base/win/i18n.h"
namespace base {
namespace win {
namespace i18n {
namespace {
// Shorthand for the table entry type declared by LanguageSelector; the code
// below reads its |language| and |offset| members.
using LangToOffset = LanguageSelector::LangToOffset;
// Holds pointers to LangToOffset pairs for specific languages that are the
// targets of exceptions (where one language is mapped to another) or wildcards
// (where a raw language identifier is mapped to a specific localization).
//
// A null member means the corresponding language is not present in the
// caller's table. Every member defaults to null so that an instance is always
// safe to test, even if it was not explicitly value-initialized.
struct AvailableLanguageAliases {
  // Target of the en-au, en-ca, en-nz and en-za aliases.
  const LangToOffset* en_gb_language_offset = nullptr;
  // Target of the neutral "en" wildcard and the overall fallback language.
  const LangToOffset* en_us_language_offset = nullptr;
  // Target of the es-es alias.
  const LangToOffset* es_language_offset = nullptr;
  // Target of the neutral "es" wildcard.
  const LangToOffset* es_419_language_offset = nullptr;
  // Target of the tl alias.
  const LangToOffset* fil_language_offset = nullptr;
  // Target of the he alias.
  const LangToOffset* iw_language_offset = nullptr;
  // Target of the nb alias.
  const LangToOffset* no_language_offset = nullptr;
  // Target of the neutral "pt" wildcard.
  const LangToOffset* pt_br_language_offset = nullptr;
  // Target of the zh-chs, zh-hans and zh-sg aliases and of the neutral "zh"
  // wildcard.
  const LangToOffset* zh_cn_language_offset = nullptr;
  // Target of the zh-cht, zh-hant, zh-hk and zh-mo aliases.
  const LangToOffset* zh_tw_language_offset = nullptr;
};
#if DCHECK_IS_ON()
// Returns true if the items in the given range are sorted. If
// |byNameAndOffset| is true, the items must be sorted by both name and offset.
bool IsArraySortedAndLowerCased(const LangToOffset* languages_to_offset_begin,
const LangToOffset* languages_to_offset_end,
bool byNameAndOffset) {
DCHECK(languages_to_offset_begin);
DCHECK(languages_to_offset_end);
int count_languages_to_offset =
languages_to_offset_end - languages_to_offset_begin;
if (count_languages_to_offset > 1) {
const LangToOffset* first = languages_to_offset_begin;
const LangToOffset* last = languages_to_offset_end;
for (--last; first != last; ++first) {
if (!(base::string16(first->language) < (first + 1)->language) ||
(byNameAndOffset && !(first->offset < (first + 1)->offset)) ||
base::string16(first->language) !=
base::ToLowerASCII(first->language)) {
return false;
}
}
} else if (count_languages_to_offset == 1) {
return base::ToLowerASCII(languages_to_offset_begin->language) ==
base::string16(languages_to_offset_begin->language);
}
return true;
}
// Validates that the language to offset mappings are lower cased and strictly
// sorted by both language and offset. Only compiled when DCHECKs are enabled.
void ValidateMappings(const LangToOffset* languages_to_offset_begin,
                      const LangToOffset* languages_to_offset_end) {
  // The check covers casing as well as ordering, so the failure message names
  // both possible causes.
  DCHECK(IsArraySortedAndLowerCased(languages_to_offset_begin,
                                    languages_to_offset_end, true))
      << "languages_to_offset is not sorted by language and offset, or "
         "contains a language that is not lower cased";
}
#endif // DCHECK_IS_ON()
// Walks the caller's table once and records which of the languages used as
// targets by GetAliasedLanguageOffset or GetCompatibleNeutralLanguageOffset
// are present in it. Members for languages that are absent stay null.
AvailableLanguageAliases DetermineAvailableAliases(
    const LangToOffset* languages_to_offset_begin,
    const LangToOffset* languages_to_offset_end) {
  // Associates a language of interest with the member that remembers it.
  struct AliasSlot {
    const wchar_t* language;
    const LangToOffset* AvailableLanguageAliases::*member;
  };
  static constexpr AliasSlot kAliasSlots[] = {
      {L"en-gb", &AvailableLanguageAliases::en_gb_language_offset},
      {L"en-us", &AvailableLanguageAliases::en_us_language_offset},
      {L"es", &AvailableLanguageAliases::es_language_offset},
      {L"es-419", &AvailableLanguageAliases::es_419_language_offset},
      {L"fil", &AvailableLanguageAliases::fil_language_offset},
      {L"iw", &AvailableLanguageAliases::iw_language_offset},
      {L"no", &AvailableLanguageAliases::no_language_offset},
      {L"pt-br", &AvailableLanguageAliases::pt_br_language_offset},
      {L"zh-cn", &AvailableLanguageAliases::zh_cn_language_offset},
      {L"zh-tw", &AvailableLanguageAliases::zh_tw_language_offset},
  };

  AvailableLanguageAliases available_aliases = {};
  for (const LangToOffset* entry = languages_to_offset_begin;
       entry != languages_to_offset_end; ++entry) {
    const base::string16 entry_language = entry->language;
    for (const AliasSlot& slot : kAliasSlots) {
      if (entry_language == slot.language) {
        available_aliases.*(slot.member) = entry;
        // The slot names are distinct, so at most one can match an entry.
        break;
      }
    }
  }

  // Fallback language must exist.
  DCHECK(available_aliases.en_us_language_offset);
  return available_aliases;
}
// Binary-searches [languages_to_offset_begin, languages_to_offset_end) for an
// entry whose language equals |language|. On success, stores a pointer to
// that entry in |matched_language_to_offset| and returns true. Otherwise
// returns false and leaves |matched_language_to_offset| untouched. The table
// must be sorted by language and all of its languages must be lower case.
bool GetExactLanguageOffset(const LangToOffset* languages_to_offset_begin,
                            const LangToOffset* languages_to_offset_end,
                            const base::string16& language,
                            const LangToOffset** matched_language_to_offset) {
  DCHECK(languages_to_offset_begin);
  DCHECK(languages_to_offset_end);
  DCHECK(matched_language_to_offset);

  // Locate the first entry that does not order before |language|; in a sorted
  // table this is the only place an exact match could be.
  const LangToOffset* const candidate = std::partition_point(
      languages_to_offset_begin, languages_to_offset_end,
      [&language](const LangToOffset& entry) {
        return entry.language < language;
      });

  if (candidate == languages_to_offset_end ||
      !(candidate->language == language)) {
    return false;
  }

  *matched_language_to_offset = candidate;
  return true;
}
// Maps |language| to the translation it is aliased to, if there is one and
// that translation is available. On success, stores the alias target in
// |matched_language_to_offset| and returns true; otherwise returns false and
// leaves |matched_language_to_offset| untouched.
bool GetAliasedLanguageOffset(const AvailableLanguageAliases& available_aliases,
                              const base::string16& language,
                              const LangToOffset** matched_language_to_offset) {
  DCHECK(matched_language_to_offset);

  // Each language below belongs to at most one group, so picking the group
  // first and checking availability afterwards gives the same answer as
  // checking availability group by group.
  const LangToOffset* alias_target = nullptr;
  if (language == L"en-au" || language == L"en-ca" || language == L"en-nz" ||
      language == L"en-za") {
    // These English variants use British English (all others wildcard to US).
    alias_target = available_aliases.en_gb_language_offset;
  } else if (language == L"es-es") {
    // es-es uses es (all other Spanish variants wildcard to es-419).
    alias_target = available_aliases.es_language_offset;
  } else if (language == L"he") {
    // Google web properties use iw for he. Handle both just to be safe.
    alias_target = available_aliases.iw_language_offset;
  } else if (language == L"nb") {
    // Google web properties use no for nb. Handle both just to be safe.
    alias_target = available_aliases.no_language_offset;
  } else if (language == L"tl") {
    // Some Google web properties use tl for fil. The two are not completely
    // identical, but they are treated as aliases here.
    alias_target = available_aliases.fil_language_offset;
  } else if (language == L"zh-chs" ||   // Pre-Vista script-subtag alias.
             language == L"zh-hans" ||  // Vista+ script-subtag alias.
             // The zh wildcard would also yield zh-cn, but aliasing zh-sg
             // lets it win when it precedes another valid tag in a list of
             // candidates.
             language == L"zh-sg") {
    alias_target = available_aliases.zh_cn_language_offset;
  } else if (language == L"zh-cht" ||   // Pre-Vista script-subtag alias.
             language == L"zh-hant" ||  // Vista+ script-subtag alias.
             // Hong Kong and Macau are aliased to Taiwan.
             language == L"zh-hk" || language == L"zh-mo") {
    alias_target = available_aliases.zh_tw_language_offset;
  }

  if (!alias_target)
    return false;

  *matched_language_to_offset = alias_target;
  return true;
}
// Maps a region-less |neutral_language| to the regional translation used as
// its wildcard, provided that translation is available. On success, stores
// the target in |matched_language_to_offset| and returns true; otherwise
// returns false and leaves |matched_language_to_offset| untouched.
bool GetCompatibleNeutralLanguageOffset(
    const AvailableLanguageAliases& available_aliases,
    const base::string16& neutral_language,
    const LangToOffset** matched_language_to_offset) {
  DCHECK(matched_language_to_offset);

  const LangToOffset* wildcard_target = nullptr;
  if (neutral_language == L"en") {
    // Use the U.S. region for anything English.
    wildcard_target = available_aliases.en_us_language_offset;
  } else if (neutral_language == L"es") {
    // Use the Latin American region for anything Spanish.
    wildcard_target = available_aliases.es_419_language_offset;
  } else if (neutral_language == L"pt") {
    // Use the Brazil region for anything Portuguese.
    wildcard_target = available_aliases.pt_br_language_offset;
  } else if (neutral_language == L"zh") {
    // Use the P.R.C. region for anything Chinese.
    wildcard_target = available_aliases.zh_cn_language_offset;
  }

  if (!wildcard_target)
    return false;

  *matched_language_to_offset = wildcard_target;
  return true;
}
// Runs through |candidates| in order, matching the lower-cased form of each
// against the available translations. Returns true if a candidate is
// selected, in which case |matched_name| receives the candidate exactly as it
// was supplied and |matched_language_to_offset| receives the table entry of
// the selected translation. Returns false if nothing matches.
bool SelectIf(const std::vector<base::string16>& candidates,
              const LangToOffset* languages_to_offset_begin,
              const LangToOffset* languages_to_offset_end,
              const AvailableLanguageAliases& available_aliases,
              const LangToOffset** matched_language_to_offset,
              base::string16* matched_name) {
  DCHECK(matched_language_to_offset);
  DCHECK(matched_name);

  // First pass predicate. The exact match is always attempted before the
  // alias so that an alias is never selected in place of a future
  // translation. Because this runs candidate by candidate, an earlier
  // candidate that matches (exactly or by alias) beats a later candidate that
  // matches exactly.
  const auto matches_exactly_or_by_alias =
      [&](const base::string16& candidate) {
        const base::string16 lowered = base::ToLowerASCII(candidate);
        return GetExactLanguageOffset(languages_to_offset_begin,
                                      languages_to_offset_end, lowered,
                                      matched_language_to_offset) ||
               GetAliasedLanguageOffset(available_aliases, lowered,
                                        matched_language_to_offset);
      };

  // Second pass predicate: reduce the candidate to its locale neutral
  // language (everything before the first '-', or the whole tag if there is
  // none) and try the wildcard mappings.
  const auto matches_by_neutral_language =
      [&](const base::string16& candidate) {
        const base::string16 lowered = base::ToLowerASCII(candidate);
        const base::string16 neutral_language =
            lowered.substr(0, lowered.find(L'-'));
        return GetCompatibleNeutralLanguageOffset(
            available_aliases, neutral_language, matched_language_to_offset);
      };

  auto selected = std::find_if(candidates.begin(), candidates.end(),
                               matches_exactly_or_by_alias);
  if (selected == candidates.end()) {
    selected = std::find_if(candidates.begin(), candidates.end(),
                            matches_by_neutral_language);
  }
  if (selected == candidates.end())
    return false;

  matched_name->assign(*selected);
  return true;
}
// Picks the translation that best matches |candidates| (in order of
// preference) from the table [languages_to_offset_begin,
// languages_to_offset_end), falling back to en-us when nothing matches.
//
// Outputs:
//   |selected_offset|   - offset of the chosen translation. On entry it must
//                         hold the number of entries in the table.
//   |matched_candidate| - the candidate that produced the match, or the en-us
//                         language name when the fallback was used. Must be
//                         empty on entry.
//   |selected_language| - language name of the chosen translation. Must be
//                         empty on entry.
void SelectLanguageMatchingCandidate(
    const std::vector<base::string16>& candidates,
    const LangToOffset* languages_to_offset_begin,
    const LangToOffset* languages_to_offset_end,
    int* selected_offset,
    base::string16* matched_candidate,
    base::string16* selected_language) {
  DCHECK(selected_offset);
  DCHECK(matched_candidate);
  DCHECK(selected_language);
  DCHECK_GT(languages_to_offset_end, languages_to_offset_begin);
  DCHECK_EQ(*selected_offset,
            languages_to_offset_end - languages_to_offset_begin);
  DCHECK(matched_candidate->empty());
  DCHECK(selected_language->empty());
#if DCHECK_IS_ON()
  ValidateMappings(languages_to_offset_begin, languages_to_offset_end);
#endif  // DCHECK_IS_ON()

  // Find out which of the commonly aliased / wildcarded languages the table
  // actually provides.
  const AvailableLanguageAliases available_aliases = DetermineAvailableAliases(
      languages_to_offset_begin, languages_to_offset_end);

  // The fallback must exist.
  DCHECK(available_aliases.en_us_language_offset);

  // Take the first candidate that matches any of the given mappings. Failing
  // that, use en-us as the fallback language.
  const LangToOffset* matched_language_to_offset = nullptr;
  const bool found_match = SelectIf(
      candidates, languages_to_offset_begin, languages_to_offset_end,
      available_aliases, &matched_language_to_offset, matched_candidate);
  if (!found_match) {
    const LangToOffset* const fallback =
        available_aliases.en_us_language_offset;
    matched_language_to_offset = fallback;
    *matched_candidate = fallback->language;
  }
  DCHECK(matched_language_to_offset);

  // Report the real language being used for the matched candidate.
  *selected_offset = matched_language_to_offset->offset;
  *selected_language = matched_language_to_offset->language;
}
// Builds the ordered candidate list used when the caller supplies only a
// preferred language: |preferred_language| first (when it is not empty),
// followed by whatever GetThreadPreferredUILanguageList contributes.
std::vector<base::string16> GetCandidatesFromSystem(
    const base::string16& preferred_language) {
  // Seed the list with zero or one copy of the preferred language so that,
  // when present, it is the first candidate.
  std::vector<base::string16> candidates(preferred_language.empty() ? 0u : 1u,
                                         preferred_language);

  // Now add the UI languages. The thread preferred ones are used since that
  // list also carries a range of fallbacks.
  // NOTE(review): this relies on the call keeping the entry already in
  // |candidates| -- confirm against base/win/i18n.h.
  base::win::i18n::GetThreadPreferredUILanguageList(&candidates);
  return candidates;
}
} // namespace
// Selects a language using candidates gathered from the system: delegates to
// the candidate-list constructor with the result of
// GetCandidatesFromSystem(|preferred_language|) and the same table range.
LanguageSelector::LanguageSelector(
const base::string16& preferred_language,
const LangToOffset* languages_to_offset_begin,
const LangToOffset* languages_to_offset_end)
: LanguageSelector(GetCandidatesFromSystem(preferred_language),
languages_to_offset_begin,
languages_to_offset_end) {}
// Selects a language from an explicit, ordered list of |candidates| against
// the table [languages_to_offset_begin, languages_to_offset_end). The results
// are stored in |selected_offset_|, |matched_candidate_| and
// |selected_language_|.
LanguageSelector::LanguageSelector(
    const std::vector<base::string16>& candidates,
    const LangToOffset* languages_to_offset_begin,
    const LangToOffset* languages_to_offset_end)
    // Start at the table size; SelectLanguageMatchingCandidate DCHECKs this
    // initial value before overwriting it with the selected offset.
    : selected_offset_(static_cast<int>(languages_to_offset_end -
                                        languages_to_offset_begin)) {
  SelectLanguageMatchingCandidate(candidates, languages_to_offset_begin,
                                  languages_to_offset_end, &selected_offset_,
                                  &matched_candidate_, &selected_language_);
}
// Defaulted: the destructor does nothing beyond destroying the members.
LanguageSelector::~LanguageSelector() = default;
} // namespace i18n
} // namespace win
} // namespace base