blob: 5bb8b516f010247a29d3b744c5c6ed254a9ee8ff [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/translate/core/language_detection/language_detection_util.h"
#include <stddef.h>
#include "base/logging.h"
#include "base/metrics/histogram_base.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/metrics_hashes.h"
#include "base/stl_util.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/translate/core/common/translate_constants.h"
#include "components/translate/core/common/translate_metrics.h"
#include "components/translate/core/common/translate_util.h"
#include "components/translate/core/language_detection/chinese_script_classifier.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"
namespace {
// One entry of the similar-language table. Some languages are very similar
// and difficult for CLD to distinguish; entries that share the same |group|
// value are treated as belonging to one such family.
struct SimilarLanguageCode {
  // Language code that identifies the entry (e.g. "hr").
  const char* const code;

  // Identifier of the similarity group. Lookups report 0 for "no group", so
  // real groups use non-zero values.
  int group;
};
// Table of language codes that are hard to tell apart, keyed by group.
const SimilarLanguageCode kSimilarLanguageCodes[] = {
    // Group 1.
    {"bs", 1},
    {"hr", 1},
    // Group 2.
    {"hi", 2},
    {"ne", 2},
};
// Looks |language| up in |kSimilarLanguageCodes| and returns the group of the
// first entry whose code is a prefix of |language| (so "hr-HR" matches "hr").
// Note that the match is purely by prefix: any longer code that happens to
// start with a table entry matches as well. Returns 0 when nothing matches.
int GetSimilarLanguageGroupCode(const std::string& language) {
  for (const auto& entry : kSimilarLanguageCodes) {
    // rfind() anchored at position 0 can only succeed at index 0, i.e. it is a
    // "starts with" test.
    if (language.rfind(entry.code, 0) == 0)
      return entry.group;
  }
  return 0;
}
// Languages whose pages are well known for often being served with a wrongly
// configured "Content-Language: en" header.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
    "es", "pt", "ja",
    "ru", "de", "zh-CN",
    "zh-TW", "ar", "id",
    "fr", "it", "th",
};
// Normalizes |code| in place by running the language code fixes in the
// required order: typo correction first, then validation, then conversion to
// the Translate synonym. If the code is still invalid after typo correction,
// |code| is reset to the empty string.
void ApplyLanguageCodeCorrection(std::string* code) {
  // Correct well-known format errors before judging validity.
  translate::CorrectLanguageCodeTypo(code);

  if (translate::IsValidLanguageCode(*code)) {
    translate::ToTranslateLanguageSynonym(code);
    return;
  }

  // Nothing usable is left; signal that with an empty code.
  code->clear();
}
// Runs CLD3 over |text| and returns the language code of the top prediction,
// or translate::kUnknownLanguageCode when no usable prediction is available.
// A prediction is unusable when it is unreliable, is CLD's own "unknown"
// value, or is one of the currently unsupported xx-Latn variants. A plain
// "zh" prediction is refined into "zh-TW" or "zh-CN".
// If |is_cld_reliable| is non-null, it receives whether CLD considers the
// detection reliable.
std::string DetermineTextLanguage(const base::string16& text,
                                  bool* is_cld_reliable) {
  const std::string utf8_text(base::UTF16ToUTF8(text));

  // Ask the identifier for its single best guess.
  chrome_lang_id::NNetLanguageIdentifier lang_id;
  const chrome_lang_id::NNetLanguageIdentifier::Result top_result =
      lang_id.FindTopNMostFreqLangs(utf8_text, /*num_langs=*/1).at(0);
  const std::string& detected = top_result.language;
  const bool detected_is_known =
      detected != chrome_lang_id::NNetLanguageIdentifier::kUnknown;

  // Record the raw prediction (as a hash of its name) and, for anything other
  // than "unknown", the result's proportion scaled to a percentage.
  const base::HistogramBase::Sample detected_hash =
      static_cast<base::HistogramBase::Sample>(
          base::HashMetricName(detected));
  base::UmaHistogramSparse("Translate.CLD3.LanguageDetected", detected_hash);
  if (detected_is_known) {
    UMA_HISTOGRAM_PERCENTAGE("Translate.CLD3.LanguagePercentage",
                             static_cast<int>(100 * top_result.proportion));
  }

  if (is_cld_reliable != nullptr) {
    *is_cld_reliable = top_result.is_reliable;
  }

  // xx-Latn predictions that are currently not supported.
  const bool is_unsupported_latn =
      detected == "bg-Latn" || detected == "el-Latn" ||
      detected == "ja-Latn" || detected == "ru-Latn" ||
      detected == "zh-Latn";

  // Start from "unknown" and only overwrite it for a usable prediction.
  std::string language = translate::kUnknownLanguageCode;
  if (top_result.is_reliable && detected_is_known && !is_unsupported_latn) {
    if (detected == "zh") {
      // Generic Chinese: decide between Traditional and Simplified. The
      // classifier answers with "zh-Hant" or "zh-Hans", which are mapped to
      // the old-style codes used by the Translate API. Any other answer
      // leaves |language| as "unknown".
      translate::ChineseScriptClassifier zh_classifier;
      const std::string script = zh_classifier.Classify(utf8_text);
      if (script == "zh-Hant") {
        language = "zh-TW";
      } else if (script == "zh-Hans") {
        language = "zh-CN";
      }
    } else {
      language = detected;
    }
  }

  VLOG(1) << "Detected language: " << language;
  return language;
}
// Returns true when CLD can supply a sub code that the page language lacks.
// The Translate server cannot handle generic Chinese, so when the page only
// declares "zh" and CLD reports a specific "zh-*" variant, the CLD language
// takes priority.
// TODO(hajimehoshi): How about the other dialects like zh-MO?
bool CanCLDComplementSubCode(
    const std::string& page_language, const std::string& cld_language) {
  // Only a bare "zh" page language is eligible for completion.
  if (page_language != "zh")
    return false;

  return base::StartsWith(cld_language, "zh-",
                          base::CompareCase::INSENSITIVE_ASCII);
}
} // namespace
namespace translate {
// Decides the language of a page by combining the language codes the page
// declares with the language CLD detects in |contents|.
//
// |code| is the Content-Language value and |html_lang| is the html lang
// attribute; either may be empty. Both are normalized first, and a valid
// html lang takes precedence over Content-Language.
//
// If |cld_language_p| is non-null it receives the CLD result as detected
// (before conversion to the Translate synonym). If |is_cld_reliable_p| is
// non-null it receives CLD's reliability verdict.
//
// Returns the declared language, the CLD language, or kUnknownLanguageCode,
// depending on how the two sources agree (see the checks below).
std::string DeterminePageLanguage(const std::string& code,
                                  const std::string& html_lang,
                                  const base::string16& contents,
                                  std::string* cld_language_p,
                                  bool* is_cld_reliable_p) {
  // Normalize the html lang attribute, if the page has one.
  std::string corrected_html_lang;
  if (!html_lang.empty()) {
    corrected_html_lang = html_lang;
    ApplyLanguageCodeCorrection(&corrected_html_lang);
    translate::ReportHtmlLang(html_lang, corrected_html_lang);
    VLOG(9) << "html lang based language code: " << corrected_html_lang;
  }

  // Normalize the Content-Language value, if the page has one.
  std::string corrected_content_language;
  if (!code.empty()) {
    corrected_content_language = code;
    ApplyLanguageCodeCorrection(&corrected_content_language);
    translate::ReportContentLanguage(code, corrected_content_language);
  }

  // Detect the language from the text and hand the raw results to the caller
  // before the code is converted for internal comparison.
  bool cld_reliable = false;
  std::string cld_language = DetermineTextLanguage(contents, &cld_reliable);
  if (cld_language_p != nullptr)
    *cld_language_p = cld_language;
  if (is_cld_reliable_p != nullptr)
    *is_cld_reliable_p = cld_reliable;
  translate::ToTranslateLanguageSynonym(&cld_language);

  // Prefer the html lang value when it survived normalization; otherwise fall
  // back to Content-Language (which may itself be empty).
  const std::string& language = corrected_html_lang.empty()
                                    ? corrected_content_language
                                    : corrected_html_lang;

  // When the page claims English but CLD confidently says otherwise, log the
  // conflict. These metrics are used to decide when to favor CLD.
  const bool cld_is_unknown = cld_language == kUnknownLanguageCode;
  const bool page_claims_english = language.compare(0, 2, "en") == 0;
  const bool cld_says_english = cld_language.compare(0, 2, "en") == 0;
  if (page_claims_english && !cld_says_english && !cld_is_unknown) {
    translate::ReportLanguageDetectionConflict(language, cld_language);
  }

  // No declared language: CLD is all there is, even though its answer might
  // be translate::kUnknownLanguageCode.
  if (language.empty()) {
    translate::ReportLanguageVerification(
        translate::LANGUAGE_VERIFICATION_CLD_ONLY);
    return cld_language;
  }

  // CLD has no opinion: go with the declared language.
  if (cld_is_unknown) {
    translate::ReportLanguageVerification(
        translate::LANGUAGE_VERIFICATION_UNKNOWN);
    return language;
  }

  // CLD knows the sub code that the declared language is missing.
  if (CanCLDComplementSubCode(language, cld_language)) {
    translate::ReportLanguageVerification(
        translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
    return cld_language;
  }

  // Both sources agree, at least up to similar languages.
  if (IsSameOrSimilarLanguages(language, cld_language)) {
    translate::ReportLanguageVerification(
        translate::LANGUAGE_VERIFICATION_CLD_AGREE);
    return language;
  }

  // The declared English looks like a server misconfiguration: trust CLD.
  if (MaybeServerWrongConfiguration(language, cld_language)) {
    translate::ReportLanguageVerification(
        translate::LANGUAGE_VERIFICATION_TRUST_CLD);
    return cld_language;
  }

  // The declared value might be wrong because CLD says with confidence that
  // the page is written in another language. In this case Chrome relies on
  // neither language code and gives up suggesting a translation.
  translate::ReportLanguageVerification(
      translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
  return kUnknownLanguageCode;
}
// Fixes frequent formatting mistakes in |code|, in place:
//  - keeps only the first entry of a comma-separated list,
//  - trims surrounding ASCII whitespace,
//  - turns the first underscore into a dash,
//  - lower-cases everything before the first dash and upper-cases the rest
//    (or lower-cases the whole code when there is no dash).
// |code| must not be null.
void CorrectLanguageCodeTypo(std::string* code) {
  DCHECK(code);

  // More than one language may be specified; just keep the first one.
  const size_t comma_pos = code->find(',');
  if (comma_pos != std::string::npos)
    code->erase(comma_pos);

  base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code);

  // An underscore instead of a dash is a frequent mistake.
  const size_t underscore_pos = code->find('_');
  if (underscore_pos != std::string::npos)
    (*code)[underscore_pos] = '-';

  const size_t dash_pos = code->find('-');
  if (dash_pos == std::string::npos) {
    // No sub code: the whole thing is lower-cased.
    *code = base::ToLowerASCII(*code);
    return;
  }

  // The part before the dash is lower-cased; the dash and everything after it
  // is upper-cased.
  const std::string main_part = base::ToLowerASCII(code->substr(0, dash_pos));
  const std::string rest_part = base::ToUpperASCII(code->substr(dash_pos));
  *code = main_part + rest_part;
}
bool IsValidLanguageCode(const std::string& code) {
// Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
// TODO(hajimehoshi): How about es-419, which is used as an Accept language?
std::vector<base::StringPiece> chunks = base::SplitStringPiece(
code, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
if (chunks.size() < 1 || 2 < chunks.size())
return false;
const base::StringPiece& main_code = chunks[0];
if (main_code.size() < 1 || 3 < main_code.size())
return false;
for (char c : main_code) {
if (!base::IsAsciiAlpha(c))
return false;
}
if (chunks.size() == 1)
return true;
const base::StringPiece& sub_code = chunks[1];
if (sub_code.size() != 2)
return false;
for (char c : sub_code) {
if (!base::IsAsciiAlpha(c))
return false;
}
return true;
}
// Returns true when |page_language| and |cld_language| denote the same
// language (ignoring whatever follows the first dash) or belong to the same
// group of similar languages. Whenever both codes split into at least one
// chunk, the outcome of the similar-language check is reported to metrics.
bool IsSameOrSimilarLanguages(const std::string& page_language,
                              const std::string& cld_language) {
  const std::vector<std::string> page_parts = base::SplitString(
      page_language, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  if (page_parts.empty())
    return false;

  const std::vector<std::string> cld_parts = base::SplitString(
      cld_language, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  if (cld_parts.empty())
    return false;

  // Compare only the language parts; the country code is ignored here.
  if (page_parts[0] == cld_parts[0]) {
    // A strict match is not a "similar language" match, so false is reported
    // to metrics even though the function returns true.
    translate::ReportSimilarLanguageMatch(false);
    return true;
  }

  // Otherwise the two codes match only if both are in the similar language
  // list and share a (non-zero) group.
  const int page_group = GetSimilarLanguageGroupCode(page_language);
  const bool same_group =
      page_group != 0 &&
      page_group == GetSimilarLanguageGroupCode(cld_language);
  translate::ReportSimilarLanguageMatch(same_group);
  return same_group;
}
// Returns true when |language_code| exactly equals one of the entries of
// |kWellKnownCodesOnWrongConfiguration|.
bool IsServerWrongConfigurationLanguage(const std::string& language_code) {
  for (const char* known_code : kWellKnownCodesOnWrongConfiguration) {
    if (language_code == known_code)
      return true;
  }
  return false;
}
// Returns true when the declared |page_language| is probably a misconfigured
// server default rather than the real language of the page.
//
// A server may provide English meta information merely as a default value
// because the user never configured it. So when |page_language| starts with
// "en" (compared case-insensitively) and |cld_language| is one of the
// well-known languages that often carry such mistaken meta information,
// |cld_language| should be trusted. Any other |page_language| is respected
// and yields false.
bool MaybeServerWrongConfiguration(const std::string& page_language,
                                   const std::string& cld_language) {
  const bool page_claims_english = base::StartsWith(
      page_language, "en", base::CompareCase::INSENSITIVE_ASCII);
  return page_claims_english &&
         IsServerWrongConfigurationLanguage(cld_language);
}
} // namespace translate