// chrome/tools/convert_dict/convert_dict_unittest.cc - chromium/src - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include <stddef.h>

 #include <map>
 #include <string>

 #include "base/files/file_util.h"
 #include "base/format_macros.h"
 #include "base/i18n/icu_string_conversions.h"
 #include "base/macros.h"
 #include "base/stl_util.h"
 #include "base/strings/stringprintf.h"
 #include "base/strings/utf_string_conversions.h"
 #include "chrome/tools/convert_dict/aff_reader.h"
 #include "chrome/tools/convert_dict/dic_reader.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "third_party/hunspell/google/bdict_reader.h"
 #include "third_party/hunspell/google/bdict_writer.h"

 namespace {

 // Checks that iterating the serialized BDICT trie in |serialized| yields the
 // same words and affix indices as |org_words|, in the same order.
 // (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
 //
 // Individual mismatches are reported as non-fatal gtest failures. Returns
 // false when the comparison has to be abandoned (the reader refuses the data,
 // or the iterator stops producing words early); returns true once every entry
 // of |org_words| has been visited.
 bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
                  const std::string& serialized) {
   hunspell::BDictReader reader;
   if (!reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
                    serialized.size())) {
     // Do not iterate a reader that failed to initialize.
     ADD_FAILURE() << "BDictReader::Init() rejected the serialized data";
     return false;
   }

   hunspell::WordIterator iter = reader.GetAllWordIterator();

   // Zero-initialized so that a short read never exposes stack garbage.
   int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD] = {};

   static const int kBufSize = 128;
   char buf[kBufSize] = {};
   for (size_t i = 0; i < org_words.size(); ++i) {
     SCOPED_TRACE(base::StringPrintf(
         "org_words[%" PRIuS "]: %s", i, org_words[i].first.c_str()));

     const int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
     EXPECT_GT(affix_matches, 0);
     if (affix_matches <= 0) {
       // Nothing was reported for this word, so |buf| and |affix_ids| cannot
       // be trusted; stop instead of comparing stale contents.
       return false;
     }

     EXPECT_EQ(org_words[i].first, std::string(buf));

     const size_t expected_count = org_words[i].second.size();
     EXPECT_EQ(affix_matches, static_cast<int>(expected_count));

     // Check the individual affix indices. Only the indices present on both
     // sides (and inside |affix_ids|) are compared; a count mismatch has
     // already been reported above.
     const size_t found_count = static_cast<size_t>(affix_matches);
     const size_t comparable =
         found_count < expected_count ? found_count : expected_count;
     for (size_t affix_index = 0;
          affix_index < comparable && affix_index < base::size(affix_ids);
          ++affix_index) {
       EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
     }
   }

   return true;
 }

 // Implements the test process used by ConvertDictTest.
 // This function encapsulates all complicated operations used by
 // ConvertDictTest so we can conceal them from the tests themselves.
 // This function consists of the following parts:
 // * Creates a dummy affix file and a dictionary file.
 // * Reads the dummy files.
 // * Creates bdict data.
 // * Verify the bdict data.
 void RunDictionaryTest(const char* codepage,
                        const std::map<base::string16, bool>& word_list) {
   // Create an affix data and a dictionary data.
   std::string aff_data(base::StringPrintf("SET %s\n", codepage));

   std::string dic_data(base::StringPrintf("%" PRIuS "\n", word_list.size()));
   for (auto it = word_list.begin(); it != word_list.end(); ++it) {
     std::string encoded_word;
     EXPECT_TRUE(UTF16ToCodepage(it->first,
                                 codepage,
                                 base::OnStringConversionError::FAIL,
                                 &encoded_word));
     dic_data += encoded_word;
     dic_data += "\n";
   }

   // Create a temporary affix file and a dictionary file from the test data.
   base::FilePath aff_file;
   base::CreateTemporaryFile(&aff_file);
   base::WriteFile(aff_file, aff_data.c_str(), aff_data.length());

   base::FilePath dic_file;
   base::CreateTemporaryFile(&dic_file);
   base::WriteFile(dic_file, dic_data.c_str(), dic_data.length());

   {
     // Read the above affix file with AffReader and read the dictionary file
     // with DicReader, respectively.
     convert_dict::AffReader aff_reader(aff_file);
     EXPECT_TRUE(aff_reader.Read());

     convert_dict::DicReader dic_reader(dic_file);
     EXPECT_TRUE(dic_reader.Read(&aff_reader));

     // Verify this DicReader includes all the input words.
     EXPECT_EQ(word_list.size(), dic_reader.words().size());
     for (size_t i = 0; i < dic_reader.words().size(); ++i) {
       SCOPED_TRACE(base::StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
                                       i, dic_reader.words()[i].first.c_str()));
       base::string16 word(base::UTF8ToUTF16(dic_reader.words()[i].first));
       EXPECT_TRUE(word_list.find(word) != word_list.end());
     }

     // Create BDICT data and verify it.
     hunspell::BDictWriter writer;
     writer.SetComment(aff_reader.comments());
     writer.SetAffixRules(aff_reader.affix_rules());
     writer.SetAffixGroups(aff_reader.GetAffixGroups());
     writer.SetReplacements(aff_reader.replacements());
     writer.SetOtherCommands(aff_reader.other_commands());
     writer.SetWords(dic_reader.words());

     std::string bdict_data = writer.GetBDict();
     VerifyWords(dic_reader.words(), bdict_data);
     EXPECT_TRUE(hunspell::BDict::Verify(bdict_data.data(), bdict_data.size()));

     // Trim the end of this BDICT and verify our verifier tells these trimmed
     // BDICTs are corrupted.
     for (size_t i = 1; i < bdict_data.size(); ++i) {
       SCOPED_TRACE(base::StringPrintf("i = %" PRIuS, i));
       EXPECT_FALSE(hunspell::BDict::Verify(bdict_data.data(),
                                            bdict_data.size() - i));
     }
   }

   // Deletes the temporary files.
   // We need to delete them after the above AffReader and DicReader are deleted
   // since they close the input files in their destructors.
   base::DeleteFile(aff_file, false);
   base::DeleteFile(dic_file, false);
 }

 }  // namespace

 // Feeds a short list of English words, with a "UTF-8" affix file, through
 // RunDictionaryTest().
 TEST(ConvertDictTest, English) {
   static constexpr char kCodepage[] = "UTF-8";
   static constexpr const wchar_t* kWords[] = {
       L"I", L"he", L"she", L"it", L"we", L"you", L"they",
   };

   // Only the keys matter to RunDictionaryTest(); every word maps to true.
   std::map<base::string16, bool> word_list;
   for (const wchar_t* wide_word : kWords)
     word_list.emplace(base::WideToUTF16(wide_word), true);

   RunDictionaryTest(kCodepage, word_list);
 }

 // Feeds a short list of Russian (Cyrillic) words, with a "KOI8-R" affix file,
 // through RunDictionaryTest().
 TEST(ConvertDictTest, Russian) {
   static constexpr char kCodepage[] = "KOI8-R";
   static constexpr const wchar_t* kWords[] = {
       L"\x044f",
       L"\x0442\x044b",
       L"\x043e\x043d",
       L"\x043e\x043d\x0430",
       L"\x043e\x043d\x043e",
       L"\x043c\x044b",
       L"\x0432\x044b",
       L"\x043e\x043d\x0438",
   };

   // Only the keys matter to RunDictionaryTest(); every word maps to true.
   std::map<base::string16, bool> word_list;
   for (const wchar_t* wide_word : kWords)
     word_list.emplace(base::WideToUTF16(wide_word), true);

   RunDictionaryTest(kCodepage, word_list);
 }

 // Feeds a short list of Hungarian words, with an "ISO8859-2" affix file,
 // through RunDictionaryTest().
 TEST(ConvertDictTest, Hungarian) {
   static constexpr char kCodepage[] = "ISO8859-2";
   static constexpr const wchar_t* kWords[] = {
       L"\x00e9\x006e",
       L"\x0074\x0065",
       L"\x0151",
       L"\x00f6\x006e",
       L"\x006d\x0061\x0067\x0061",
       L"\x006d\x0069",
       L"\x0074\x0069",
       L"\x0151\x006b",
       L"\x00f6\x006e\x00f6\x006b",
       L"\x006d\x0061\x0067\x0075\x006b",
   };

   // Only the keys matter to RunDictionaryTest(); every word maps to true.
   std::map<base::string16, bool> word_list;
   for (const wchar_t* wide_word : kWords)
     word_list.emplace(base::WideToUTF16(wide_word), true);

   RunDictionaryTest(kCodepage, word_list);
 }
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include <stddef.h>

	#include <map>
	#include <string>

	#include "base/files/file_util.h"
	#include "base/format_macros.h"
	#include "base/i18n/icu_string_conversions.h"
	#include "base/macros.h"
	#include "base/stl_util.h"
	#include "base/strings/stringprintf.h"
	#include "base/strings/utf_string_conversions.h"
	#include "chrome/tools/convert_dict/aff_reader.h"
	#include "chrome/tools/convert_dict/dic_reader.h"
	#include "testing/gtest/include/gtest/gtest.h"
	#include "third_party/hunspell/google/bdict_reader.h"
	#include "third_party/hunspell/google/bdict_writer.h"

	namespace {

	// Checks that iterating the serialized BDICT trie in |serialized| yields the
	// same words and affix indices as |org_words|, in the same order.
	// (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
	//
	// Individual mismatches are reported as non-fatal gtest failures. Returns
	// false when the comparison has to be abandoned (the reader refuses the data,
	// or the iterator stops producing words early); returns true once every entry
	// of |org_words| has been visited.
	bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
	                 const std::string& serialized) {
	  hunspell::BDictReader reader;
	  if (!reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
	                   serialized.size())) {
	    // Do not iterate a reader that failed to initialize.
	    ADD_FAILURE() << "BDictReader::Init() rejected the serialized data";
	    return false;
	  }

	  hunspell::WordIterator iter = reader.GetAllWordIterator();

	  // Zero-initialized so that a short read never exposes stack garbage.
	  int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD] = {};

	  static const int kBufSize = 128;
	  char buf[kBufSize] = {};
	  for (size_t i = 0; i < org_words.size(); ++i) {
	    SCOPED_TRACE(base::StringPrintf(
	        "org_words[%" PRIuS "]: %s", i, org_words[i].first.c_str()));

	    const int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
	    EXPECT_GT(affix_matches, 0);
	    if (affix_matches <= 0) {
	      // Nothing was reported for this word, so |buf| and |affix_ids| cannot
	      // be trusted; stop instead of comparing stale contents.
	      return false;
	    }

	    EXPECT_EQ(org_words[i].first, std::string(buf));

	    const size_t expected_count = org_words[i].second.size();
	    EXPECT_EQ(affix_matches, static_cast<int>(expected_count));

	    // Check the individual affix indices. Only the indices present on both
	    // sides (and inside |affix_ids|) are compared; a count mismatch has
	    // already been reported above.
	    const size_t found_count = static_cast<size_t>(affix_matches);
	    const size_t comparable =
	        found_count < expected_count ? found_count : expected_count;
	    for (size_t affix_index = 0;
	         affix_index < comparable && affix_index < base::size(affix_ids);
	         ++affix_index) {
	      EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
	    }
	  }

	  return true;
	}

	// Implements the test process used by ConvertDictTest.
	// This function encapsulates all complicated operations used by
	// ConvertDictTest so we can conceal them from the tests themselves.
	// This function consists of the following parts:
	// * Creates a dummy affix file and a dictionary file.
	// * Reads the dummy files.
	// * Creates bdict data.
	// * Verify the bdict data.
	void RunDictionaryTest(const char* codepage,
	const std::map<base::string16, bool>& word_list) {
	// Create an affix data and a dictionary data.
	std::string aff_data(base::StringPrintf("SET %s\n", codepage));

	std::string dic_data(base::StringPrintf("%" PRIuS "\n", word_list.size()));
	for (auto it = word_list.begin(); it != word_list.end(); ++it) {
	std::string encoded_word;
	EXPECT_TRUE(UTF16ToCodepage(it->first,
	codepage,
	base::OnStringConversionError::FAIL,
	&encoded_word));
	dic_data += encoded_word;
	dic_data += "\n";
	}

	// Create a temporary affix file and a dictionary file from the test data.
	base::FilePath aff_file;
	base::CreateTemporaryFile(&aff_file);
	base::WriteFile(aff_file, aff_data.c_str(), aff_data.length());

	base::FilePath dic_file;
	base::CreateTemporaryFile(&dic_file);
	base::WriteFile(dic_file, dic_data.c_str(), dic_data.length());

	{
	// Read the above affix file with AffReader and read the dictionary file
	// with DicReader, respectively.
	convert_dict::AffReader aff_reader(aff_file);
	EXPECT_TRUE(aff_reader.Read());

	convert_dict::DicReader dic_reader(dic_file);
	EXPECT_TRUE(dic_reader.Read(&aff_reader));

	// Verify this DicReader includes all the input words.
	EXPECT_EQ(word_list.size(), dic_reader.words().size());
	for (size_t i = 0; i < dic_reader.words().size(); ++i) {
	SCOPED_TRACE(base::StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
	i, dic_reader.words()[i].first.c_str()));
	base::string16 word(base::UTF8ToUTF16(dic_reader.words()[i].first));
	EXPECT_TRUE(word_list.find(word) != word_list.end());
	}

	// Create BDICT data and verify it.
	hunspell::BDictWriter writer;
	writer.SetComment(aff_reader.comments());
	writer.SetAffixRules(aff_reader.affix_rules());
	writer.SetAffixGroups(aff_reader.GetAffixGroups());
	writer.SetReplacements(aff_reader.replacements());
	writer.SetOtherCommands(aff_reader.other_commands());
	writer.SetWords(dic_reader.words());

	std::string bdict_data = writer.GetBDict();
	VerifyWords(dic_reader.words(), bdict_data);
	EXPECT_TRUE(hunspell::BDict::Verify(bdict_data.data(), bdict_data.size()));

	// Trim the end of this BDICT and verify our verifier tells these trimmed
	// BDICTs are corrupted.
	for (size_t i = 1; i < bdict_data.size(); ++i) {
	SCOPED_TRACE(base::StringPrintf("i = %" PRIuS, i));
	EXPECT_FALSE(hunspell::BDict::Verify(bdict_data.data(),
	bdict_data.size() - i));
	}
	}

	// Deletes the temporary files.
	// We need to delete them after the above AffReader and DicReader are deleted
	// since they close the input files in their destructors.
	base::DeleteFile(aff_file, false);
	base::DeleteFile(dic_file, false);
	}

	} // namespace

	// Feeds a short list of English words, with a "UTF-8" affix file, through
	// RunDictionaryTest().
	TEST(ConvertDictTest, English) {
	  static constexpr char kCodepage[] = "UTF-8";
	  static constexpr const wchar_t* kWords[] = {
	      L"I", L"he", L"she", L"it", L"we", L"you", L"they",
	  };

	  // Only the keys matter to RunDictionaryTest(); every word maps to true.
	  std::map<base::string16, bool> word_list;
	  for (const wchar_t* wide_word : kWords)
	    word_list.emplace(base::WideToUTF16(wide_word), true);

	  RunDictionaryTest(kCodepage, word_list);
	}

	// Feeds a short list of Russian (Cyrillic) words, with a "KOI8-R" affix file,
	// through RunDictionaryTest().
	TEST(ConvertDictTest, Russian) {
	  static constexpr char kCodepage[] = "KOI8-R";
	  static constexpr const wchar_t* kWords[] = {
	      L"\x044f",
	      L"\x0442\x044b",
	      L"\x043e\x043d",
	      L"\x043e\x043d\x0430",
	      L"\x043e\x043d\x043e",
	      L"\x043c\x044b",
	      L"\x0432\x044b",
	      L"\x043e\x043d\x0438",
	  };

	  // Only the keys matter to RunDictionaryTest(); every word maps to true.
	  std::map<base::string16, bool> word_list;
	  for (const wchar_t* wide_word : kWords)
	    word_list.emplace(base::WideToUTF16(wide_word), true);

	  RunDictionaryTest(kCodepage, word_list);
	}

	// Feeds a short list of Hungarian words, with an "ISO8859-2" affix file,
	// through RunDictionaryTest().
	TEST(ConvertDictTest, Hungarian) {
	  static constexpr char kCodepage[] = "ISO8859-2";
	  static constexpr const wchar_t* kWords[] = {
	      L"\x00e9\x006e",
	      L"\x0074\x0065",
	      L"\x0151",
	      L"\x00f6\x006e",
	      L"\x006d\x0061\x0067\x0061",
	      L"\x006d\x0069",
	      L"\x0074\x0069",
	      L"\x0151\x006b",
	      L"\x00f6\x006e\x00f6\x006b",
	      L"\x006d\x0061\x0067\x0075\x006b",
	  };

	  // Only the keys matter to RunDictionaryTest(); every word maps to true.
	  std::map<base::string16, bool> word_list;
	  for (const wchar_t* wide_word : kWords)
	    word_list.emplace(base::WideToUTF16(wide_word), true);

	  RunDictionaryTest(kCodepage, word_list);
	}