14 #include "absl/strings/ascii.h"
15 #include "absl/strings/str_cat.h"
16 #include "absl/strings/str_split.h"
17 #include "allheaders.h"
28 class UnicharcompressTest :
public ::testing::Test {
31 std::locale::global(std::locale(
""));
// Fixture helper that takes the file name of a unicharset.
// NOTE(review): the statements below show radical_data being wrapped in a
// STRING and a log line reporting where the encoding was written; how the
// unicharset itself is read and encoded should be confirmed against the
// rest of the body.
35 void LoadUnicharset(
const std::string& unicharset_name) {
// Copies the radical table text into the STRING type via its C string.
44 STRING radical_str(radical_data.c_str());
// Reports the path held in output_name.
57 LOG(
INFO) <<
"Wrote encoding to:" << output_name;
// NOTE(review): judging by the name, this presumably serializes the encoder
// and reads it back again - confirm against the full body.
60 void SerializeAndUndo() {
// Opens rfp directly over the in-memory buffer `data` (pointer + size).
// Taking &data[0] assumes data is non-empty - TODO confirm.
66 rfp.Open(&data[0], data.
size());
71 return lang ==
"chi_sim" || lang ==
"chi_tra" || lang ==
"kor" ||
76 return lang ==
"asm" || lang ==
"ben" || lang ==
"bih" || lang ==
"hin" ||
77 lang ==
"mar" || lang ==
"nep" || lang ==
"san" || lang ==
"bod" ||
78 lang ==
"dzo" || lang ==
"guj" || lang ==
"kan" || lang ==
"mal" ||
79 lang ==
"ori" || lang ==
"pan" || lang ==
"sin" || lang ==
"tam" ||
90 std::vector<RecodedCharID> times_seen(code_range, zeros);
107 EXPECT_EQ(unichar_id,
compressed_.DecodeUnichar(code));
109 for (
int i = 0; i < len; ++i) {
110 int code_val = code(i);
111 EXPECT_GE(code_val, 0);
112 EXPECT_LT(code_val, code_range);
113 times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
117 for (
int c = 0; c < code_range; ++c) {
120 if (times_seen[c](i) != 0) ++num_used;
122 EXPECT_GE(num_used, 1) <<
"c=" << c <<
"/" << code_range;
127 CheckCodeExtensions(code, times_seen);
131 if (IsCJKLang(lang) || IsIndicLang(lang)) {
// Recursively checks the values that may follow the prefix `code`.
// - For every entry of final_codes (when non-null): expects
//   times_seen[entry](code.length()) to be positive, writes the entry at
//   index code.length() of a copy of `code`, and expects that copy to decode
//   to something other than INVALID_UNICHAR_ID.
// - For every entry of next_codes (when non-null): makes the same times_seen
//   expectation, writes the entry at the same index, and recurses on the
//   extended prefix.
// NOTE(review): where final_codes / next_codes come from is not established
// by these lines - presumably looked up from compressed_ for `code`; confirm.
141 void CheckCodeExtensions(
const RecodedCharID& code,
142 const std::vector<RecodedCharID>& times_seen) {
// Working copy; only index `length` is overwritten below.
143 RecodedCharID extended = code;
144 int length = code.length();
146 if (final_codes !=
nullptr) {
147 for (
int i = 0; i < final_codes->
size(); ++i) {
148 int ending = (*final_codes)[i];
149 EXPECT_GT(times_seen[ending](length), 0);
// Same slot is reused on every iteration, so `extended` is always the
// prefix plus exactly one candidate value.
150 extended.Set(length, ending);
151 int unichar_id =
compressed_.DecodeUnichar(extended);
152 EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
156 if (next_codes !=
nullptr) {
157 for (
int i = 0; i < next_codes->
size(); ++i) {
158 int extension = (*next_codes)[i];
159 EXPECT_GT(times_seen[extension](length), 0);
160 extended.Set(length, extension);
// Descend with the one-longer prefix.
161 CheckCodeExtensions(extended, times_seen);
// Loads the chi_tra unicharset and then the chi_sim unicharset, running
// ExpectCorrect on each right after it is loaded.
173 TEST_F(UnicharcompressTest, DoesChinese) {
174 LOG(
INFO) <<
"Testing chi_tra";
175 LoadUnicharset(
"chi_tra.unicharset");
176 ExpectCorrect(
"chi_tra");
177 LOG(
INFO) <<
"Testing chi_sim";
178 LoadUnicharset(
"chi_sim.unicharset");
179 ExpectCorrect(
"chi_sim");
// Loads jpn.unicharset and runs ExpectCorrect with the language tag "jpn".
182 TEST_F(UnicharcompressTest, DoesJapanese) {
184 LoadUnicharset(
"jpn.unicharset");
185 ExpectCorrect(
"jpn");
// Loads kor.unicharset and runs ExpectCorrect with the language tag "kor".
188 TEST_F(UnicharcompressTest, DoesKorean) {
190 LoadUnicharset(
"kor.unicharset");
191 ExpectCorrect(
"kor");
// Loads kan.unicharset and runs ExpectCorrect("kan") twice.
// NOTE(review): the second call presumably follows a SerializeAndUndo()
// round trip so the reloaded encoder is checked too - TODO confirm.
194 TEST_F(UnicharcompressTest, DoesKannada) {
196 LoadUnicharset(
"kan.unicharset");
197 ExpectCorrect(
"kan");
199 ExpectCorrect(
"kan");
// Loads mar.unicharset and runs ExpectCorrect with the language tag "mar".
202 TEST_F(UnicharcompressTest, DoesMarathi) {
204 LoadUnicharset(
"mar.unicharset");
205 ExpectCorrect(
"mar");
// Loads eng.unicharset and runs ExpectCorrect with the language tag "eng".
208 TEST_F(UnicharcompressTest, DoesEnglish) {
210 LoadUnicharset(
"eng.unicharset");
211 ExpectCorrect(
"eng");
// Loads por.unicharset and runs ExpectCorrect("por"), then walks `len`
// positions in a loop.
// NOTE(review): what is asserted per position is not described here -
// confirm against the loop body before relying on this summary.
216 TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
217 LOG(
INFO) <<
"Testing por with ligatures";
218 LoadUnicharset(
"por.unicharset");
219 ExpectCorrect(
"por");
227 for (
int i = 0; i < len; ++i) {
// Loads trivial.unicharset, runs ExpectCorrect("trivial"), and then checks
// the textual form of the encoding line by line. Each expected line is a
// comma-separated code list, a tab, and the unichar text.
236 TEST_F(UnicharcompressTest, GetEncodingAsString) {
237 LoadUnicharset(
"trivial.unicharset");
238 ExpectCorrect(
"trivial");
// Split on newlines, dropping empty pieces so a trailing newline does not
// produce an extra entry.
241 std::vector<std::string> lines =
242 absl::StrSplit(encoding_str,
"\n", absl::SkipEmpty());
243 EXPECT_EQ(5, lines.size());
// Code 0 is listed with a single space as its text.
245 EXPECT_EQ(
"0\t ", lines[0]);
247 EXPECT_EQ(
"1\ti", lines[1]);
249 EXPECT_EQ(
"2\tf", lines[2]);
// "fi" is listed with the two-code sequence 2,1, i.e. the codes listed
// above for f and i.
252 EXPECT_EQ(
"2,1\tfi", lines[3]);
// Last entry is printed with the literal text "<nul>".
254 EXPECT_EQ(
"3\t<nul>", lines[4]);