tesseract  5.0.0-alpha-619-ge9db
unicharcompress_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <string>
13 
14 #include "absl/strings/ascii.h"
15 #include "absl/strings/str_cat.h"
16 #include "absl/strings/str_split.h"
17 #include "allheaders.h"
18 
19 #include "include_gunit.h"
20 #include "log.h" // for LOG
21 #include <tesseract/serialis.h>
22 #include "tprintf.h"
23 #include "unicharcompress.h"
24 
25 namespace tesseract {
26 namespace {
27 
28 class UnicharcompressTest : public ::testing::Test {
29  protected:
30  void SetUp() override {
    // Install the environment's preferred locale (std::locale("")) as the
    // global C++ locale. `override` makes the compiler verify that this really
    // replaces ::testing::Test::SetUp.
31  std::locale::global(std::locale(""));
32  }
33 
34  // Loads and compresses the given unicharset.
    // Reads radical-stroke.txt from LANGDATA_DIR and unicharset_name from
    // TESTDATA_DIR, computes the encoding into compressed_, stores the first
    // code of the null char in encoded_null_char_, and writes the encoding as
    // text to <FLAGS_test_tmpdir>/<unicharset_name>.encoding.txt.
    // The file read, the unicharset load and the file write are wrapped in
    // CHECK_OK/CHECK.
35  void LoadUnicharset(const std::string& unicharset_name) {
36  std::string radical_stroke_file =
37  file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
38  std::string unicharset_file =
39  file::JoinPath(TESTDATA_DIR, unicharset_name);
40  std::string radical_data;
41  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
42  file::Defaults()));
43  CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
44  STRING radical_str(radical_data.c_str());
45  null_char_ =
    // NOTE(review): listing line 46 (the right-hand side of this assignment)
    // is missing from this extract; the gutter jumps from 45 to 47. Restore it
    // from the original file before compiling.
47  compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
48  // Get the encoding of the null char.
49  RecodedCharID code;
50  compressed_.EncodeUnichar(null_char_, &code);
51  encoded_null_char_ = code(0);
52  std::string output_name = file::JoinPath(
53  FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
54  STRING encoding = compressed_.GetEncodingAsString(unicharset_);
    // Copy with an explicit length instead of relying on a terminator.
55  std::string encoding_str(&encoding[0], encoding.size());
56  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
57  LOG(INFO) << "Wrote encoding to:" << output_name;
58  }
59  // Serializes and de-serializes compressed_ over itself.
    // The serialized form goes into an in-memory buffer through one TFile and is
    // read back through a second TFile; either step failing is reported by
    // EXPECT_TRUE.
60  void SerializeAndUndo() {
    // In-memory buffer that receives the serialized bytes.
61  GenericVector<char> data;
62  TFile wfp;
63  wfp.OpenWrite(&data);
64  EXPECT_TRUE(compressed_.Serialize(&wfp));
65  TFile rfp;
66  rfp.Open(&data[0], data.size());
67  EXPECT_TRUE(compressed_.DeSerialize(&rfp));
68  }
69  // Returns true if the lang is in CJK.
    // The match is an exact string comparison against the codes "chi_sim",
    // "chi_tra", "kor" and "jpn"; anything else returns false.
70  bool IsCJKLang(const std::string& lang) {
71  return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
72  lang == "jpn";
73  }
74  // Returns true if the lang is Indic.
    // The match is an exact string comparison against the fixed list of
    // language codes below; anything else returns false.
75  bool IsIndicLang(const std::string& lang) {
76  return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
77  lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
78  lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
79  lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" ||
80  lang == "tel";
81  }
82 
83  // Expects the appropriate results from the compressed_ unicharset_.
    // lang selects the size bound checked at the end: CJK and Indic languages
    // must compress below size/10 (size/5 for "jpn"); every other language only
    // has to stay within size + 1.
84  void ExpectCorrect(const std::string& lang) {
85  // Count the number of times each code is used in each element of
86  // RecodedCharID.
87  RecodedCharID zeros;
88  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
89  int code_range = compressed_.code_range();
    // times_seen[c](i) counts how often code value c appeared at position i.
90  std::vector<RecodedCharID> times_seen(code_range, zeros);
    // The loop runs up to and including unicharset_.size(); that extra id is
    // expected to decode to null_char_ (see the round-trip check below).
91  for (int u = 0; u <= unicharset_.size(); ++u) {
92  if (u != UNICHAR_SPACE && u != null_char_ &&
    // NOTE(review): listing lines 93-94 (the rest of this condition) are
    // missing from this extract; the gutter jumps from 92 to 95.
95  continue; // Not used so not encoded.
96  }
97  RecodedCharID code;
98  int len = compressed_.EncodeUnichar(u, &code);
99  // Check round-trip encoding.
100  int unichar_id;
    // NOTE(review): normed_ids is not referenced anywhere in the visible body.
101  GenericVector<UNICHAR_ID> normed_ids;
102  if (u == null_char_ || u == unicharset_.size()) {
103  unichar_id = null_char_;
104  } else {
105  unichar_id = u;
106  }
107  EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
108  // Check that the codes are valid.
109  for (int i = 0; i < len; ++i) {
110  int code_val = code(i);
111  EXPECT_GE(code_val, 0);
112  EXPECT_LT(code_val, code_range);
    // NOTE(review): the EXPECT_* checks above do not stop execution, so an
    // out-of-range code_val would still be used as an index here.
113  times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
114  }
115  }
116  // Check that each code is used in at least one position.
117  for (int c = 0; c < code_range; ++c) {
118  int num_used = 0;
119  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
120  if (times_seen[c](i) != 0) ++num_used;
121  }
122  EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
123  }
124  // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
125  // and create valid codes.
    // The walk starts from a default-constructed RecodedCharID as the prefix.
126  RecodedCharID code;
127  CheckCodeExtensions(code, times_seen);
128  // Finally, we achieved all that using a codebook < 10% of the size of
129  // the original unicharset, for CK or Indic, and 20% with J, but just
130  // no bigger for all others.
131  if (IsCJKLang(lang) || IsIndicLang(lang)) {
132  EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
133  } else {
134  EXPECT_LE(code_range, unicharset_.size() + 1);
135  }
136  LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to "
137  << code_range;
138  }
139  // Checks for extensions of the current code that either finish a code, or
140  // extend it and checks those extensions recursively.
    // code is the prefix built so far; times_seen holds the per-code,
    // per-position usage counts against which each extension is checked.
141  void CheckCodeExtensions(const RecodedCharID& code,
142  const std::vector<RecodedCharID>& times_seen) {
    // Work on a copy so each candidate can be written at position `length`.
143  RecodedCharID extended = code;
144  int length = code.length();
145  const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code);
146  if (final_codes != nullptr) {
    // Every final code must have been counted at this position and must
    // complete a sequence that decodes to a valid unichar id.
147  for (int i = 0; i < final_codes->size(); ++i) {
148  int ending = (*final_codes)[i];
149  EXPECT_GT(times_seen[ending](length), 0);
150  extended.Set(length, ending);
151  int unichar_id = compressed_.DecodeUnichar(extended);
152  EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
153  }
154  }
155  const GenericVector<int>* next_codes = compressed_.GetNextCodes(code);
156  if (next_codes != nullptr) {
    // Every continuing code must have been counted at this position; the
    // extended prefix is then checked recursively.
157  for (int i = 0; i < next_codes->size(); ++i) {
158  int extension = (*next_codes)[i];
159  EXPECT_GT(times_seen[extension](length), 0);
160  extended.Set(length, extension);
161  CheckCodeExtensions(extended, times_seen);
162  }
163  }
164  }
165 
166  UnicharCompress compressed_;  // Compressor under test; filled by LoadUnicharset.
167  UNICHARSET unicharset_;  // Unicharset loaded from file by LoadUnicharset.
168  int null_char_;  // Unichar id passed to ComputeEncoding as the null char.
169  // The encoding of the null_char_.
170  int encoded_null_char_;
171 };
172 
// Loads chi_tra.unicharset and then chi_sim.unicharset, running ExpectCorrect
// after each load (both fall under the CJK size bound).
173 TEST_F(UnicharcompressTest, DoesChinese) {
174  LOG(INFO) << "Testing chi_tra";
175  LoadUnicharset("chi_tra.unicharset");
176  ExpectCorrect("chi_tra");
177  LOG(INFO) << "Testing chi_sim";
178  LoadUnicharset("chi_sim.unicharset");
179  ExpectCorrect("chi_sim");
180 }
181 
// Loads jpn.unicharset and runs ExpectCorrect; "jpn" is the one language for
// which ExpectCorrect uses the size/5 bound.
182 TEST_F(UnicharcompressTest, DoesJapanese) {
183  LOG(INFO) << "Testing jpn";
184  LoadUnicharset("jpn.unicharset");
185  ExpectCorrect("jpn");
186 }
187 
// Loads kor.unicharset and runs ExpectCorrect with the CJK size bound.
188 TEST_F(UnicharcompressTest, DoesKorean) {
189  LOG(INFO) << "Testing kor";
190  LoadUnicharset("kor.unicharset");
191  ExpectCorrect("kor");
192 }
193 
// Loads kan.unicharset, checks the encoding, then round-trips compressed_
// through SerializeAndUndo and repeats the same checks on the result.
194 TEST_F(UnicharcompressTest, DoesKannada) {
195  LOG(INFO) << "Testing kan";
196  LoadUnicharset("kan.unicharset");
197  ExpectCorrect("kan");
198  SerializeAndUndo();
199  ExpectCorrect("kan");
200 }
201 
// Loads mar.unicharset and runs ExpectCorrect with the Indic size bound.
202 TEST_F(UnicharcompressTest, DoesMarathi) {
203  LOG(INFO) << "Testing mar";
204  LoadUnicharset("mar.unicharset");
205  ExpectCorrect("mar");
206 }
207 
// Loads eng.unicharset and runs ExpectCorrect; "eng" is neither CJK nor Indic,
// so only the "no bigger than size + 1" bound applies.
208 TEST_F(UnicharcompressTest, DoesEnglish) {
209  LOG(INFO) << "Testing eng";
210  LoadUnicharset("eng.unicharset");
211  ExpectCorrect("eng");
212 }
213 
214 // Tests that a unicharset that contains double-letter ligatures (eg ff) has
215 // no null char in the encoding at all.
216 TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
217  LOG(INFO) << "Testing por with ligatures";
218  LoadUnicharset("por.unicharset");
219  ExpectCorrect("por");
220  // Check that no unichar-id that is encoded with multiple codes contains
221  // encoded_null_char_ at any position.
222  for (int u = 0; u <= unicharset_.size(); ++u) {
223  RecodedCharID code;
224  int len = compressed_.EncodeUnichar(u, &code);
225  if (len > 1) {
226  // There should not be any null char in the code.
227  for (int i = 0; i < len; ++i) {
228  EXPECT_NE(encoded_null_char_, code(i));
229  }
230  }
231  }
232 }
233 
234 // Tests that GetEncodingAsString returns the right result for a trivial
235 // unicharset.
236 TEST_F(UnicharcompressTest, GetEncodingAsString) {
237  LoadUnicharset("trivial.unicharset");
238  ExpectCorrect("trivial");
239  STRING encoding = compressed_.GetEncodingAsString(unicharset_);
    // Copy with an explicit length, then split into non-empty lines.
240  std::string encoding_str(&encoding[0], encoding.length());
241  std::vector<std::string> lines =
242  absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
    // Fatal assertion: the indexed checks below would read past the end of the
    // vector if the line count were wrong.
243  ASSERT_EQ(5u, lines.size());
244  // The first line is always space.
245  EXPECT_EQ("0\t ", lines[0]);
246  // Next we have i.
247  EXPECT_EQ("1\ti", lines[1]);
248  // Next we have f.
249  EXPECT_EQ("2\tf", lines[2]);
250  // Next we have the fi ligature: fi. There are no nulls in it, as there are no
251  // repeated letter ligatures in this unicharset, unlike por.unicharset above.
252  EXPECT_EQ("2,1\tfi", lines[3]);
253  // Finally the null character.
254  EXPECT_EQ("3\t<nul>", lines[4]);
255 }
256 
257 } // namespace
258 } // namespace tesseract
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
INFO
Definition: log.h:29
CHECK_OK
#define CHECK_OK(test)
Definition: include_gunit.h:62
STRING
Definition: strngs.h:45
file::Defaults
static int Defaults()
Definition: include_gunit.h:39
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
file::GetContents
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:31
STRING::size
int32_t size() const
Definition: strngs.h:68
UNICHAR_BROKEN
Definition: unicharset.h:36
compressed_
UnicharCompress compressed_
Definition: unicharcompress_test.cc:166
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
file::SetContents
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:35
FLAGS_test_tmpdir
const char * FLAGS_test_tmpdir
Definition: include_gunit.h:20
UNICHAR_SPACE
Definition: unicharset.h:34
encoded_null_char_
int encoded_null_char_
Definition: unicharcompress_test.cc:170
UNICHARSET
Definition: unicharset.h:145
SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
tesseract
Definition: baseapi.h:65
null_char_
int null_char_
Definition: unicharcompress_test.cc:168
unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:167
UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:712
tprintf.h
GenericVector< char >
STRING::length
int32_t length() const
Definition: strngs.cpp:187
tesseract::RecodedCharID::kMaxCodeLen
static const int kMaxCodeLen
Definition: unicharcompress.h:37
unicharcompress.h
tesseract::UnicharCompress::GetEncodingAsString
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
Definition: unicharcompress.cpp:319
serialis.h
log.h
tesseract::UnicharCompress::EncodeUnichar
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
Definition: unicharcompress.cpp:283
LOG
Definition: cleanapi_test.cc:19
GenericVector::size
int size() const
Definition: genericvector.h:71
UNICHARSET::size
int size() const
Definition: unicharset.h:341