tesseract  5.0.0-alpha-619-ge9db
recodebeam_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "recodebeam.h"
13 #include "matrix.h"
14 #include "pageres.h"
15 #include "ratngs.h"
17 #include <tesseract/helpers.h>
18 #include "unicharcompress.h"
19 #include "normstrngs.h"
21 
22 #include "include_gunit.h"
23 #include "log.h" // for LOG
24 #include "absl/strings/str_format.h" // for absl::StrFormat
25 
26 using tesseract::CCUtil;
27 using tesseract::Dict;
32 using tesseract::TRand;
34 
35 namespace {
36 
// Number of characters to test beam search with.
const int kNumChars = 100;
// Amount of extra random data to pad with after.
const int kPadding = 64;

// Dictionary test data. Each table of strings is terminated by nullptr and
// is paired with a table holding one score per string; an empty string stands
// for the null character.
//
// The top choice is: "Gef s wordsright.".
// The desired phrase is "Gets words right.".
// There is a competing dictionary phrase: "Get swords right.".
// ... due to the following errors from the network:
// f stronger than t in "Get".
// weak space between Gef and s and between s and words.
// weak space between words and right.
const char* kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
                          "s", "",  "r", "i", "g", "h", "t", ".", nullptr};
const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65,
                               0.89, 0.99, 0.99, 0.99, 0.99, 0.95,
                               0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
const char* kGWR2nds[] = {"C", "c", "t", "",  "S", "",  "W", "O", "t", "h",
                          "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25,
                               0.10, 0.01, 0.01, 0.01, 0.01, 0.05,
                               0.01, 0.09, 0.09, 0.09, 0.05, 0.25};

// Chinese test data: first and second choices per position.
const char* kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
const char* kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

// Vietnamese test data: first and second choices per position.
const char* kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
const char* kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

// The string tables and score tables are walked in lock-step up to the
// nullptr sentinel, so each string table must be exactly one entry longer
// than its score table. Catch a mismatch at compile time.
static_assert(sizeof(kGWRTops) / sizeof(kGWRTops[0]) ==
                  sizeof(kGWRTopScores) / sizeof(kGWRTopScores[0]) + 1,
              "kGWRTops needs one score per string plus a nullptr sentinel");
static_assert(sizeof(kGWR2nds) / sizeof(kGWR2nds[0]) ==
                  sizeof(kGWR2ndScores) / sizeof(kGWR2ndScores[0]) + 1,
              "kGWR2nds needs one score per string plus a nullptr sentinel");
static_assert(sizeof(kZHTops) / sizeof(kZHTops[0]) ==
                  sizeof(kZHTopScores) / sizeof(kZHTopScores[0]) + 1,
              "kZHTops needs one score per string plus a nullptr sentinel");
static_assert(sizeof(kZH2nds) / sizeof(kZH2nds[0]) ==
                  sizeof(kZH2ndScores) / sizeof(kZH2ndScores[0]) + 1,
              "kZH2nds needs one score per string plus a nullptr sentinel");
static_assert(sizeof(kViTops) / sizeof(kViTops[0]) ==
                  sizeof(kViTopScores) / sizeof(kViTopScores[0]) + 1,
              "kViTops needs one score per string plus a nullptr sentinel");
static_assert(sizeof(kVi2nds) / sizeof(kVi2nds[0]) ==
                  sizeof(kVi2ndScores) / sizeof(kVi2ndScores[0]) + 1,
              "kVi2nds needs one score per string plus a nullptr sentinel");
69 
70 class RecodeBeamTest : public ::testing::Test {
71  protected:
72  void SetUp() {
73  std::locale::global(std::locale(""));
74  }
75 
76  RecodeBeamTest() : lstm_dict_(&ccutil_) {}
77  ~RecodeBeamTest() { lstm_dict_.End(); }
78 
79  // Loads and compresses the given unicharset.
80  void LoadUnicharset(const std::string& unicharset_name) {
81  std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR,
82  "radical-stroke.txt");
83  std::string unicharset_file =
84  file::JoinPath(TESTDATA_DIR, unicharset_name);
85  std::string radical_data;
86  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
87  file::Defaults()));
88  CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
89  unichar_null_char_ = ccutil_.unicharset.has_special_codes()
91  : ccutil_.unicharset.size();
92  STRING radical_str(radical_data.c_str());
93  EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_,
94  &radical_str));
95  RecodedCharID code;
96  recoder_.EncodeUnichar(unichar_null_char_, &code);
97  encoded_null_char_ = code(0);
98  // Space should encode as itself.
99  recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
100  EXPECT_EQ(UNICHAR_SPACE, code(0));
101  std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
102  STRING encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
103  std::string encoding_str(&encoding[0], encoding.size());
104  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
105  LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
106  }
107  // Loads the dictionary.
108  void LoadDict(const std::string& lang) {
109  std::string traineddata_name = lang + ".traineddata";
110  std::string traineddata_file =
111  file::JoinPath(TESTDATA_DIR, traineddata_name);
112  lstm_dict_.SetupForLoad(nullptr);
114  mgr.Init(traineddata_file.c_str());
115  lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
116  lstm_dict_.FinishLoad();
117  }
118 
119  // Expects the appropriate results from the compressed_ ccutil_.unicharset.
120  void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output,
121  const GenericVector<int>& transcription) {
122  // Get the utf8 string of the transcription.
123  std::string truth_utf8;
124  for (int i = 0; i < transcription.size(); ++i) {
125  truth_utf8 += ccutil_.unicharset.id_to_unichar(transcription[i]);
126  }
128  ExpectCorrect(output, truth_utf8, nullptr, &words);
129  }
130  void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output,
131  const std::string& truth_utf8, Dict* dict,
132  PointerVector<WERD_RES>* words) {
133  RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
134  beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
135  // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
136  // beam_search.DebugBeams(ccutil_.unicharset);
137  GenericVector<int> labels, xcoords;
138  beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
139  LOG(INFO) << "Labels size = " << labels.size() << " coords "
140  << xcoords.size() << "\n";
141  // Now decode using recoder_.
142  std::string decoded;
143  int end = 1;
144  for (int start = 0; start < labels.size(); start = end) {
145  RecodedCharID code;
146  int index = start;
147  int uni_id = INVALID_UNICHAR_ID;
148  do {
149  code.Set(code.length(), labels[index++]);
150  uni_id = recoder_.DecodeUnichar(code);
151  } while (index < labels.size() &&
152  code.length() < RecodedCharID::kMaxCodeLen &&
153  (uni_id == INVALID_UNICHAR_ID ||
154  !recoder_.IsValidFirstCode(labels[index])));
155  EXPECT_NE(INVALID_UNICHAR_ID, uni_id)
156  << "index=" << index << "/" << labels.size();
157  // To the extent of truth_utf8, we expect decoded to match, but if
158  // transcription is shorter, that is OK too, as we may just be testing
159  // that we get a valid sequence when padded with random data.
160  if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size())
161  decoded += ccutil_.unicharset.id_to_unichar(uni_id);
162  end = index;
163  }
164  EXPECT_EQ(truth_utf8, decoded);
165 
166  // Check that ExtractBestPathAsUnicharIds does the same thing.
167  GenericVector<int> unichar_ids;
168  GenericVector<float> certainties, ratings;
169  beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset,
170  &unichar_ids, &certainties,
171  &ratings, &xcoords);
172  std::string u_decoded;
173  float total_rating = 0.0f;
174  for (int u = 0; u < unichar_ids.size(); ++u) {
175  // To the extent of truth_utf8, we expect decoded to match, but if
176  // transcription is shorter, that is OK too, as we may just be testing
177  // that we get a valid sequence when padded with random data.
178  if (u_decoded.size() < truth_utf8.size()) {
179  const char* str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
180  total_rating += ratings[u];
181  LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u,
182  unichar_ids[u], str, certainties[u],
183  ratings[u], total_rating, xcoords[u]) << "\n";
184  if (str[0] == ' ') total_rating = 0.0f;
185  u_decoded += str;
186  }
187  }
188  EXPECT_EQ(truth_utf8, u_decoded);
189 
190  // Check that ExtractBestPathAsWords does the same thing.
191  TBOX line_box(0, 0, 100, 10);
192  for (int i = 0; i < 2; ++i) {
193  beam_search.ExtractBestPathAsWords(line_box, 1.0f, false,
194  &ccutil_.unicharset, words);
195  std::string w_decoded;
196  for (int w = 0; w < words->size(); ++w) {
197  const WERD_RES* word = (*words)[w];
198  if (w_decoded.size() < truth_utf8.size()) {
199  if (!w_decoded.empty() && word->word->space()) w_decoded += " ";
200  w_decoded += word->best_choice->unichar_string().c_str();
201  }
202  LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w,
203  word->best_choice->unichar_string().c_str(),
204  word->best_choice->certainty(),
205  word->best_choice->rating(),
206  word->best_choice->permuter()) << "\n";
207  }
208  std::string w_trunc(w_decoded.data(), truth_utf8.size());
209  if (truth_utf8 != w_trunc) {
212  tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
213  w_trunc.assign(w_decoded.data(), truth_utf8.size());
214  }
215  EXPECT_EQ(truth_utf8, w_trunc);
216  }
217  }
218  // Generates easy encoding of the given unichar_ids, and pads with at least
219  // padding of random data.
220  GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(
221  const GenericVector<int>& unichar_ids, int padding) {
222  int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
223  int num_codes = recoder_.code_range();
224  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
225  // Fill with random data.
226  TRand random;
227  for (int t = 0; t < width; ++t) {
228  for (int i = 0; i < num_codes; ++i)
229  outputs(t, i) = random.UnsignedRand(0.25);
230  }
231  int t = 0;
232  for (int i = 0; i < unichar_ids.size(); ++i) {
233  RecodedCharID code;
234  int len = recoder_.EncodeUnichar(unichar_ids[i], &code);
235  EXPECT_NE(0, len);
236  for (int j = 0; j < len; ++j) {
237  // Make the desired answer a clear winner.
238  if (j > 0 && code(j) == code(j - 1)) {
239  // We will collapse adjacent equal codes so put a null in between.
240  outputs(t++, encoded_null_char_) = 1.0f;
241  }
242  outputs(t++, code(j)) = 1.0f;
243  }
244  // Put a 0 as a null char in between.
245  outputs(t++, encoded_null_char_) = 1.0f;
246  }
247  // Normalize the probs.
248  for (int t = 0; t < width; ++t) {
249  double sum = 0.0;
250  for (int i = 0; i < num_codes; ++i) sum += outputs(t, i);
251  for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum;
252  }
253 
254  return outputs;
255  }
256  // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
257  // the score for the appropriate sequence of codes, returning the ending t.
258  int EncodeUTF8(const char* utf8_str, float score, int start_t, TRand* random,
259  GENERIC_2D_ARRAY<float>* outputs) {
260  int t = start_t;
261  GenericVector<int> unichar_ids;
262  EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids,
263  nullptr, nullptr));
264  if (unichar_ids.empty() || utf8_str[0] == '\0') {
265  unichar_ids.clear();
266  unichar_ids.push_back(unichar_null_char_);
267  }
268  int num_ids = unichar_ids.size();
269  for (int u = 0; u < num_ids; ++u) {
270  RecodedCharID code;
271  int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
272  EXPECT_NE(0, len);
273  for (int i = 0; i < len; ++i) {
274  // Apply the desired score.
275  (*outputs)(t++, code(i)) = score;
276  if (random != nullptr &&
277  t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
278  int dups = static_cast<int>(random->UnsignedRand(3.0));
279  for (int d = 0; d < dups; ++d) {
280  // Duplicate the desired score.
281  (*outputs)(t++, code(i)) = score;
282  }
283  }
284  }
285  if (random != nullptr &&
286  t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
287  int dups = static_cast<int>(random->UnsignedRand(3.0));
288  for (int d = 0; d < dups; ++d) {
289  // Add a random number of nulls as well.
290  (*outputs)(t++, encoded_null_char_) = score;
291  }
292  }
293  }
294  return t;
295  }
296  // Generates an encoding of the given 4 arrays as synthetic network scores.
297  // uses scores1 for chars1 and scores2 for chars2, and everything else gets
298  // the leftovers shared out equally. Note that empty string encodes as the
299  // null_char_.
300  GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char* chars1[],
301  const float scores1[],
302  const char* chars2[],
303  const float scores2[],
304  TRand* random) {
305  int width = 0;
306  while (chars1[width] != nullptr) ++width;
307  int padding = width * RecodedCharID::kMaxCodeLen;
308  int num_codes = recoder_.code_range();
309  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
310  int t = 0;
311  for (int i = 0; i < width; ++i) {
312  // In case there is overlap in the codes between 1st and 2nd choice, it
313  // is better to encode the 2nd choice first.
314  int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
315  int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
316  // Advance t to the max end, setting everything else to the leftovers.
317  int max_t = std::max(end_t1, end_t2);
318  int min_t = std::min(end_t1, end_t2);
319  while (t < max_t) {
320  double total_score = 0.0;
321  for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j);
322  double null_remainder = (1.0 - total_score) / 2.0;
323  double remainder = null_remainder / (num_codes - 2);
324  if (outputs(t, encoded_null_char_) < null_remainder) {
325  outputs(t, encoded_null_char_) += null_remainder;
326  } else {
327  remainder += remainder;
328  }
329  for (int j = 0; j < num_codes; ++j) {
330  if (outputs(t, j) == 0.0f) outputs(t, j) = remainder;
331  }
332  ++t;
333  }
334  }
335  // Fill the rest with null chars.
336  while (t < width + padding) {
337  outputs(t++, encoded_null_char_) = 1.0f;
338  }
339  return outputs;
340  }
341  UnicharCompress recoder_;
342  int unichar_null_char_ = 0;
343  int encoded_null_char_ = 0;
344  CCUtil ccutil_;
345  Dict lstm_dict_;
346 };
347 
348 TEST_F(RecodeBeamTest, DoesChinese) {
349  LOG(INFO) << "Testing chi_tra" << "\n";
350  LoadUnicharset("chi_tra.unicharset");
351  // Correctly reproduce the first kNumchars characters from easy output.
352  GenericVector<int> transcription;
353  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
354  transcription.push_back(i);
355  GENERIC_2D_ARRAY<float> outputs =
356  GenerateRandomPaddedOutputs(transcription, kPadding);
357  ExpectCorrect(outputs, transcription);
358  LOG(INFO) << "Testing chi_sim" << "\n";
359  LoadUnicharset("chi_sim.unicharset");
360  // Correctly reproduce the first kNumchars characters from easy output.
361  transcription.clear();
362  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
363  transcription.push_back(i);
364  outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
365  ExpectCorrect(outputs, transcription);
366 }
367 
368 TEST_F(RecodeBeamTest, DoesJapanese) {
369  LOG(INFO) << "Testing jpn" << "\n";
370  LoadUnicharset("jpn.unicharset");
371  // Correctly reproduce the first kNumchars characters from easy output.
372  GenericVector<int> transcription;
373  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
374  transcription.push_back(i);
375  GENERIC_2D_ARRAY<float> outputs =
376  GenerateRandomPaddedOutputs(transcription, kPadding);
377  ExpectCorrect(outputs, transcription);
378 }
379 
380 TEST_F(RecodeBeamTest, DoesKorean) {
381  LOG(INFO) << "Testing kor" << "\n";
382  LoadUnicharset("kor.unicharset");
383  // Correctly reproduce the first kNumchars characters from easy output.
384  GenericVector<int> transcription;
385  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
386  transcription.push_back(i);
387  GENERIC_2D_ARRAY<float> outputs =
388  GenerateRandomPaddedOutputs(transcription, kPadding);
389  ExpectCorrect(outputs, transcription);
390 }
391 
392 TEST_F(RecodeBeamTest, DoesKannada) {
393  LOG(INFO) << "Testing kan" << "\n";
394  LoadUnicharset("kan.unicharset");
395  // Correctly reproduce the first kNumchars characters from easy output.
396  GenericVector<int> transcription;
397  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
398  transcription.push_back(i);
399  GENERIC_2D_ARRAY<float> outputs =
400  GenerateRandomPaddedOutputs(transcription, kPadding);
401  ExpectCorrect(outputs, transcription);
402 }
403 
404 TEST_F(RecodeBeamTest, DoesMarathi) {
405  LOG(INFO) << "Testing mar" << "\n";
406  LoadUnicharset("mar.unicharset");
407  // Correctly reproduce the first kNumchars characters from easy output.
408  GenericVector<int> transcription;
409  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
410  transcription.push_back(i);
411  GENERIC_2D_ARRAY<float> outputs =
412  GenerateRandomPaddedOutputs(transcription, kPadding);
413  ExpectCorrect(outputs, transcription);
414 }
415 
416 TEST_F(RecodeBeamTest, DoesEnglish) {
417  LOG(INFO) << "Testing eng" << "\n";
418  LoadUnicharset("eng.unicharset");
419  // Correctly reproduce the first kNumchars characters from easy output.
420  GenericVector<int> transcription;
421  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
422  transcription.push_back(i);
423  GENERIC_2D_ARRAY<float> outputs =
424  GenerateRandomPaddedOutputs(transcription, kPadding);
425  ExpectCorrect(outputs, transcription);
426 }
427 
428 TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
429  LOG(INFO) << "Testing eng dictionary" << "\n";
430  LoadUnicharset("eng_beam.unicharset");
431  GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
432  kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
433  std::string default_str;
434  for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i];
436  ExpectCorrect(outputs, default_str, nullptr, &words);
437  // Now try again with the dictionary.
438  LoadDict("eng_beam");
439  ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
440 }
441 
442 TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
443  LOG(INFO) << "Testing zh_hans dictionary" << "\n";
444  LoadUnicharset("zh_hans.unicharset");
445  GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
446  kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
448  ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
449  // Each is an individual word, with permuter = top choice.
450  EXPECT_EQ(7, words.size());
451  for (int w = 0; w < words.size(); ++w) {
452  EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
453  }
454  // Now try again with the dictionary.
455  LoadDict("zh_hans");
456  ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
457  // Number of words expected.
458  const int kNumWords = 5;
459  // Content of the words.
460  const char* kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
461  // Permuters of the words.
462  const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM,
465  EXPECT_EQ(kNumWords, words.size());
466  for (int w = 0; w < kNumWords && w < words.size(); ++w) {
467  EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
468  EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
469  }
470 }
471 
472 // Tests that a recoder built with decomposed unicode allows true ctc
473 // arbitrary duplicates and inserted nulls inside the multicode sequence.
474 TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
475  LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n";
476  LoadUnicharset("vie.d.unicharset");
477  tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
478  TRand random;
479  GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
480  kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
482  std::string truth_str;
485  tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
486  ExpectCorrect(outputs, truth_str, nullptr, &words);
487 }
488 
489 } // namespace
tesseract::OCRNorm::kNone
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::NormalizeUTF8String
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:163
INFO
Definition: log.h:29
CHECK_OK
#define CHECK_OK(test)
Definition: include_gunit.h:62
unicharset_training_utils.h
pageres.h
tesseract::RecodeBeamSearch
Definition: recodebeam.h:180
tesseract::TessdataManager
Definition: tessdatamanager.h:126
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::PointerVector
Definition: genericvector.h:417
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
STRING
Definition: strngs.h:45
tesseract::SetupBasicProperties
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
Definition: unicharset_training_utils.cpp:40
recodebeam.h
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
file::Defaults
static int Defaults()
Definition: include_gunit.h:39
WERD_RES
Definition: pageres.h:160
tesseract::RecodeNode
Definition: recodebeam.h:93
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
file::GetContents
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:31
GENERIC_2D_ARRAY< float >
ratngs.h
genericvector.h
STRING::size
int32_t size() const
Definition: strngs.h:68
UNICHAR_BROKEN
Definition: unicharset.h:36
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
file::SetContents
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:35
WERD::space
uint8_t space()
Definition: werd.h:98
FLAGS_test_tmpdir
const char * FLAGS_test_tmpdir
Definition: include_gunit.h:20
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::RecodedCharID::Set
void Set(int index, int value)
Definition: unicharcompress.h:44
UNICHAR_SPACE
Definition: unicharset.h:34
encoded_null_char_
int encoded_null_char_
Definition: unicharcompress_test.cc:170
matrix.h
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::UnicodeNormMode::kNFKC
helpers.h
SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
TOP_CHOICE_PERM
Definition: ratngs.h:233
tesseract::RecodedCharID::length
int length() const
Definition: unicharcompress.h:57
tesseract::RecodedCharID
Definition: unicharcompress.h:34
GenericVector< int >
normstrngs.h
tesseract::Dict
Definition: dict.h:91
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
GenericVector::clear
void clear()
Definition: genericvector.h:857
unicharcompress.h
tesseract::TRand::UnsignedRand
double UnsignedRand(double range)
Definition: helpers.h:89
WERD_RES::word
WERD * word
Definition: pageres.h:180
log.h
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
LOG
Definition: cleanapi_test.cc:19
tesseract::UnicharCompress
Definition: unicharcompress.h:128
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TRand
Definition: helpers.h:50
tesseract::CCUtil
Definition: ccutil.h:40
GENERIC_2D_ARRAY::dim1
int dim1() const
Definition: matrix.h:205
tesseract::OCRNorm::kNormalize
tesseract::UnicodeNormMode::kNFKD
TBOX
Definition: rect.h:33