tesseract  5.0.0-alpha-619-ge9db
resultiterator_test.cc
Go to the documentation of this file.
1 
3 #include <string>
4 #include "allheaders.h"
5 #include <tesseract/baseapi.h>
7 #include "scrollview.h"
8 
9 #include "include_gunit.h"
10 #include "log.h" // for LOG
11 #include "absl/strings/str_format.h" // for absl::StrFormat
12 
13 namespace {
14 
15 // DEFINE_string(tess_config, "", "config file for tesseract");
16 // DEFINE_bool(visual_test, false, "Runs a visual test using scrollview");
17 
21 
22 // Helper functions for converting to STL vectors
23 template <typename T>
24 void ToVector(const GenericVector<T>& from, std::vector<T>* to) {
25  to->clear();
26  for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
27 }
28 
29 template <typename T>
30 void ToVector(const GenericVectorEqEq<T>& from, std::vector<T>* to) {
31  to->clear();
32  for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
33 }
34 
35 // The fixture for testing Tesseract.
36 class ResultIteratorTest : public testing::Test {
37  protected:
38  std::string TestDataNameToPath(const std::string& name) {
39  return file::JoinPath(TESTING_DIR , name);
40  }
41  std::string TessdataPath() {
42  return file::JoinPath(TESSDATA_DIR, "");
43  }
44  std::string OutputNameToPath(const std::string& name) {
45  return file::JoinPath(FLAGS_test_tmpdir, name);
46  }
47 
48  ResultIteratorTest() { src_pix_ = nullptr; }
49  ~ResultIteratorTest() {}
50 
51  void SetImage(const char* filename) {
52  src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
53  api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
54 // if (!FLAGS_tess_config.empty())
55 // api_.ReadConfigFile(FLAGS_tess_config.c_str());
56  api_.SetPageSegMode(tesseract::PSM_AUTO);
57  api_.SetImage(src_pix_);
58  pixDestroy(&src_pix_);
59  src_pix_ = api_.GetInputImage();
60  }
61 
62  // Rebuilds the image using the binary images at the given level, and
63  // EXPECTs that the number of pixels in the xor of the rebuilt image with
64  // the original is at most max_diff.
65  void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator* it) {
66  it->Begin();
67  int width = pixGetWidth(src_pix_);
68  int height = pixGetHeight(src_pix_);
69  int depth = pixGetDepth(src_pix_);
70  Pix* pix = pixCreate(width, height, depth);
71  EXPECT_TRUE(depth == 1 || depth == 8);
72  if (depth == 8) pixSetAll(pix);
73  do {
74  int left, top, right, bottom;
75  PageIteratorLevel im_level = level;
76  // If the return is false, it is a non-text block so get the block image.
77  if (!it->BoundingBox(level, &left, &top, &right, &bottom)) {
78  im_level = tesseract::RIL_BLOCK;
79  EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom));
80  }
81  LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right
82  << ", B:" << bottom << "]" << "\n";
83  Pix* block_pix;
84  if (depth == 1) {
85  block_pix = it->GetBinaryImage(im_level);
86  pixRasterop(pix, left, top, right - left, bottom - top,
87  PIX_SRC ^ PIX_DST, block_pix, 0, 0);
88  } else {
89  block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top);
90  pixRasterop(pix, left, top, pixGetWidth(block_pix),
91  pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0,
92  0);
93  }
94  CHECK(block_pix != nullptr);
95  pixDestroy(&block_pix);
96  } while (it->Next(level));
97 // if (base::GetFlag(FLAGS_v) >= 1)
98 // pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG);
99  pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0);
100  if (depth == 8) {
101  Pix* binary_pix = pixThresholdToBinary(pix, 128);
102  pixDestroy(&pix);
103  pixInvert(binary_pix, binary_pix);
104  pix = binary_pix;
105  }
106 // if (base::GetFlag(FLAGS_v) >= 1)
107 // pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG);
108  l_int32 pixcount;
109  pixCountPixels(pix, &pixcount, nullptr);
110  if (pixcount > max_diff) {
111  std::string outfile = OutputNameToPath("failedxor.png");
112  LOG(INFO) << "outfile = " << outfile << "\n";
113  pixWrite(outfile.c_str(), pix, IFF_PNG);
114  }
115  pixDestroy(&pix);
116  LOG(INFO) << absl::StrFormat("At level %d: pix diff = %d\n", level, pixcount);
117  EXPECT_LE(pixcount, max_diff);
118 // if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff);
119  }
120 
121  // Rebuilds the text from the iterator strings at the given level, and
122  // EXPECTs that the rebuild string exactly matches the truth string.
123  void VerifyIteratorText(const std::string& truth, PageIteratorLevel level,
124  ResultIterator* it) {
125  LOG(INFO) << "Text Test Level " << level << "\n";
126  it->Begin();
127  std::string result;
128  do {
129  char* text = it->GetUTF8Text(level);
130  result += text;
131  delete[] text;
132  if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) &&
133  it->IsAtFinalElement(tesseract::RIL_WORD, level)) {
134  if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) {
135  result += '\n';
136  } else {
137  result += ' ';
138  }
139  if (it->IsAtFinalElement(tesseract::RIL_PARA, level) &&
140  !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level)))
141  result += '\n';
142  }
143  } while (it->Next(level));
144  EXPECT_STREQ(truth.c_str(), result.c_str())
145  << "Rebuild failed at Text Level " << level;
146  }
147 
148  void VerifyRebuilds(int block_limit, int para_limit, int line_limit,
149  int word_limit, int symbol_limit, PageIterator* it) {
150  VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);
151  VerifyRebuild(para_limit, tesseract::RIL_PARA, it);
152  VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);
153  VerifyRebuild(word_limit, tesseract::RIL_WORD, it);
154  VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it);
155  }
156 
157  void VerifyAllText(const std::string& truth, ResultIterator* it) {
158  VerifyIteratorText(truth, tesseract::RIL_BLOCK, it);
159  VerifyIteratorText(truth, tesseract::RIL_PARA, it);
160  VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it);
161  VerifyIteratorText(truth, tesseract::RIL_WORD, it);
162  VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it);
163  }
164 
165  // Verifies that ResultIterator::CalculateTextlineOrder() produces the right
166  // results given an array of word directions (word_dirs[num_words]), an
167  // expected output reading order
168  // (expected_reading_order[num_reading_order_entries]) and a given reading
169  // context (ltr or rtl).
170  void ExpectTextlineReadingOrder(bool in_ltr_context,
171  StrongScriptDirection* word_dirs,
172  int num_words, int* expected_reading_order,
173  int num_reading_order_entries) const {
175  for (int i = 0; i < num_words; i++) {
176  gv_word_dirs.push_back(word_dirs[i]);
177  }
178 
179  GenericVectorEqEq<int> output;
180  ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
181  &output);
182  // STL vector can be used with EXPECT_EQ, so convert...
183  std::vector<int> correct_order(
184  expected_reading_order,
185  expected_reading_order + num_reading_order_entries);
186  std::vector<int> calculated_order;
187  ToVector(output, &calculated_order);
188  EXPECT_EQ(correct_order, calculated_order);
189  }
190 
191  // Verify that ResultIterator::CalculateTextlineOrder() produces sane output
192  // for a given array of word_dirs[num_words] in ltr or rtl context.
193  // Sane means that the output contains some permutation of the indices
194  // 0..[num_words - 1] interspersed optionally with negative (marker) values.
195  void VerifySaneTextlineOrder(bool in_ltr_context,
196  StrongScriptDirection* word_dirs,
197  int num_words) const {
199  for (int i = 0; i < num_words; i++) {
200  gv_word_dirs.push_back(word_dirs[i]);
201  }
202 
203  GenericVectorEqEq<int> output;
204  ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
205  &output);
206  ASSERT_GE(output.size(), num_words);
207  GenericVector<int> output_copy(output);
208  output_copy.sort();
209  bool sane = true;
210  int j = 0;
211  while (j < output_copy.size() && output_copy[j] < 0) j++;
212  for (int i = 0; i < num_words; i++, j++) {
213  if (output_copy[j] != i) {
214  sane = false;
215  break;
216  }
217  }
218  if (j != output_copy.size()) {
219  sane = false;
220  }
221  if (!sane) {
222  std::vector<int> output_copy2, empty;
223  ToVector(output, &output_copy2);
224  EXPECT_EQ(output_copy2, empty)
225  << " permutation of 0.." << num_words - 1 << " not found in "
226  << (in_ltr_context ? "ltr" : "rtl") << " context.";
227  }
228  }
229 
230  // Objects declared here can be used by all tests in the test case for Foo.
231  Pix* src_pix_; // Borrowed from api_. Do not destroy.
232  std::string ocr_text_;
234 };
235 
236 // Tests layout analysis output (and scrollview) on the UNLV page numbered
237 // 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true.
238 //
239 //TEST_F(ResultIteratorTest, VisualTest) {
240 // if (!FLAGS_visual_test) return;
241 // const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr};
242 // for (int i = 0; kIms[i] != nullptr; ++i) {
243 // SetImage(kIms[i]);
244 // // Just run layout analysis.
245 // PageIterator* it = api_.AnalyseLayout();
246 // EXPECT_FALSE(it == nullptr);
247 // // Make a scrollview window for the display.
248 // int width = pixGetWidth(src_pix_);
249 // int height = pixGetHeight(src_pix_);
250 // ScrollView* win =
251 // new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height);
252 // win->Image(src_pix_, 0, 0);
253 // it->Begin();
254 // ScrollView::Color color = ScrollView::RED;
255 // win->Brush(ScrollView::NONE);
256 // do {
257 // Pta* pts = it->BlockPolygon();
258 // if (pts != nullptr) {
259 // win->Pen(color);
260 // int num_pts = ptaGetCount(pts);
261 // l_float32 x, y;
262 // ptaGetPt(pts, num_pts - 1, &x, &y);
263 // win->SetCursor(static_cast<int>(x), static_cast<int>(y));
264 // for (int p = 0; p < num_pts; ++p) {
265 // ptaGetPt(pts, p, &x, &y);
266 // win->DrawTo(static_cast<int>(x), static_cast<int>(y));
267 // }
268 // }
269 // ptaDestroy(&pts);
270 // } while (it->Next(tesseract::RIL_BLOCK));
271 // win->Update();
272 // delete win->AwaitEvent(SVET_DESTROY);
273 // delete win;
274 // delete it;
275 // }
276 //}
277 
278 // Tests that Tesseract gets exactly the right answer on phototest.
279 TEST_F(ResultIteratorTest, EasyTest) {
280  SetImage("phototest.tif");
281  // Just run layout analysis.
282  PageIterator* p_it = api_.AnalyseLayout();
283  EXPECT_FALSE(p_it == nullptr);
284  // Check iterator position.
285  EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
286  // This should be a single block.
287  EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK));
288  EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
289 
290  // The images should rebuild almost perfectly.
291  LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n";
292  VerifyRebuilds(10, 10, 0, 0, 0, p_it);
293  delete p_it;
294 
295  char* result = api_.GetUTF8Text();
296  ocr_text_ = result;
297  delete[] result;
298  ResultIterator* r_it = api_.GetIterator();
299  // The images should rebuild almost perfectly.
300  LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n";
301  VerifyRebuilds(8, 8, 0, 0, 40, r_it);
302  // Test the text.
303  LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n";
304  VerifyAllText(ocr_text_, r_it);
305 
306  // The images should rebuild almost perfectly.
307  LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n";
308  VerifyRebuilds(8, 8, 0, 0, 40, r_it);
309 
310  r_it->Begin();
311  // Test baseline of the first line.
312  int x1, y1, x2, y2;
313  r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
314  LOG(INFO) << absl::StrFormat("Baseline (%d,%d)->(%d,%d)", x1, y1, x2, y2) << "\n";
315  // Make sure we have a decent vector.
316  EXPECT_GE(x2, x1 + 400);
317  // The point 200,116 should be very close to the baseline.
318  // (x3,y3) is the vector from (x1,y1) to (200,116)
319  int x3 = 200 - x1;
320  int y3 = 116 - y1;
321  x2 -= x1;
322  y2 -= y1;
323  // The cross product (x2,y1)x(x3,y3) should be small.
324  int product = x2 * y3 - x3 * y2;
325  EXPECT_LE(abs(product), x2);
326 
327  // Test font attributes for each word.
328  do {
329  bool bold, italic, underlined, monospace, serif, smallcaps;
330  int pointsize, font_id;
331  const char* font =
332  r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
333  &serif, &smallcaps, &pointsize, &font_id);
334  float confidence = r_it->Confidence(tesseract::RIL_WORD);
335  EXPECT_GE(confidence, 80.0f);
336  char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
337  LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g",
338  word_str, font, font_id, pointsize, confidence) << "\n";
339  delete[] word_str;
340  EXPECT_FALSE(bold);
341  EXPECT_FALSE(italic);
342  EXPECT_FALSE(underlined);
343  EXPECT_FALSE(monospace);
344  EXPECT_FALSE(serif);
345  // The text is about 31 pixels high. Above we say the source is 200 ppi,
346  // which translates to:
347  // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts
348  EXPECT_GE(pointsize, 11.16 - 1.50);
349  EXPECT_LE(pointsize, 11.16 + 1.50);
350  } while (r_it->Next(tesseract::RIL_WORD));
351  delete r_it;
352 }
353 
354 // Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
355 TEST_F(ResultIteratorTest, ComplexTest) {
356  SetImage("8087_054.3B.tif");
357  // Just run layout analysis.
358  PageIterator* it = api_.AnalyseLayout();
359  EXPECT_FALSE(it == nullptr);
360  // The images should rebuild almost perfectly.
361  VerifyRebuilds(400, 400, 400, 400, 650, it);
362  delete it;
363 }
364 
365 // Tests image rebuild on the UNLV page numbered 8087_054.3G.tif. (Dubrovnik)
366 TEST_F(ResultIteratorTest, GreyTest) {
367  SetImage("8087_054.3G.tif");
368  // Just run layout analysis.
369  PageIterator* it = api_.AnalyseLayout();
370  EXPECT_FALSE(it == nullptr);
371  // The images should rebuild almost perfectly.
372  VerifyRebuilds(600, 600, 600, 600, 600, it);
373  delete it;
374 }
375 
376 // Tests that Tesseract gets smallcaps and dropcaps.
377 TEST_F(ResultIteratorTest, SmallCapDropCapTest) {
378  SetImage("8071_093.3B.tif");
379  char* result = api_.GetUTF8Text();
380  delete[] result;
381  ResultIterator* r_it = api_.GetIterator();
382  // Iterate over the words.
383  int found_dropcaps = 0;
384  int found_smallcaps = 0;
385  int false_positives = 0;
386  do {
387  bool bold, italic, underlined, monospace, serif, smallcaps;
388  int pointsize, font_id;
389  r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
390  &smallcaps, &pointsize, &font_id);
391  char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
392  if (word_str != nullptr) {
393  LOG(INFO) << absl::StrFormat("Word %s is %s", word_str,
394  smallcaps ? "SMALLCAPS" : "Normal") << "\n";
395  if (r_it->SymbolIsDropcap()) {
396  ++found_dropcaps;
397  }
398  if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 ||
399  strcmp(word_str, "RALPH") == 0 ||
400  strcmp(word_str, "KINNEY") == 0 || // Not working yet.
401  strcmp(word_str, "BENNETT") == 0) {
402  EXPECT_TRUE(smallcaps) << word_str;
403  ++found_smallcaps;
404  } else {
405  if (smallcaps) ++false_positives;
406  }
407  // No symbol other than the first of any word should be dropcap.
408  ResultIterator s_it(*r_it);
409  while (s_it.Next(tesseract::RIL_SYMBOL) &&
410  !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) {
411  if (s_it.SymbolIsDropcap()) {
412  char* sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
413  LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str,
414  word_str);
415  delete[] sym_str;
416  }
417  EXPECT_FALSE(s_it.SymbolIsDropcap());
418  }
419  delete[] word_str;
420  }
421  } while (r_it->Next(tesseract::RIL_WORD));
422  delete r_it;
423  EXPECT_EQ(1, found_dropcaps);
424  EXPECT_GE(4, found_smallcaps);
425  EXPECT_LE(false_positives, 3);
426 }
427 
428 #if 0
429 // TODO(rays) uncomment on the next change to layout analysis.
430 // CL 22736106 breaks it, but it is fixed in the change when
431 // the textline finders start to collapse.
432 
433 // Tests that Tesseract gets subscript and superscript.
434 // TODO(rays) This test is a bit feeble, due to bad textline finding on this
435 // image, so beef up the test a bit when we get less false positive subs.
436 TEST_F(ResultIteratorTest, SubSuperTest) {
437  SetImage("0146_281.3B.tif");
438  char* result = api_.GetUTF8Text();
439  delete [] result;
440  ResultIterator* r_it = api_.GetIterator();
441  // Iterate over the symbols.
442  // Accuracy isn't great, so just count up and expect a decent count of
443  // positives and negatives.
444  const char kAllowedSupers[] = "O0123456789-";
445  int found_subs = 0;
446  int found_supers = 0;
447  int found_normal = 0;
448  do {
449  if (r_it->SymbolIsSubscript()) {
450  ++found_subs;
451  } else if (r_it->SymbolIsSuperscript()) {
452  result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL);
453  if (strchr(kAllowedSupers, result[0]) == nullptr) {
454  char* word = r_it->GetUTF8Text(tesseract::RIL_WORD);
455  LOG(ERROR) << absl::StrFormat("Char %s in word %s is unexpected super!",
456  result, word);
457  delete [] word;
458  EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr);
459  }
460  delete [] result;
461  ++found_supers;
462  } else {
463  ++found_normal;
464  }
465  } while (r_it->Next(tesseract::RIL_SYMBOL));
466  delete r_it;
467  LOG(INFO) << absl::StrFormat("Subs = %d, supers= %d, normal = %d",
468  found_subs, found_supers, found_normal) << "\n";
469  EXPECT_GE(found_subs, 25);
470  EXPECT_GE(found_supers, 25);
471  EXPECT_GE(found_normal, 1350);
472 }
473 #endif
474 
475 static const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT;
476 static const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT;
477 static const StrongScriptDirection dN = DIR_NEUTRAL;
478 static const StrongScriptDirection dZ = DIR_MIX;
479 
480 // Test that a sequence of words that could be interpreted to start from
481 // the left side left-to-right or from the right side right-to-left is
482 // interpreted appropriately in different contexts.
483 TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) {
484  StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR};
485  int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart,
486  0, 1, 2, 3, ResultIterator::kMinorRunEnd};
487  int reading_order_ltr_context[] = {0, 1,
488  2, 3,
489  4, ResultIterator::kMinorRunStart,
490  7, 6,
491  5, ResultIterator::kMinorRunEnd};
492 
493  ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs),
494  reading_order_ltr_context,
495  ABSL_ARRAYSIZE(reading_order_ltr_context));
496  ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
497  reading_order_rtl_context,
498  ABSL_ARRAYSIZE(reading_order_rtl_context));
499 }
500 
501 // Tests that clearly left-direction text (with no right-to-left indications)
502 // comes out strictly left to right no matter the context.
503 TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) {
504  StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL};
505  // The order here is just left to right, nothing fancy.
506  int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7};
507  // In the strange event that this shows up in an RTL paragraph, nonetheless
508  // just presume the whole thing is an LTR line.
509  int reading_order_rtl_context[] = {
510  ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,
511  ResultIterator::kMinorRunEnd};
512 
513  ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs),
514  reading_order_ltr_context,
515  ABSL_ARRAYSIZE(reading_order_ltr_context));
516  ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
517  reading_order_rtl_context,
518  ABSL_ARRAYSIZE(reading_order_rtl_context));
519 }
520 
521 // Test that right-direction text comes out strictly right-to-left in
522 // a right-to-left context.
523 TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) {
524  StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR};
525  // The order here is just right-to-left, nothing fancy.
526  int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0};
527  ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
528  reading_order_rtl_context,
529  ABSL_ARRAYSIZE(reading_order_rtl_context));
530 }
531 
532 TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) {
533  // Iterate through all 7-word sequences and make sure that the output
534  // contains each of the indices 0..6 exactly once.
535  const int kNumWords(7);
536  const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations
537  StrongScriptDirection word_dirs[kNumWords];
538  for (int i = 0; i < kNumCombos; i++) {
539  // generate the next combination.
540  int tmp = i;
541  for (int j = 0; j < kNumWords; j++) {
542  word_dirs[j] = static_cast<StrongScriptDirection>(tmp % 4);
543  tmp = tmp / 4;
544  }
545  VerifySaneTextlineOrder(true, word_dirs, kNumWords);
546  VerifySaneTextlineOrder(false, word_dirs, kNumWords);
547  }
548 }
549 
550 // TODO: Missing image
551 TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) {
552  SetImage("5318c4b679264.jpg");
553  char* result = api_.GetUTF8Text();
554  delete[] result;
555  ResultIterator* r_it = api_.GetIterator();
556  // Iterate over the words.
557  do {
558  char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
559  if (word_str != nullptr) {
560  LOG(INFO) << absl::StrFormat("Word %s:", word_str) << "\n";
561  ResultIterator s_it = *r_it;
562  do {
563  tesseract::ChoiceIterator c_it(s_it);
564  do {
565  const char* char_str = c_it.GetUTF8Text();
566  if (char_str == nullptr)
567  LOG(INFO) << "Null char choice" << "\n";
568  else
569  LOG(INFO) << "Char choice " << char_str << "\n";
570  CHECK(char_str != nullptr);
571  } while (c_it.Next());
572  } while (
573  !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
574  s_it.Next(tesseract::RIL_SYMBOL));
575  delete[] word_str;
576  }
577  } while (r_it->Next(tesseract::RIL_WORD));
578  delete r_it;
579 }
580 
581 // TODO: Missing image
582 TEST_F(ResultIteratorTest, NonNullConfidencesTest) {
583 // SetImage("line6.tiff");
584  SetImage("trainingitalline.tif");
585  api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
586  // Force recognition so we can used the result iterator.
587  // We don't care about the return from GetUTF8Text.
588  char* result = api_.GetUTF8Text();
589  delete[] result;
590  ResultIterator* r_it = api_.GetIterator();
591  // Iterate over the words.
592  do {
593  char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
594  if (word_str != nullptr) {
595  EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD));
596  EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL));
597  ResultIterator s_it = *r_it;
598  do {
599  const char* char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
600  CHECK(char_str != nullptr);
601  float confidence = s_it.Confidence(tesseract::RIL_SYMBOL);
602  LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str,
603  confidence);
604  delete[] char_str;
605  } while (
606  !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
607  s_it.Next(tesseract::RIL_SYMBOL));
608  delete[] word_str;
609  } else {
610  LOG(INFO) << "Empty word found" << "\n";
611  }
612  } while (r_it->Next(tesseract::RIL_WORD));
613  delete r_it;
614 }
615 
616 } // namespace
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
INFO
Definition: log.h:29
tesseract::RIL_WORD
Definition: publictypes.h:220
tesseract::RIL_BLOCK
Definition: publictypes.h:217
tesseract::ChoiceIterator
Definition: ltrresultiterator.h:186
tesseract::PageIterator
Definition: pageiterator.h:52
StrongScriptDirection
StrongScriptDirection
Definition: unichar.h:43
tesseract::PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
ERROR
Definition: log.h:29
resultiterator.h
DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
tesseract::RIL_SYMBOL
Definition: publictypes.h:221
genericvector.h
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
baseapi.h
FLAGS_test_tmpdir
const char * FLAGS_test_tmpdir
Definition: include_gunit.h:20
tesseract::TessBaseAPI
Definition: baseapi.h:98
tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:216
GenericVectorEqEq
Definition: genericvector.h:640
tesseract::RIL_TEXTLINE
Definition: publictypes.h:219
GenericVector
Definition: baseapi.h:40
DIR_NEUTRAL
Definition: unichar.h:44
tesseract::ResultIterator
Definition: resultiterator.h:44
tesseract::PSM_AUTO
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:164
DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
DIR_MIX
Definition: unichar.h:47
log.h
LOG
Definition: cleanapi_test.cc:19
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::OEM_TESSERACT_ONLY
Definition: publictypes.h:266
scrollview.h
tesseract::RIL_PARA
Definition: publictypes.h:218