tesseract  4.0.0-1-g2a2b
applybox.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: applybox.cpp (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author: Phil Cheatle
5  * Created: Wed Nov 24 09:11:23 GMT 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <cctype>
21 #include <cerrno>
22 #include <cstring>
23 #include "allheaders.h"
24 #include "boxread.h"
25 #ifndef DISABLED_LEGACY_ENGINE
26 #include "chopper.h"
27 #endif
28 #include "pageres.h"
29 #include "unichar.h"
30 #include "unicharset.h"
31 #include "tesseractclass.h"
32 #include "genericvector.h"
33 
35 const int kMaxGroupSize = 4;
38 const double kMaxXHeightDeviationFraction = 0.125;
39 
75 namespace tesseract {
76 
77 #ifndef DISABLED_LEGACY_ENGINE
78 static void clear_any_old_text(BLOCK_LIST *block_list) {
79  BLOCK_IT block_it(block_list);
80  for (block_it.mark_cycle_pt();
81  !block_it.cycled_list(); block_it.forward()) {
82  ROW_IT row_it(block_it.data()->row_list());
83  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
84  WERD_IT word_it(row_it.data()->word_list());
85  for (word_it.mark_cycle_pt();
86  !word_it.cycled_list(); word_it.forward()) {
87  word_it.data()->set_text("");
88  }
89  }
90  }
91 }
92 
93 // Applies the box file based on the image name fname, and resegments
94 // the words in the block_list (page), with:
95 // blob-mode: one blob per line in the box file, words as input.
96 // word/line-mode: one blob per space-delimited unit after the #, and one word
97 // per line in the box file. (See comment above for box file format.)
98 // If find_segmentation is true, (word/line mode) then the classifier is used
99 // to re-segment words/lines to match the space-delimited truth string for
100 // each box. In this case, the input box may be for a word or even a whole
101 // text line, and the output words will contain multiple blobs corresponding
102 // to the space-delimited input string.
103 // With find_segmentation false, no classifier is needed, but the chopper
104 // can still be used to correctly segment touching characters with the help
105 // of the input boxes.
106 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
107 // from normal classification, ie. with a word, chopped_word, rebuild_word,
108 // seam_array, denorm, box_word, and best_state, but NO best_choice or
109 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
110 // Instead, the correct_text member of WERD_RES is set, and this may be later
111 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
112 // is not required before calling ApplyBoxTraining.
114  bool find_segmentation,
115  BLOCK_LIST *block_list) {
116  GenericVector<TBOX> boxes;
117  GenericVector<STRING> texts, full_texts;
118  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
119  nullptr)) {
120  return nullptr; // Can't do it.
121  }
122 
123  const int box_count = boxes.size();
124  int box_failures = 0;
125 
126  // In word mode, we use the boxes to make a word for each box, but
127  // in blob mode we use the existing words and maximally chop them first.
128  PAGE_RES* page_res = find_segmentation ?
129  nullptr : SetupApplyBoxes(boxes, block_list);
130  clear_any_old_text(block_list);
131 
132  for (int i = 0; i < box_count; i++) {
133  bool foundit = false;
134  if (page_res != nullptr) {
135  foundit = ResegmentCharBox(page_res,
136  (i == 0) ? nullptr : &boxes[i - 1],
137  boxes[i],
138  (i == box_count - 1) ? nullptr : &boxes[i + 1],
139  full_texts[i].string());
140  } else {
141  foundit = ResegmentWordBox(block_list, boxes[i],
142  (i == box_count - 1) ? nullptr : &boxes[i + 1],
143  texts[i].string());
144  }
145  if (!foundit) {
146  box_failures++;
147  ReportFailedBox(i, boxes[i], texts[i].string(),
148  "FAILURE! Couldn't find a matching blob");
149  }
150  }
151 
152  if (page_res == nullptr) {
153  // In word/line mode, we now maximally chop all the words and resegment
154  // them with the classifier.
155  page_res = SetupApplyBoxes(boxes, block_list);
156  ReSegmentByClassification(page_res);
157  }
158  if (applybox_debug > 0) {
159  tprintf("APPLY_BOXES:\n");
160  tprintf(" Boxes read from boxfile: %6d\n", box_count);
161  if (box_failures > 0)
162  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
163  }
164  TidyUp(page_res);
165  return page_res;
166 }
167 #endif // ndef DISABLED_LEGACY_ENGINE
168 
169 // Helper computes median xheight in the image.
170 static double MedianXHeight(BLOCK_LIST *block_list) {
171  BLOCK_IT block_it(block_list);
172  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
173  for (block_it.mark_cycle_pt();
174  !block_it.cycled_list(); block_it.forward()) {
175  ROW_IT row_it(block_it.data()->row_list());
176  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
177  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
178  }
179  }
180  return xheights.median();
181 }
182 
185 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
186  const double median_xheight = MedianXHeight(block_list);
187  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
188  // Strip all fuzzy space markers to simplify the PAGE_RES.
189  BLOCK_IT b_it(block_list);
190  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
191  BLOCK* block = b_it.data();
192  ROW_IT r_it(block->row_list());
193  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
194  ROW* row = r_it.data();
195  const double diff = fabs(row->x_height() - median_xheight);
196  if (diff > max_deviation) {
197  if (applybox_debug) {
198  tprintf("row xheight=%g, but median xheight = %g\n",
199  row->x_height(), median_xheight);
200  }
201  row->set_x_height(static_cast<float>(median_xheight));
202  }
203  }
204  }
205 }
206 
207 #ifndef DISABLED_LEGACY_ENGINE
208 
212  BLOCK_LIST *block_list) {
213  PreenXHeights(block_list);
214  // Strip all fuzzy space markers to simplify the PAGE_RES.
215  BLOCK_IT b_it(block_list);
216  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
217  BLOCK* block = b_it.data();
218  ROW_IT r_it(block->row_list());
219  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
220  ROW* row = r_it.data();
221  WERD_IT w_it(row->word_list());
222  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
223  WERD* word = w_it.data();
224  if (word->cblob_list()->empty()) {
225  delete w_it.extract();
226  } else {
227  word->set_flag(W_FUZZY_SP, false);
228  word->set_flag(W_FUZZY_NON, false);
229  }
230  }
231  }
232  }
233  PAGE_RES* page_res = new PAGE_RES(false, block_list, nullptr);
234  PAGE_RES_IT pr_it(page_res);
235  WERD_RES* word_res;
236  while ((word_res = pr_it.word()) != nullptr) {
237  MaximallyChopWord(boxes, pr_it.block()->block,
238  pr_it.row()->row, word_res);
239  pr_it.forward();
240  }
241  return page_res;
242 }
243 
248  BLOCK* block, ROW* row,
249  WERD_RES* word_res) {
250  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
251  tessedit_ocr_engine_mode, nullptr,
255  row, block)) {
256  word_res->CloneChoppedToRebuild();
257  return;
258  }
259  if (chop_debug) {
260  tprintf("Maximally chopping word at:");
261  word_res->word->bounding_box().print();
262  }
263  GenericVector<BLOB_CHOICE*> blob_choices;
264  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
265  float rating = static_cast<float>(INT8_MAX);
266  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
267  // The rating and certainty are not quite arbitrary. Since
268  // select_blob_to_chop uses the worst certainty to choose, they all have
269  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
270  // in here, and then divide by e each time they are chopped, which
271  // should guarantee a set of unequal values for the whole tree of blobs
272  // produced, however much chopping is required. The chops are thus only
273  // limited by the ability of the chopper to find suitable chop points,
274  // and not by the value of the certainties.
275  BLOB_CHOICE* choice =
276  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
277  blob_choices.push_back(choice);
278  rating -= 0.125f;
279  }
280  const double e = exp(1.0); // The base of natural logs.
281  int blob_number;
282  int right_chop_index = 0;
284  // We only chop if the language is not fixed pitch like CJK.
285  SEAM* seam = nullptr;
286  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
287  &blob_number)) != nullptr) {
288  word_res->InsertSeam(blob_number, seam);
289  BLOB_CHOICE* left_choice = blob_choices[blob_number];
290  rating = left_choice->rating() / e;
291  left_choice->set_rating(rating);
292  left_choice->set_certainty(-rating);
293  // combine confidence w/ serial #
294  BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
295  rating - 0.125f, -rating, -1,
296  0.0f, 0.0f, 0.0f, BCC_FAKE);
297  blob_choices.insert(right_choice, blob_number + 1);
298  }
299  }
300  word_res->CloneChoppedToRebuild();
301  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
302 }
303 
304 #endif // ndef DISABLED_LEGACY_ENGINE
305 
317 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
318  const int overlap_area = box1.intersection(box2).area();
319  const int a = box1.area();
320  const int b = box2.area();
321  ASSERT_HOST(a != 0 && b != 0);
322  return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
323 }
324 
325 #ifndef DISABLED_LEGACY_ENGINE
326 
337 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
338  const TBOX& box, const TBOX* next_box,
339  const char* correct_text) {
340  if (applybox_debug > 1) {
341  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
342  }
343  PAGE_RES_IT page_res_it(page_res);
344  WERD_RES* word_res;
345  for (word_res = page_res_it.word(); word_res != nullptr;
346  word_res = page_res_it.forward()) {
347  if (!word_res->box_word->bounding_box().major_overlap(box))
348  continue;
349  if (applybox_debug > 1) {
350  tprintf("Checking word box:");
351  word_res->box_word->bounding_box().print();
352  }
353  int word_len = word_res->box_word->length();
354  for (int i = 0; i < word_len; ++i) {
355  TBOX char_box = TBOX();
356  int blob_count = 0;
357  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
358  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
359  if (!blob_box.major_overlap(box))
360  break;
361  if (word_res->correct_text[i + blob_count].length() > 0)
362  break; // Blob is claimed already.
363  if (next_box != nullptr) {
364  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
365  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
366  if (applybox_debug > 2) {
367  tprintf("Checking blob:");
368  blob_box.print();
369  tprintf("Current miss metric = %g, next = %g\n",
370  current_box_miss_metric, next_box_miss_metric);
371  }
372  if (current_box_miss_metric > next_box_miss_metric)
373  break; // Blob is a better match for next box.
374  }
375  char_box += blob_box;
376  }
377  if (blob_count > 0) {
378  if (applybox_debug > 1) {
379  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
380  }
381  if (!char_box.almost_equal(box, 3) &&
382  ((next_box != nullptr && box.x_gap(*next_box) < -3)||
383  (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
384  return false;
385  }
386  // We refine just the box_word, best_state and correct_text here.
387  // The rebuild_word is made in TidyUp.
388  // blob_count blobs are put together to match the box. Merge the
389  // box_word boxes, save the blob_count in the state and the text.
390  word_res->box_word->MergeBoxes(i, i + blob_count);
391  word_res->best_state[i] = blob_count;
392  word_res->correct_text[i] = correct_text;
393  if (applybox_debug > 2) {
394  tprintf("%d Blobs match: blob box:", blob_count);
395  word_res->box_word->BlobBox(i).print();
396  tprintf("Matches box:");
397  box.print();
398  if (next_box != nullptr) {
399  tprintf("With next box:");
400  next_box->print();
401  }
402  }
403  // Eliminated best_state and correct_text entries for the consumed
404  // blobs.
405  for (int j = 1; j < blob_count; ++j) {
406  word_res->best_state.remove(i + 1);
407  word_res->correct_text.remove(i + 1);
408  }
409  // Assume that no box spans multiple source words, so we are done with
410  // this box.
411  if (applybox_debug > 1) {
412  tprintf("Best state = ");
413  for (int j = 0; j < word_res->best_state.size(); ++j) {
414  tprintf("%d ", word_res->best_state[j]);
415  }
416  tprintf("\n");
417  tprintf("Correct text = [[ ");
418  for (int j = 0; j < word_res->correct_text.size(); ++j) {
419  tprintf("%s ", word_res->correct_text[j].string());
420  }
421  tprintf("]]\n");
422  }
423  return true;
424  }
425  }
426  }
427  if (applybox_debug > 0) {
428  tprintf("FAIL!\n");
429  }
430  return false; // Failure.
431 }
432 
439 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
440  const TBOX& box, const TBOX* next_box,
441  const char* correct_text) {
442  if (applybox_debug > 1) {
443  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
444  }
445  WERD* new_word = nullptr;
446  BLOCK_IT b_it(block_list);
447  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
448  BLOCK* block = b_it.data();
449  if (!box.major_overlap(block->pdblk.bounding_box()))
450  continue;
451  ROW_IT r_it(block->row_list());
452  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
453  ROW* row = r_it.data();
454  if (!box.major_overlap(row->bounding_box()))
455  continue;
456  WERD_IT w_it(row->word_list());
457  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
458  WERD* word = w_it.data();
459  if (applybox_debug > 2) {
460  tprintf("Checking word:");
461  word->bounding_box().print();
462  }
463  if (word->text() != nullptr && word->text()[0] != '\0')
464  continue; // Ignore words that are already done.
465  if (!box.major_overlap(word->bounding_box()))
466  continue;
467  C_BLOB_IT blob_it(word->cblob_list());
468  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
469  blob_it.forward()) {
470  C_BLOB* blob = blob_it.data();
471  TBOX blob_box = blob->bounding_box();
472  if (!blob_box.major_overlap(box))
473  continue;
474  if (next_box != nullptr) {
475  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
476  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
477  if (applybox_debug > 2) {
478  tprintf("Checking blob:");
479  blob_box.print();
480  tprintf("Current miss metric = %g, next = %g\n",
481  current_box_miss_metric, next_box_miss_metric);
482  }
483  if (current_box_miss_metric > next_box_miss_metric)
484  continue; // Blob is a better match for next box.
485  }
486  if (applybox_debug > 2) {
487  tprintf("Blob match: blob:");
488  blob_box.print();
489  tprintf("Matches box:");
490  box.print();
491  if (next_box != nullptr) {
492  tprintf("With next box:");
493  next_box->print();
494  }
495  }
496  if (new_word == nullptr) {
497  // Make a new word with a single blob.
498  new_word = word->shallow_copy();
499  new_word->set_text(correct_text);
500  w_it.add_to_end(new_word);
501  }
502  C_BLOB_IT new_blob_it(new_word->cblob_list());
503  new_blob_it.add_to_end(blob_it.extract());
504  }
505  }
506  }
507  }
508  if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
509  return new_word != nullptr;
510 }
511 
515  PAGE_RES_IT pr_it(page_res);
516  WERD_RES* word_res;
517  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
518  const WERD* word = word_res->word;
519  if (word->text() == nullptr || word->text()[0] == '\0')
520  continue; // Ignore words that have no text.
521  // Convert the correct text to a vector of UNICHAR_ID
522  GenericVector<UNICHAR_ID> target_text;
523  if (!ConvertStringToUnichars(word->text(), &target_text)) {
524  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
525  word->text());
526  pr_it.DeleteCurrentWord();
527  continue;
528  }
529  if (!FindSegmentation(target_text, word_res)) {
530  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
531  word->text());
532  pr_it.DeleteCurrentWord();
533  continue;
534  }
535  }
536 }
537 
538 #endif // ndef DISABLED_LEGACY_ENGINE
539 
542 bool Tesseract::ConvertStringToUnichars(const char* utf8,
543  GenericVector<UNICHAR_ID>* class_ids) {
544  for (int step = 0; *utf8 != '\0'; utf8 += step) {
545  const char* next_space = strchr(utf8, ' ');
546  if (next_space == nullptr)
547  next_space = utf8 + strlen(utf8);
548  step = next_space - utf8;
549  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
550  if (class_id == INVALID_UNICHAR_ID) {
551  return false;
552  }
553  while (utf8[step] == ' ')
554  ++step;
555  class_ids->push_back(class_id);
556  }
557  return true;
558 }
559 
560 #ifndef DISABLED_LEGACY_ENGINE
561 
562 
570  WERD_RES* word_res) {
571  // Classify all required combinations of blobs and save results in choices.
572  const int word_length = word_res->box_word->length();
574  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
575  for (int i = 0; i < word_length; ++i) {
576  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
577  BLOB_CHOICE_LIST* match_result = classify_piece(
578  word_res->seam_array, i, i + j - 1, "Applybox",
579  word_res->chopped_word, word_res->blamer_bundle);
580  if (applybox_debug > 2) {
581  tprintf("%d+%d:", i, j);
582  print_ratings_list("Segment:", match_result, unicharset);
583  }
584  choices[i].push_back(match_result);
585  }
586  }
587  // Search the segmentation graph for the target text. Must be an exact
588  // match. Using wildcards makes it difficult to find the correct
589  // segmentation even when it is there.
590  word_res->best_state.clear();
591  GenericVector<int> search_segmentation;
592  float best_rating = 0.0f;
593  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
594  &search_segmentation, &best_rating, &word_res->best_state);
595  for (int i = 0; i < word_length; ++i)
596  choices[i].delete_data_pointers();
597  delete [] choices;
598  if (word_res->best_state.empty()) {
599  // Build the original segmentation and if it is the same length as the
600  // truth, assume it will do.
601  int blob_count = 1;
602  for (int s = 0; s < word_res->seam_array.size(); ++s) {
603  SEAM* seam = word_res->seam_array[s];
604  if (!seam->HasAnySplits()) {
605  word_res->best_state.push_back(blob_count);
606  blob_count = 1;
607  } else {
608  ++blob_count;
609  }
610  }
611  word_res->best_state.push_back(blob_count);
612  if (word_res->best_state.size() != target_text.size()) {
613  word_res->best_state.clear(); // No good. Original segmentation bad size.
614  return false;
615  }
616  }
617  word_res->correct_text.clear();
618  for (int i = 0; i < target_text.size(); ++i) {
619  word_res->correct_text.push_back(
620  STRING(unicharset.id_to_unichar(target_text[i])));
621  }
622  return true;
623 }
624 
640  int choices_pos, int choices_length,
641  const GenericVector<UNICHAR_ID>& target_text,
642  int text_index,
643  float rating, GenericVector<int>* segmentation,
644  float* best_rating,
645  GenericVector<int>* best_segmentation) {
647  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
648  // Rating of matching choice or worst choice if no match.
649  float choice_rating = 0.0f;
650  // Find the corresponding best BLOB_CHOICE.
651  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
652  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
653  choice_it.forward()) {
654  const BLOB_CHOICE* choice = choice_it.data();
655  choice_rating = choice->rating();
656  UNICHAR_ID class_id = choice->unichar_id();
657  if (class_id == target_text[text_index]) {
658  break;
659  }
660  // Search ambigs table.
661  if (class_id < table.size() && table[class_id] != nullptr) {
662  AmbigSpec_IT spec_it(table[class_id]);
663  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
664  spec_it.forward()) {
665  const AmbigSpec *ambig_spec = spec_it.data();
666  // We'll only do 1-1.
667  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
668  ambig_spec->correct_ngram_id == target_text[text_index])
669  break;
670  }
671  if (!spec_it.cycled_list())
672  break; // Found an ambig.
673  }
674  }
675  if (choice_it.cycled_list())
676  continue; // No match.
677  segmentation->push_back(length);
678  if (choices_pos + length == choices_length &&
679  text_index + 1 == target_text.size()) {
680  // This is a complete match. If the rating is good record a new best.
681  if (applybox_debug > 2) {
682  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
683  rating + choice_rating, *best_rating, segmentation->size(),
684  best_segmentation->size());
685  }
686  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
687  *best_segmentation = *segmentation;
688  *best_rating = rating + choice_rating;
689  }
690  } else if (choices_pos + length < choices_length &&
691  text_index + 1 < target_text.size()) {
692  if (applybox_debug > 3) {
693  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
694  target_text[text_index],
695  unicharset.id_to_unichar(target_text[text_index]),
696  choice_it.data()->unichar_id() == target_text[text_index]
697  ? "Match" : "Ambig",
698  choices_pos, length);
699  }
700  SearchForText(choices, choices_pos + length, choices_length, target_text,
701  text_index + 1, rating + choice_rating, segmentation,
702  best_rating, best_segmentation);
703  if (applybox_debug > 3) {
704  tprintf("End recursion for %d=%s\n", target_text[text_index],
705  unicharset.id_to_unichar(target_text[text_index]));
706  }
707  }
708  segmentation->truncate(segmentation->size() - 1);
709  }
710 }
711 
716 void Tesseract::TidyUp(PAGE_RES* page_res) {
717  int ok_blob_count = 0;
718  int bad_blob_count = 0;
719  int ok_word_count = 0;
720  int unlabelled_words = 0;
721  PAGE_RES_IT pr_it(page_res);
722  WERD_RES* word_res;
723  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
724  int ok_in_word = 0;
725  int blob_count = word_res->correct_text.size();
726  WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
727  word_choice->set_permuter(TOP_CHOICE_PERM);
728  for (int c = 0; c < blob_count; ++c) {
729  if (word_res->correct_text[c].length() > 0) {
730  ++ok_in_word;
731  }
732  // Since we only need a fake word_res->best_choice, the actual
733  // unichar_ids do not matter. Which is fortunate, since TidyUp()
734  // can be called while training Tesseract, at the stage where
735  // unicharset is not meaningful yet.
737  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
738  }
739  if (ok_in_word > 0) {
740  ok_blob_count += ok_in_word;
741  bad_blob_count += word_res->correct_text.size() - ok_in_word;
742  word_res->LogNewRawChoice(word_choice);
743  word_res->LogNewCookedChoice(1, false, word_choice);
744  } else {
745  ++unlabelled_words;
746  if (applybox_debug > 0) {
747  tprintf("APPLY_BOXES: Unlabelled word at :");
748  word_res->word->bounding_box().print();
749  }
750  pr_it.DeleteCurrentWord();
751  delete word_choice;
752  }
753  }
754  pr_it.restart_page();
755  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
756  // Denormalize back to a BoxWord.
757  word_res->RebuildBestState();
758  word_res->SetupBoxWord();
759  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
760  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
761  }
762  if (applybox_debug > 0) {
763  tprintf(" Found %d good blobs.\n", ok_blob_count);
764  if (bad_blob_count > 0) {
765  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
766  bad_blob_count, ok_word_count);
767  }
768  if (unlabelled_words > 0)
769  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
770  }
771 }
772 
773 #endif // ndef DISABLED_LEGACY_ENGINE
774 
776 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
777  const char *box_ch, const char *err_msg) {
778  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
779  boxfile_lineno + 1, box_ch,
780  box.left(), box.bottom(), box.right(), box.top(), err_msg);
781 }
782 
785  PAGE_RES_IT pr_it(page_res);
786  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
787  word_res = pr_it.forward()) {
788  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
789  word_res->correct_text.size());
790  for (int i = 0; i < word_res->correct_text.size(); ++i) {
791  // The part before the first space is the real ground truth, and the
792  // rest is the bounding box location and page number.
793  GenericVector<STRING> tokens;
794  word_res->correct_text[i].split(' ', &tokens);
795  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
796  choice->append_unichar_id_space_allocated(char_id,
797  word_res->best_state[i],
798  0.0f, 0.0f);
799  }
800  word_res->ClearWordChoices();
801  word_res->LogNewRawChoice(choice);
802  word_res->LogNewCookedChoice(1, false, choice);
803  }
804 }
805 
806 #ifndef DISABLED_LEGACY_ENGINE
807 
808 
811 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
812  PAGE_RES_IT pr_it(page_res);
813  int word_count = 0;
814  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
815  word_res = pr_it.forward()) {
816  LearnWord(fontname.string(), word_res);
817  ++word_count;
818  }
819  tprintf("Generated training data for %d words\n", word_count);
820 }
821 
822 #endif // ndef DISABLED_LEGACY_ENGINE
823 
824 } // namespace tesseract
BLOCK_RES * block() const
Definition: pageres.h:757
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void ReSegmentByClassification(PAGE_RES *page_res)
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
ROW_RES * row() const
Definition: pageres.h:754
Dict & getDict() override
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
void print() const
Definition: rect.h:278
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: werd.cpp:159
bool classify_bln_numeric_mode
Definition: classify.h:541
Definition: seam.h:44
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:424
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
void remove(int index)
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:118
Definition: rect.h:34
WERD * shallow_copy()
Definition: werd.cpp:351
int NumBlobs() const
Definition: blobs.h:432
WERD_LIST * word_list()
Definition: ocrrow.h:55
Definition: werd.h:35
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:52
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
int x_gap(const TBOX &box) const
Definition: rect.h:225
void set_text(const char *new_text)
Definition: werd.h:124
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
Definition: statistc.h:33
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
Pix * BestPix() const
ROW_RES * next_row() const
Definition: pageres.h:763
GenericVector< STRING > correct_text
Definition: pageres.h:275
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
void SetupBoxWord()
Definition: pageres.cpp:855
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
int16_t left() const
Definition: rect.h:72
void set_certainty(float newrat)
Definition: ratngs.h:151
void insert(const T &t, int index)
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:38
int16_t top() const
Definition: rect.h:58
const int kMaxGroupSize
Definition: applybox.cpp:35
ROW_RES * prev_row() const
Definition: pageres.h:745
void set_x_height(float new_xheight)
Definition: ocrrow.h:67
float x_height() const
Definition: ocrrow.h:64
const char * text() const
Definition: werd.h:123
void set_rating(float newrat)
Definition: ratngs.h:148
UNICHARSET unicharset
Definition: ccutil.h:68
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:883
const TBOX & bounding_box() const
Definition: boxword.h:80
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:610
int IntCastRounded(double x)
Definition: helpers.h:168
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:152
int length() const
Definition: genericvector.h:85
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
WERD_RES * word() const
Definition: pageres.h:751
GenericVector< int > best_state
Definition: pageres.h:271
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
void CloneChoppedToRebuild()
Definition: pageres.cpp:841
Definition: werd.h:59
TBOX bounding_box() const
Definition: ocrrow.h:88
Definition: ocrrow.h:36
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
Definition: ocrblock.h:30
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
int32_t area() const
Definition: rect.h:122
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
void TidyUp(PAGE_RES *page_res)
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
int push_back(T object)
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
float rating() const
Definition: ratngs.h:80
Definition: strngs.h:45
TBOX bounding_box() const
Definition: stepblob.cpp:255
void DeleteCurrentWord()
Definition: pageres.cpp:1450
const UNICHARSET * uch_set
Definition: pageres.h:206
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
int length() const
Definition: boxword.h:83
BlamerBundle * blamer_bundle
Definition: pageres.h:246
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
void ClearWordChoices()
Definition: pageres.cpp:1178
int16_t right() const
Definition: rect.h:79
void truncate(int size)
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
void RebuildBestState()
Definition: pageres.cpp:814
WERD_RES * forward()
Definition: pageres.h:731
void CorrectClassifyWords(PAGE_RES *page_res)
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
TWERD * chopped_word
Definition: pageres.h:215
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
bool HasAnySplits() const
Definition: seam.h:67
int16_t bottom() const
Definition: rect.h:65
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:379
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
PDBLK pdblk
Definition: ocrblock.h:192
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:626
void PreenXHeights(BLOCK_LIST *block_list)
tesseract::BoxWord * box_word
Definition: pageres.h:266
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
ROW * row
Definition: pageres.h:143
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
WERD * word
Definition: pageres.h:189