tesseract  5.0.0-alpha-619-ge9db
applybox.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: applybox.cpp (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1993, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <cctype>
20 #include <cerrno>
21 #include <cstring>
22 #include "allheaders.h"
23 #include "boxread.h"
24 #include "pageres.h"
25 #include <tesseract/unichar.h>
26 #include "unicharset.h"
27 #include "tesseractclass.h"
29 
31 const int kMaxGroupSize = 4;
34 const double kMaxXHeightDeviationFraction = 0.125;
35 
71 namespace tesseract {
72 
73 #ifndef DISABLED_LEGACY_ENGINE
74 static void clear_any_old_text(BLOCK_LIST *block_list) {
75  BLOCK_IT block_it(block_list);
76  for (block_it.mark_cycle_pt();
77  !block_it.cycled_list(); block_it.forward()) {
78  ROW_IT row_it(block_it.data()->row_list());
79  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80  WERD_IT word_it(row_it.data()->word_list());
81  for (word_it.mark_cycle_pt();
82  !word_it.cycled_list(); word_it.forward()) {
83  word_it.data()->set_text("");
84  }
85  }
86  }
87 }
88 
89 // Applies the box file based on the image name fname, and resegments
90 // the words in the block_list (page), with:
91 // blob-mode: one blob per line in the box file, words as input.
92 // word/line-mode: one blob per space-delimited unit after the #, and one word
93 // per line in the box file. (See comment above for box file format.)
94 // If find_segmentation is true, (word/line mode) then the classifier is used
95 // to re-segment words/lines to match the space-delimited truth string for
96 // each box. In this case, the input box may be for a word or even a whole
97 // text line, and the output words will contain multiple blobs corresponding
98 // to the space-delimited input string.
99 // With find_segmentation false, no classifier is needed, but the chopper
100 // can still be used to correctly segment touching characters with the help
101 // of the input boxes.
102 // In the returned PAGE_RES, the WERD_RES are set up as they would be returned
103 // from normal classification, i.e. with a word, chopped_word, rebuild_word,
104 // seam_array, denorm, box_word, and best_state, but NO best_choice or
105 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
106 // Instead, the correct_text member of WERD_RES is set, and this may be later
107 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
108 // is not required before calling ApplyBoxTraining.
110  bool find_segmentation,
111  BLOCK_LIST *block_list) {
112  GenericVector<TBOX> boxes;
113  GenericVector<STRING> texts, full_texts;
114  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
115  nullptr)) {
116  return nullptr; // Can't do it.
117  }
118 
119  const int box_count = boxes.size();
120  int box_failures = 0;
121 
122  // In word mode, we use the boxes to make a word for each box, but
123  // in blob mode we use the existing words and maximally chop them first.
124  PAGE_RES* page_res = find_segmentation ?
125  nullptr : SetupApplyBoxes(boxes, block_list);
126  clear_any_old_text(block_list);
127 
128  for (int i = 0; i < box_count; i++) {
129  bool foundit = false;
130  if (page_res != nullptr) {
131  foundit = ResegmentCharBox(page_res,
132  (i == 0) ? nullptr : &boxes[i - 1],
133  boxes[i],
134  (i == box_count - 1) ? nullptr : &boxes[i + 1],
135  full_texts[i].c_str());
136  } else {
137  foundit = ResegmentWordBox(block_list, boxes[i],
138  (i == box_count - 1) ? nullptr : &boxes[i + 1],
139  texts[i].c_str());
140  }
141  if (!foundit) {
142  box_failures++;
143  ReportFailedBox(i, boxes[i], texts[i].c_str(),
144  "FAILURE! Couldn't find a matching blob");
145  }
146  }
147 
148  if (page_res == nullptr) {
149  // In word/line mode, we now maximally chop all the words and resegment
150  // them with the classifier.
151  page_res = SetupApplyBoxes(boxes, block_list);
152  ReSegmentByClassification(page_res);
153  }
154  if (applybox_debug > 0) {
155  tprintf("APPLY_BOXES:\n");
156  tprintf(" Boxes read from boxfile: %6d\n", box_count);
157  if (box_failures > 0)
158  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
159  }
160  TidyUp(page_res);
161  return page_res;
162 }
163 #endif // ndef DISABLED_LEGACY_ENGINE
164 
165 // Helper computes median xheight in the image.
166 static double MedianXHeight(BLOCK_LIST *block_list) {
167  BLOCK_IT block_it(block_list);
168  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
169  for (block_it.mark_cycle_pt();
170  !block_it.cycled_list(); block_it.forward()) {
171  ROW_IT row_it(block_it.data()->row_list());
172  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
173  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
174  }
175  }
176  return xheights.median();
177 }
178 
181 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
182  const double median_xheight = MedianXHeight(block_list);
183  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
184  // Strip all fuzzy space markers to simplify the PAGE_RES.
185  BLOCK_IT b_it(block_list);
186  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187  BLOCK* block = b_it.data();
188  ROW_IT r_it(block->row_list());
189  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190  ROW* row = r_it.data();
191  const double diff = fabs(row->x_height() - median_xheight);
192  if (diff > max_deviation) {
193  if (applybox_debug) {
194  tprintf("row xheight=%g, but median xheight = %g\n",
195  row->x_height(), median_xheight);
196  }
197  row->set_x_height(static_cast<float>(median_xheight));
198  }
199  }
200  }
201 }
202 
203 #ifndef DISABLED_LEGACY_ENGINE
204 
208  BLOCK_LIST *block_list) {
209  PreenXHeights(block_list);
210  // Strip all fuzzy space markers to simplify the PAGE_RES.
211  BLOCK_IT b_it(block_list);
212  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213  BLOCK* block = b_it.data();
214  ROW_IT r_it(block->row_list());
215  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216  ROW* row = r_it.data();
217  WERD_IT w_it(row->word_list());
218  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219  WERD* word = w_it.data();
220  if (word->cblob_list()->empty()) {
221  delete w_it.extract();
222  } else {
223  word->set_flag(W_FUZZY_SP, false);
224  word->set_flag(W_FUZZY_NON, false);
225  }
226  }
227  }
228  }
229  auto* page_res = new PAGE_RES(false, block_list, nullptr);
230  PAGE_RES_IT pr_it(page_res);
231  WERD_RES* word_res;
232  while ((word_res = pr_it.word()) != nullptr) {
233  MaximallyChopWord(boxes, pr_it.block()->block,
234  pr_it.row()->row, word_res);
235  pr_it.forward();
236  }
237  return page_res;
238 }
239 
244  BLOCK* block, ROW* row,
245  WERD_RES* word_res) {
246  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
247  tessedit_ocr_engine_mode, nullptr,
251  row, block)) {
252  word_res->CloneChoppedToRebuild();
253  return;
254  }
255  if (chop_debug) {
256  tprintf("Maximally chopping word at:");
257  word_res->word->bounding_box().print();
258  }
259  GenericVector<BLOB_CHOICE*> blob_choices;
260  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
261  auto rating = static_cast<float>(INT8_MAX);
262  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
263  // The rating and certainty are not quite arbitrary. Since
264  // select_blob_to_chop uses the worst certainty to choose, they all have
265  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
266  // in here, and then divide by e each time they are chopped, which
267  // should guarantee a set of unequal values for the whole tree of blobs
268  // produced, however much chopping is required. The chops are thus only
269  // limited by the ability of the chopper to find suitable chop points,
270  // and not by the value of the certainties.
271  auto* choice =
272  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
273  blob_choices.push_back(choice);
274  rating -= 0.125f;
275  }
276  const double e = exp(1.0); // The base of natural logs.
277  int blob_number;
278  int right_chop_index = 0;
280  // We only chop if the language is not fixed pitch like CJK.
281  SEAM* seam = nullptr;
282  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
283  &blob_number)) != nullptr) {
284  word_res->InsertSeam(blob_number, seam);
285  BLOB_CHOICE* left_choice = blob_choices[blob_number];
286  rating = left_choice->rating() / e;
287  left_choice->set_rating(rating);
288  left_choice->set_certainty(-rating);
289  // combine confidence w/ serial #
290  auto* right_choice = new BLOB_CHOICE(++right_chop_index,
291  rating - 0.125f, -rating, -1,
292  0.0f, 0.0f, 0.0f, BCC_FAKE);
293  blob_choices.insert(right_choice, blob_number + 1);
294  }
295  }
296  word_res->CloneChoppedToRebuild();
297  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
298 }
299 
311 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
312  const int overlap_area = box1.intersection(box2).area();
313  const int a = box1.area();
314  const int b = box2.area();
315  ASSERT_HOST(a != 0 && b != 0);
316  return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
317 }
318 
329 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
330  const TBOX& box, const TBOX* next_box,
331  const char* correct_text) {
332  if (applybox_debug > 1) {
333  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
334  }
335  PAGE_RES_IT page_res_it(page_res);
336  WERD_RES* word_res;
337  for (word_res = page_res_it.word(); word_res != nullptr;
338  word_res = page_res_it.forward()) {
339  if (!word_res->box_word->bounding_box().major_overlap(box))
340  continue;
341  if (applybox_debug > 1) {
342  tprintf("Checking word box:");
343  word_res->box_word->bounding_box().print();
344  }
345  int word_len = word_res->box_word->length();
346  for (int i = 0; i < word_len; ++i) {
347  TBOX char_box = TBOX();
348  int blob_count = 0;
349  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
350  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
351  if (!blob_box.major_overlap(box))
352  break;
353  if (word_res->correct_text[i + blob_count].length() > 0)
354  break; // Blob is claimed already.
355  if (next_box != nullptr) {
356  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
357  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
358  if (applybox_debug > 2) {
359  tprintf("Checking blob:");
360  blob_box.print();
361  tprintf("Current miss metric = %g, next = %g\n",
362  current_box_miss_metric, next_box_miss_metric);
363  }
364  if (current_box_miss_metric > next_box_miss_metric)
365  break; // Blob is a better match for next box.
366  }
367  char_box += blob_box;
368  }
369  if (blob_count > 0) {
370  if (applybox_debug > 1) {
371  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
372  }
373  if (!char_box.almost_equal(box, 3) &&
374  ((next_box != nullptr && box.x_gap(*next_box) < -3)||
375  (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
376  return false;
377  }
378  // We refine just the box_word, best_state and correct_text here.
379  // The rebuild_word is made in TidyUp.
380  // blob_count blobs are put together to match the box. Merge the
381  // box_word boxes, save the blob_count in the state and the text.
382  word_res->box_word->MergeBoxes(i, i + blob_count);
383  word_res->best_state[i] = blob_count;
384  word_res->correct_text[i] = correct_text;
385  if (applybox_debug > 2) {
386  tprintf("%d Blobs match: blob box:", blob_count);
387  word_res->box_word->BlobBox(i).print();
388  tprintf("Matches box:");
389  box.print();
390  if (next_box != nullptr) {
391  tprintf("With next box:");
392  next_box->print();
393  }
394  }
395  // Eliminated best_state and correct_text entries for the consumed
396  // blobs.
397  for (int j = 1; j < blob_count; ++j) {
398  word_res->best_state.remove(i + 1);
399  word_res->correct_text.remove(i + 1);
400  }
401  // Assume that no box spans multiple source words, so we are done with
402  // this box.
403  if (applybox_debug > 1) {
404  tprintf("Best state = ");
405  for (int j = 0; j < word_res->best_state.size(); ++j) {
406  tprintf("%d ", word_res->best_state[j]);
407  }
408  tprintf("\n");
409  tprintf("Correct text = [[ ");
410  for (int j = 0; j < word_res->correct_text.size(); ++j) {
411  tprintf("%s ", word_res->correct_text[j].c_str());
412  }
413  tprintf("]]\n");
414  }
415  return true;
416  }
417  }
418  }
419  if (applybox_debug > 0) {
420  tprintf("FAIL!\n");
421  }
422  return false; // Failure.
423 }
424 
431 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
432  const TBOX& box, const TBOX* next_box,
433  const char* correct_text) {
434  if (applybox_debug > 1) {
435  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
436  }
437  WERD* new_word = nullptr;
438  BLOCK_IT b_it(block_list);
439  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
440  BLOCK* block = b_it.data();
441  if (!box.major_overlap(block->pdblk.bounding_box()))
442  continue;
443  ROW_IT r_it(block->row_list());
444  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
445  ROW* row = r_it.data();
446  if (!box.major_overlap(row->bounding_box()))
447  continue;
448  WERD_IT w_it(row->word_list());
449  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
450  WERD* word = w_it.data();
451  if (applybox_debug > 2) {
452  tprintf("Checking word:");
453  word->bounding_box().print();
454  }
455  if (word->text() != nullptr && word->text()[0] != '\0')
456  continue; // Ignore words that are already done.
457  if (!box.major_overlap(word->bounding_box()))
458  continue;
459  C_BLOB_IT blob_it(word->cblob_list());
460  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
461  blob_it.forward()) {
462  C_BLOB* blob = blob_it.data();
463  TBOX blob_box = blob->bounding_box();
464  if (!blob_box.major_overlap(box))
465  continue;
466  if (next_box != nullptr) {
467  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
468  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
469  if (applybox_debug > 2) {
470  tprintf("Checking blob:");
471  blob_box.print();
472  tprintf("Current miss metric = %g, next = %g\n",
473  current_box_miss_metric, next_box_miss_metric);
474  }
475  if (current_box_miss_metric > next_box_miss_metric)
476  continue; // Blob is a better match for next box.
477  }
478  if (applybox_debug > 2) {
479  tprintf("Blob match: blob:");
480  blob_box.print();
481  tprintf("Matches box:");
482  box.print();
483  if (next_box != nullptr) {
484  tprintf("With next box:");
485  next_box->print();
486  }
487  }
488  if (new_word == nullptr) {
489  // Make a new word with a single blob.
490  new_word = word->shallow_copy();
491  new_word->set_text(correct_text);
492  w_it.add_to_end(new_word);
493  }
494  C_BLOB_IT new_blob_it(new_word->cblob_list());
495  new_blob_it.add_to_end(blob_it.extract());
496  }
497  }
498  }
499  }
500  if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
501  return new_word != nullptr;
502 }
503 
507  PAGE_RES_IT pr_it(page_res);
508  WERD_RES* word_res;
509  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
510  const WERD* word = word_res->word;
511  if (word->text() == nullptr || word->text()[0] == '\0')
512  continue; // Ignore words that have no text.
513  // Convert the correct text to a vector of UNICHAR_ID
514  GenericVector<UNICHAR_ID> target_text;
515  if (!ConvertStringToUnichars(word->text(), &target_text)) {
516  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
517  word->text());
518  pr_it.DeleteCurrentWord();
519  continue;
520  }
521  if (!FindSegmentation(target_text, word_res)) {
522  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
523  word->text());
524  pr_it.DeleteCurrentWord();
525  continue;
526  }
527  }
528 }
529 
530 #endif // ndef DISABLED_LEGACY_ENGINE
531 
534 bool Tesseract::ConvertStringToUnichars(const char* utf8,
535  GenericVector<UNICHAR_ID>* class_ids) {
536  for (int step = 0; *utf8 != '\0'; utf8 += step) {
537  const char* next_space = strchr(utf8, ' ');
538  if (next_space == nullptr)
539  next_space = utf8 + strlen(utf8);
540  step = next_space - utf8;
541  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
542  if (class_id == INVALID_UNICHAR_ID) {
543  return false;
544  }
545  while (utf8[step] == ' ')
546  ++step;
547  class_ids->push_back(class_id);
548  }
549  return true;
550 }
551 
552 #ifndef DISABLED_LEGACY_ENGINE
553 
554 
562  WERD_RES* word_res) {
563  // Classify all required combinations of blobs and save results in choices.
564  const int word_length = word_res->box_word->length();
565  auto* choices =
566  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
567  for (int i = 0; i < word_length; ++i) {
568  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
569  BLOB_CHOICE_LIST* match_result = classify_piece(
570  word_res->seam_array, i, i + j - 1, "Applybox",
571  word_res->chopped_word, word_res->blamer_bundle);
572  if (applybox_debug > 2) {
573  tprintf("%d+%d:", i, j);
574  print_ratings_list("Segment:", match_result, unicharset);
575  }
576  choices[i].push_back(match_result);
577  }
578  }
579  // Search the segmentation graph for the target text. Must be an exact
580  // match. Using wildcards makes it difficult to find the correct
581  // segmentation even when it is there.
582  word_res->best_state.clear();
583  GenericVector<int> search_segmentation;
584  float best_rating = 0.0f;
585  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
586  &search_segmentation, &best_rating, &word_res->best_state);
587  for (int i = 0; i < word_length; ++i)
588  choices[i].delete_data_pointers();
589  delete [] choices;
590  if (word_res->best_state.empty()) {
591  // Build the original segmentation and if it is the same length as the
592  // truth, assume it will do.
593  int blob_count = 1;
594  for (int s = 0; s < word_res->seam_array.size(); ++s) {
595  SEAM* seam = word_res->seam_array[s];
596  if (!seam->HasAnySplits()) {
597  word_res->best_state.push_back(blob_count);
598  blob_count = 1;
599  } else {
600  ++blob_count;
601  }
602  }
603  word_res->best_state.push_back(blob_count);
604  if (word_res->best_state.size() != target_text.size()) {
605  word_res->best_state.clear(); // No good. Original segmentation bad size.
606  return false;
607  }
608  }
609  word_res->correct_text.clear();
610  for (int i = 0; i < target_text.size(); ++i) {
611  word_res->correct_text.push_back(
612  STRING(unicharset.id_to_unichar(target_text[i])));
613  }
614  return true;
615 }
616 
632  int choices_pos, int choices_length,
633  const GenericVector<UNICHAR_ID>& target_text,
634  int text_index,
635  float rating, GenericVector<int>* segmentation,
636  float* best_rating,
637  GenericVector<int>* best_segmentation) {
639  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
640  // Rating of matching choice or worst choice if no match.
641  float choice_rating = 0.0f;
642  // Find the corresponding best BLOB_CHOICE.
643  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
644  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
645  choice_it.forward()) {
646  const BLOB_CHOICE* choice = choice_it.data();
647  choice_rating = choice->rating();
648  UNICHAR_ID class_id = choice->unichar_id();
649  if (class_id == target_text[text_index]) {
650  break;
651  }
652  // Search ambigs table.
653  if (class_id < table.size() && table[class_id] != nullptr) {
654  AmbigSpec_IT spec_it(table[class_id]);
655  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
656  spec_it.forward()) {
657  const AmbigSpec *ambig_spec = spec_it.data();
658  // We'll only do 1-1.
659  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
660  ambig_spec->correct_ngram_id == target_text[text_index])
661  break;
662  }
663  if (!spec_it.cycled_list())
664  break; // Found an ambig.
665  }
666  }
667  if (choice_it.cycled_list())
668  continue; // No match.
669  segmentation->push_back(length);
670  if (choices_pos + length == choices_length &&
671  text_index + 1 == target_text.size()) {
672  // This is a complete match. If the rating is good record a new best.
673  if (applybox_debug > 2) {
674  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
675  rating + choice_rating, *best_rating, segmentation->size(),
676  best_segmentation->size());
677  }
678  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
679  *best_segmentation = *segmentation;
680  *best_rating = rating + choice_rating;
681  }
682  } else if (choices_pos + length < choices_length &&
683  text_index + 1 < target_text.size()) {
684  if (applybox_debug > 3) {
685  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
686  target_text[text_index],
687  unicharset.id_to_unichar(target_text[text_index]),
688  choice_it.data()->unichar_id() == target_text[text_index]
689  ? "Match" : "Ambig",
690  choices_pos, length);
691  }
692  SearchForText(choices, choices_pos + length, choices_length, target_text,
693  text_index + 1, rating + choice_rating, segmentation,
694  best_rating, best_segmentation);
695  if (applybox_debug > 3) {
696  tprintf("End recursion for %d=%s\n", target_text[text_index],
697  unicharset.id_to_unichar(target_text[text_index]));
698  }
699  }
700  segmentation->truncate(segmentation->size() - 1);
701  }
702 }
703 
708 void Tesseract::TidyUp(PAGE_RES* page_res) {
709  int ok_blob_count = 0;
710  int bad_blob_count = 0;
711  int ok_word_count = 0;
712  int unlabelled_words = 0;
713  PAGE_RES_IT pr_it(page_res);
714  WERD_RES* word_res;
715  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
716  int ok_in_word = 0;
717  int blob_count = word_res->correct_text.size();
718  auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
719  word_choice->set_permuter(TOP_CHOICE_PERM);
720  for (int c = 0; c < blob_count; ++c) {
721  if (word_res->correct_text[c].length() > 0) {
722  ++ok_in_word;
723  }
724  // Since we only need a fake word_res->best_choice, the actual
725  // unichar_ids do not matter. Which is fortunate, since TidyUp()
726  // can be called while training Tesseract, at the stage where
727  // unicharset is not meaningful yet.
728  word_choice->append_unichar_id_space_allocated(
729  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
730  }
731  if (ok_in_word > 0) {
732  ok_blob_count += ok_in_word;
733  bad_blob_count += word_res->correct_text.size() - ok_in_word;
734  word_res->LogNewRawChoice(word_choice);
735  word_res->LogNewCookedChoice(1, false, word_choice);
736  } else {
737  ++unlabelled_words;
738  if (applybox_debug > 0) {
739  tprintf("APPLY_BOXES: Unlabelled word at :");
740  word_res->word->bounding_box().print();
741  }
742  pr_it.DeleteCurrentWord();
743  delete word_choice;
744  }
745  }
746  pr_it.restart_page();
747  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
748  // Denormalize back to a BoxWord.
749  word_res->RebuildBestState();
750  word_res->SetupBoxWord();
751  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
752  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
753  }
754  if (applybox_debug > 0) {
755  tprintf(" Found %d good blobs.\n", ok_blob_count);
756  if (bad_blob_count > 0) {
757  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
758  bad_blob_count, ok_word_count);
759  }
760  if (unlabelled_words > 0)
761  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
762  }
763 }
764 
765 #endif // ndef DISABLED_LEGACY_ENGINE
766 
768 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
769  const char *box_ch, const char *err_msg) {
770  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
771  boxfile_lineno + 1, box_ch,
772  box.left(), box.bottom(), box.right(), box.top(), err_msg);
773 }
774 
777  PAGE_RES_IT pr_it(page_res);
778  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
779  word_res = pr_it.forward()) {
780  auto* choice = new WERD_CHOICE(word_res->uch_set,
781  word_res->correct_text.size());
782  for (int i = 0; i < word_res->correct_text.size(); ++i) {
783  // The part before the first space is the real ground truth, and the
784  // rest is the bounding box location and page number.
785  GenericVector<STRING> tokens;
786  word_res->correct_text[i].split(' ', &tokens);
787  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
788  choice->append_unichar_id_space_allocated(char_id,
789  word_res->best_state[i],
790  0.0f, 0.0f);
791  }
792  word_res->ClearWordChoices();
793  word_res->LogNewRawChoice(choice);
794  word_res->LogNewCookedChoice(1, false, choice);
795  }
796 }
797 
798 #ifndef DISABLED_LEGACY_ENGINE
799 
800 
803 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
804  PAGE_RES_IT pr_it(page_res);
805  int word_count = 0;
806  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
807  word_res = pr_it.forward()) {
808  LearnWord(fontname.c_str(), word_res);
809  ++word_count;
810  }
811  tprintf("Generated training data for %d words\n", word_count);
812 }
813 
814 #endif // ndef DISABLED_LEGACY_ENGINE
815 
816 } // namespace tesseract
TBOX
Definition: cleanapi_test.cc:19
WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:845
GenericVector::remove
void remove(int index)
Definition: genericvector.h:765
tesseract::Tesseract::poly_allow_detailed_fx
bool poly_allow_detailed_fx
Definition: tesseractclass.h:1061
C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:247
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
pageres.h
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:58
boxread.h
kMaxGroupSize
const int kMaxGroupSize
Definition: applybox.cpp:30
tesseract::Tesseract::textord_use_cjk_fp_model
bool textord_use_cjk_fp_model
Definition: tesseractclass.h:1059
BCC_FAKE
Definition: ratngs.h:46
WERD::shallow_copy
WERD * shallow_copy()
Definition: werd.cpp:333
WERD_CHOICE
Definition: ratngs.h:261
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:600
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
tesseract::Tesseract::applybox_debug
int applybox_debug
Definition: tesseractclass.h:823
GenericVector::insert
void insert(const T &t, int index)
Definition: genericvector.h:750
BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:115
tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
tesseract::Tesseract::SearchForText
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
tesseract::UnicharAmbigsVector
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:134
tesseract::Wordrec::chop_debug
int chop_debug
Definition: wordrec.h:204
TBOX::print
void print() const
Definition: rect.h:277
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::area
int32_t area() const
Definition: rect.h:121
STRING
Definition: strngs.h:45
tesseract::Tesseract::ReportFailedBox
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
WERD_RES
Definition: pageres.h:160
tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
tesseract::Tesseract::ConvertStringToUnichars
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
C_BLOB
Definition: stepblob.h:36
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:831
tesseract::Tesseract::PreenXHeights
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:180
SEAM
Definition: seam.h:36
tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:231
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:142
genericvector.h
tesseract::Tesseract::ApplyBoxTraining
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
tesseract::Tesseract::ResegmentWordBox
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
tesseract::Tesseract::TidyUp
void TidyUp(PAGE_RES *page_res)
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:59
tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:83
WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:298
WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:616
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:414
tesseract::Tesseract::ApplyBoxes
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:108
BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:145
tesseract::Wordrec::classify_piece
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:52
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
W_EOL
end of line
Definition: werd.h:47
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:564
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::applybox_page
int applybox_page
Definition: tesseractclass.h:824
tesseract::Wordrec::chop_one_blob
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:369
ROW::bounding_box
TBOX bounding_box() const
Definition: ocrrow.h:87
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
tesseract
Definition: baseapi.h:65
WERD::set_text
void set_text(const char *new_text)
Definition: werd.h:114
kMaxXHeightDeviationFraction
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:33
PAGE_RES
Definition: pageres.h:73
tesseract::Tesseract::MaximallyChopWord
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:242
STATS
Definition: statistc.h:30
TOP_CHOICE_PERM
Definition: ratngs.h:233
WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:208
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
ReadAllBoxes
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:71
GenericVector< TBOX >
PAGE_RES_IT
Definition: pageres.h:668
tesseract::Tesseract::SetupApplyBoxes
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:206
WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:804
WERD::text
const char * text() const
Definition: werd.h:113
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:362
WERD
Definition: werd.h:55
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
TBOX::left
int16_t left() const
Definition: rect.h:71
unichar.h
ROW
Definition: ocrrow.h:35
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
tesseract::Tesseract::ReSegmentByClassification
void ReSegmentByClassification(PAGE_RES *page_res)
WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:873
GenericVector::clear
void clear()
Definition: genericvector.h:857
print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:835
TBOX::right
int16_t right() const
Definition: rect.h:78
TBOX::almost_equal
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:250
tesseract::Wordrec::assume_fixed_pitch_char_segment
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:225
tesseract::Tesseract::CorrectClassifyWords
void CorrectClassifyWords(PAGE_RES *page_res)
WERD_RES::correct_text
GenericVector< STRING > correct_text
Definition: pageres.h:283
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_RES::ClearWordChoices
void ClearWordChoices()
Definition: pageres.cpp:1125
WERD_RES::word
WERD * word
Definition: pageres.h:180
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::BoxWord::MergeBoxes
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
tesseract::Classify::classify_bln_numeric_mode
bool classify_bln_numeric_mode
Definition: classify.h:508
tesseract::BoxWord::bounding_box
const TBOX & bounding_box() const
Definition: boxword.h:79
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Tesseract::tessedit_ocr_engine_mode
int tessedit_ocr_engine_mode
Definition: tesseractclass.h:802
ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:54
tesseract::Tesseract::ResegmentCharBox
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:328
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
W_BOL
start of line
Definition: werd.h:46
tesseract::Tesseract::FindSegmentation
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
TBOX::x_gap
int x_gap(const TBOX &box) const
Definition: rect.h:224
ROW::set_x_height
void set_x_height(float new_xheight)
Definition: ocrrow.h:66
TBOX
Definition: rect.h:33