tesseract  5.0.0-alpha-619-ge9db
control.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: control.cpp (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include <cmath>
25 #include <cstdint> // for int16_t, int32_t
26 #include <cstdio> // for fclose, fopen, FILE
27 #include <ctime> // for clock
28 #include <cctype>
29 #include "callcpp.h"
30 #include "control.h"
31 #ifndef DISABLED_LEGACY_ENGINE
32 #include "docqual.h"
33 #include "drawfx.h"
34 #include "fixspace.h"
35 #endif
36 #include "lstmrecognizer.h"
37 #include <tesseract/ocrclass.h>
38 #include "output.h"
39 #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
40 #ifndef DISABLED_LEGACY_ENGINE
41 #include "reject.h"
42 #endif
43 #include "sorthelper.h"
44 #include "tesseractclass.h"
45 #include "tessvars.h"
46 #include "werdit.h"
47 
48 const char* const kBackUpConfigFile = "tempconfigdata.config";
49 // Min believable x-height for any text when refitting as a fraction of
50 // original x-height
51 const double kMinRefitXHeightFraction = 0.5;
52 
53 
60 namespace tesseract {
61 
63  TBOX &selection_box) {
64  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
65  if (it != nullptr) {
67  it->DeleteCurrentWord();
68  delete it;
69  }
70 }
71 
78  int16_t char_qual;
79  int16_t good_char_qual;
80 
81  WordData word_data(*pr_it);
82  SetupWordPassN(2, &word_data);
83  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
84  if (lstm_recognizer_ == nullptr) {
85 #ifndef DISABLED_LEGACY_ENGINE
86  classify_word_and_language(2, pr_it, &word_data);
87 #endif // ndef DISABLED_LEGACY_ENGINE
88  } else {
89  classify_word_and_language(1, pr_it, &word_data);
90  }
91 #ifndef DISABLED_LEGACY_ENGINE
93  WERD_RES* word_res = pr_it->word();
94  word_char_quality(word_res, &char_qual, &good_char_qual);
95  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96  "char_quality: %d; good_char_quality: %d\n",
97  word_res->reject_map.length(),
98  word_blob_quality(word_res),
99  word_outline_errs(word_res), char_qual, good_char_qual);
100  }
101 #endif // ndef DISABLED_LEGACY_ENGINE
102  return true;
103 }
104 
105 // Helper function to check for a target word and handle it appropriately.
106 // Inspired by Jetsoft's requirement to process only single words on pass2
107 // and beyond.
108 // If word_config is not null:
109 // If the word_box and target_word_box overlap, read the word_config file
110 // else reset to previous config data.
111 // return true.
112 // else
113 // If the word_box and target_word_box overlap or pass <= 1, return true.
114 // Note that this function uses a fixed temporary file for storing the previous
115 // configs, so it is neither thread-safe, nor process-safe, but the assumption
116 // is that it will only be used for one debug window at a time.
117 //
118 // Since this function is used for debugging (and not to change OCR results)
119 // set only debug params from the word config file.
// Parameters:
//   word_box        - bounding box of the word currently being considered.
//   target_word_box - bounding box of the word selected for debugging.
//   word_config     - optional params file to apply to the target word; may be
//                     nullptr.
//   pass            - pass number; only consulted when word_config is nullptr.
// Returns false only when word_config is nullptr, pass > 1 and word_box has no
// major overlap with target_word_box; returns true in every other case.
120 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
121  const TBOX& target_word_box,
122  const char* word_config,
123  int pass) {
124  if (word_config != nullptr) {
125  if (word_box.major_overlap(target_word_box)) {
// backup_config_file_ == nullptr means no backup is currently outstanding, so
// this is the first overlapping word since the last restore: dump the current
// params to the fixed backup file before word_config is read.
126  if (backup_config_file_ == nullptr) {
127  backup_config_file_ = kBackUpConfigFile;
128  FILE* config_fp = fopen(backup_config_file_, "wb");
129  if (config_fp == nullptr) {
// NOTE(review): backup_config_file_ stays non-null after this failure, so the
// restore branch below will later try to read a file that was never written
// by this call -- confirm that ReadParamsFile copes with that.
130  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
131  } else {
132  ParamUtils::PrintParams(config_fp, params());
133  fclose(config_fp);
134  }
// word_config is applied only inside the "no backup outstanding" branch, i.e.
// once per run of overlapping words rather than on every overlapping word.
// NOTE(review): the middle argument of this call is not present in this
// listing (the line numbering jumps from 135 to 137).
135  ParamUtils::ReadParamsFile(word_config,
137  params());
138  }
139  } else {
// The word does not overlap the target: if a backup is outstanding, reload it
// and mark it as consumed.
140  if (backup_config_file_ != nullptr) {
// NOTE(review): the middle argument of this call is likewise not present in
// this listing (the line numbering jumps from 141 to 143).
141  ParamUtils::ReadParamsFile(backup_config_file_,
143  params());
144  backup_config_file_ = nullptr;
145  }
146  }
// Without a word_config, non-target words are rejected only when pass > 1.
147  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
148  return false;
149  }
150  return true;
151 }
152 
155  const TBOX* target_word_box,
156  const char* word_config,
157  PAGE_RES* page_res,
158  GenericVector<WordData>* words) {
159  // Prepare all the words.
160  PAGE_RES_IT page_res_it(page_res);
161  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
162  page_res_it.forward()) {
163  if (target_word_box == nullptr ||
164  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
165  *target_word_box, word_config, 1)) {
166  words->push_back(WordData(page_res_it));
167  }
168  }
169  // Setup all the words for recognition with polygonal approximation.
170  for (int w = 0; w < words->size(); ++w) {
171  SetupWordPassN(pass_n, &(*words)[w]);
172  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
173  }
174 }
175 
176 // Sets up the single word ready for whichever engine is to be run.
// Parameters:
//   pass_n - number of the pass being prepared; 1 and 2 get specific handling.
//   word   - per-word data; its word, row, block and lang_words members are
//            read and/or updated here.
177 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
// Pass 1 always performs the setup; any other pass does so only for words
// that are not yet flagged as done.
178  if (pass_n == 1 || !word->word->done) {
179  if (pass_n == 1) {
// NOTE(review): some arguments of this call are not present in this listing
// (the line numbering jumps from 181 to 185).
180  word->word->SetupForRecognition(unicharset, this, BestPix(),
181  tessedit_ocr_engine_mode, nullptr,
185  word->row, word->block);
186  } else if (pass_n == 2) {
187  // TODO(rays) Should we do this on pass1 too?
// Pass 2 clears the caps height and, if the word has no x-height yet, falls
// back to the x-height of its row.
188  word->word->caps_height = 0.0;
189  if (word->word->x_height == 0.0f)
190  word->word->x_height = word->row->x_height();
191  }
// Start from an empty per-language list before rebuilding it below.
// NOTE(review): presumably truncate(0) also releases the previously stored
// WERD_RES objects -- confirm against the container's definition.
192  word->lang_words.truncate(0);
// Note the inclusive bound: one entry per sub-language plus one final entry
// for this (master) instance.
193  for (int s = 0; s <= sub_langs_.size(); ++s) {
194  // The sub_langs_.size() entry is for the master language.
195  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
// Each language gets its own fresh WERD_RES initialised from the input word;
// the raw pointer is handed to lang_words.
196  auto* word_res = new WERD_RES;
197  word_res->InitForRetryRecognition(*word->word);
198  word->lang_words.push_back(word_res);
199  // LSTM doesn't get setup for pass2.
200  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
// NOTE(review): one argument line of this call is not present in this listing
// (the line numbering jumps from 203 to 205).
201  word_res->SetupForRecognition(
202  lang_t->unicharset, lang_t, BestPix(),
203  lang_t->tessedit_ocr_engine_mode, nullptr,
205  lang_t->textord_use_cjk_fp_model,
206  lang_t->poly_allow_detailed_fx, word->row, word->block);
207  }
208  }
209  }
210 }
211 
212 // Runs word recognition on all the words.
// Parameters:
//   pass_n  - number of the pass being run; forwarded to the per-word helpers
//             and used to scale the progress value.
//   monitor - optional progress/cancel/deadline interface; may be nullptr.
//   pr_it   - page iterator; restarted here and advanced in step with words.
//   words   - the prepared WordData entries to recognize, in page order.
// Returns false if the monitor reported a deadline or cancellation (after the
// remaining words have been given fake results), true otherwise.
213 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
214  PAGE_RES_IT* pr_it,
215  GenericVector<WordData>* words) {
216  // TODO(rays) Before this loop can be parallelized (it would yield a massive
217  // speed-up) all remaining member globals need to be converted to local/heap
218  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
219  // added. The results will be significantly different with adaption on, and
220  // deterioration will need investigation.
221  pr_it->restart_page();
222  for (int w = 0; w < words->size(); ++w) {
223  WordData* word = &(*words)[w];
224  if (w > 0) word->prev_word = &(*words)[w - 1];
225  if (monitor != nullptr) {
226  monitor->ocr_alive = true;
// Pass 1 reports progress in the range [0, 70); every other pass reports it
// in the range [70, 100).
227  if (pass_n == 1) {
228  monitor->progress = 70 * w / words->size();
229  } else {
230  monitor->progress = 70 + 30 * w / words->size();
231  }
232  if (monitor->progress_callback2 != nullptr) {
// NOTE(review): pr_it is only synced with words[w] further down, so the box
// reported here belongs to whatever word pr_it currently points at, and
// pr_it->word() is dereferenced without a null check -- confirm intended.
233  TBOX box = pr_it->word()->word->bounding_box();
234  (*monitor->progress_callback2)(monitor, box.left(),
235  box.right(), box.top(), box.bottom());
236  }
237  if (monitor->deadline_exceeded() ||
238  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
239  words->size()))) {
240  // Timeout. Fake out the rest of the words.
// The outer loop index is reused, so faking starts at the current word.
241  for (; w < words->size(); ++w) {
242  (*words)[w].word->SetupFake(unicharset);
243  }
244  return false;
245  }
246  }
247  if (word->word->tess_failed) {
248  int s;
// Stops at the first language result that has not failed, or at size().
249  for (s = 0; s < word->lang_words.size() &&
250  word->lang_words[s]->tess_failed; ++s) {}
251  // If all are failed, skip it. Image words are skipped by this test.
// NOTE(review): the loop above exits with s <= lang_words.size(), so this
// '>' comparison can never be true and no word is skipped here. The comment
// suggests '==' was intended -- confirm before changing, since that would
// alter which words get classified.
252  if (s > word->lang_words.size()) continue;
253  }
254  // Sync pr_it with the wth WordData.
255  while (pr_it->word() != nullptr && pr_it->word() != word->word)
256  pr_it->forward();
257  ASSERT_HOST(pr_it->word() != nullptr);
258  bool make_next_word_fuzzy = false;
259  #ifndef DISABLED_LEGACY_ENGINE
260  if (!AnyLSTMLang() &&
261  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
262  // Needs to be setup again to see the new outlines in the chopped_word.
263  SetupWordPassN(pass_n, word);
264  }
265  #endif // ndef DISABLED_LEGACY_ENGINE
266 
267  classify_word_and_language(pass_n, pr_it, word);
// NOTE(review): the line that opens the block closed at 272 (presumably a
// debug-flag condition guarding this tprintf) is not present in this listing;
// the line numbering jumps from 267 to 269.
269  tprintf("Pass%d: %s [%s]\n", pass_n,
270  word->word->best_choice->unichar_string().c_str(),
271  word->word->best_choice->debug_string().c_str());
272  }
// Step past the word just classified; the flag set by ReassignDiacritics is
// applied to the following word, if there is one.
273  pr_it->forward();
274  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
275  pr_it->MakeCurrentWordFuzzy();
276  }
277  }
278  return true;
279 }
280 
303  ETEXT_DESC* monitor,
304  const TBOX* target_word_box,
305  const char* word_config,
306  int dopasses) {
307  PAGE_RES_IT page_res_it(page_res);
308 
310  tessedit_test_adaption.set_value (true);
311  tessedit_minimal_rejection.set_value (true);
312  }
313 
314  if (dopasses==0 || dopasses==1) {
315  page_res_it.restart_page();
316  // ****************** Pass 1 *******************
317 
318  #ifndef DISABLED_LEGACY_ENGINE
319  // If the adaptive classifier is full switch to one we prepared earlier,
320  // ie on the previous page. If the current adaptive classifier is non-empty,
321  // prepare a backup starting at this page, in case it fills up. Do all this
322  // independently for each language.
323  if (AdaptiveClassifierIsFull()) {
325  } else if (!AdaptiveClassifierIsEmpty()) {
327  }
328  // Now check the sub-langs as well.
329  for (int i = 0; i < sub_langs_.size(); ++i) {
330  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
331  sub_langs_[i]->SwitchAdaptiveClassifier();
332  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
333  sub_langs_[i]->StartBackupAdaptiveClassifier();
334  }
335  }
336 
337  #endif // ndef DISABLED_LEGACY_ENGINE
338 
339  // Set up all words ready for recognition, so that if parallelism is on
340  // all the input and output classes are ready to run the classifier.
342  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
343  #ifndef DISABLED_LEGACY_ENGINE
344  if (tessedit_parallelize) {
345  PrerecAllWordsPar(words);
346  }
347  #endif // ndef DISABLED_LEGACY_ENGINE
348 
349  stats_.word_count = words.size();
350 
351  stats_.dict_words = 0;
352  stats_.doc_blob_quality = 0;
353  stats_.doc_outline_errs = 0;
354  stats_.doc_char_quality = 0;
355  stats_.good_char_count = 0;
356  stats_.doc_good_char_quality = 0;
357 
358  most_recently_used_ = this;
359  // Run pass 1 word recognition.
360  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
361  // Pass 1 post-processing.
362  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
363  page_res_it.forward()) {
364  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
365  fix_rep_char(&page_res_it);
366  continue;
367  }
368 
369  // Count dict words.
370  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
371  ++(stats_.dict_words);
372 
373  // Update misadaption log (we only need to do it on pass 1, since
374  // adaption only happens on this pass).
375  if (page_res_it.word()->blamer_bundle != nullptr &&
376  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
377  page_res->misadaption_log.push_back(
378  page_res_it.word()->blamer_bundle->misadaption_debug());
379  }
380  }
381  }
382 
383  if (dopasses == 1) return true;
384 
385  #ifndef DISABLED_LEGACY_ENGINE
386 
387  // ****************** Pass 2 *******************
389  AnyTessLang()) {
390  page_res_it.restart_page();
392  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
393  if (tessedit_parallelize) {
394  PrerecAllWordsPar(words);
395  }
396  most_recently_used_ = this;
397  // Run pass 2 word recognition.
398  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
399  }
400 
401  // The next passes are only required for Tess-only.
402  if (AnyTessLang() && !AnyLSTMLang()) {
403  // ****************** Pass 3 *******************
404  // Fix fuzzy spaces.
406 
409  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
410 
411  // ****************** Pass 4 *******************
414 
415  // ****************** Pass 5,6 *******************
416  rejection_passes(page_res, monitor, target_word_box, word_config);
417 
418  // ****************** Pass 8 *******************
419  font_recognition_pass(page_res);
420 
421  // ****************** Pass 9 *******************
422  // Check the correctness of the final results.
423  blamer_pass(page_res);
424  script_pos_pass(page_res);
425  }
426 
427  #endif // ndef DISABLED_LEGACY_ENGINE
428 
429  // Write results pass.
431  // This is now redundant, but retained commented to show how to obtain
432  // bounding boxes and style information.
433 
434  #ifndef DISABLED_LEGACY_ENGINE
435  // changed by jetsoft
436  // needed for dll to output memory structure
437  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
438  output_pass(page_res_it, target_word_box);
439  // end jetsoft
440  #endif //ndef DISABLED_LEGACY_ENGINE
441 
442  const auto pageseg_mode = static_cast<PageSegMode>(
443  static_cast<int>(tessedit_pageseg_mode));
444  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
445 
446  // Remove empty words, as these mess up the result iterators.
447  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
448  page_res_it.forward()) {
449  const WERD_RES* word = page_res_it.word();
450  const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
451  ? page_res_it.block()->block->pdblk.poly_block()
452  : nullptr;
453  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
454  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
455  page_res_it.DeleteCurrentWord();
456  }
457  }
458 
459  if (monitor != nullptr) {
460  monitor->progress = 100;
461  }
462  return true;
463 }
464 
465 #ifndef DISABLED_LEGACY_ENGINE
466 
468  PAGE_RES_IT word_it(page_res);
469 
470  WERD_RES *w_prev = nullptr;
471  WERD_RES *w = word_it.word();
472  while (true) {
473  w_prev = w;
474  while (word_it.forward() != nullptr &&
475  (!word_it.word() || word_it.word()->part_of_combo)) {
476  // advance word_it, skipping over parts of combos
477  }
478  if (!word_it.word()) break;
479  w = word_it.word();
480  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
481  continue;
482  }
483  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
484  if (tessedit_bigram_debug) {
485  tprintf("Skipping because one of the words is W_REP_CHAR\n");
486  }
487  continue;
488  }
489  // Two words sharing the same language model, excellent!
490  GenericVector<WERD_CHOICE *> overrides_word1;
491  GenericVector<WERD_CHOICE *> overrides_word2;
492 
493  const STRING orig_w1_str = w_prev->best_choice->unichar_string();
494  const STRING orig_w2_str = w->best_choice->unichar_string();
495  WERD_CHOICE prev_best(w->uch_set);
496  {
497  int w1start, w1end;
498  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
499  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
500  }
501  WERD_CHOICE this_best(w->uch_set);
502  {
503  int w2start, w2end;
504  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
505  this_best = w->best_choice->shallow_copy(w2start, w2end);
506  }
507 
508  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
509  if (tessedit_bigram_debug) {
510  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
511  orig_w1_str.c_str(), orig_w2_str.c_str());
512  }
513  continue;
514  }
515  if (tessedit_bigram_debug > 2) {
516  tprintf("Examining alt choices for \"%s %s\".\n",
517  orig_w1_str.c_str(), orig_w2_str.c_str());
518  }
519  if (tessedit_bigram_debug > 1) {
520  if (!w_prev->best_choices.singleton()) {
521  w_prev->PrintBestChoices();
522  }
523  if (!w->best_choices.singleton()) {
524  w->PrintBestChoices();
525  }
526  }
527  float best_rating = 0.0;
528  int best_idx = 0;
529  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
530  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
531  WERD_CHOICE *p1 = prev_it.data();
532  WERD_CHOICE strip1(w->uch_set);
533  {
534  int p1start, p1end;
535  p1->GetNonSuperscriptSpan(&p1start, &p1end);
536  strip1 = p1->shallow_copy(p1start, p1end);
537  }
538  WERD_CHOICE_IT w_it(&w->best_choices);
539  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
540  WERD_CHOICE *p2 = w_it.data();
541  WERD_CHOICE strip2(w->uch_set);
542  {
543  int p2start, p2end;
544  p2->GetNonSuperscriptSpan(&p2start, &p2end);
545  strip2 = p2->shallow_copy(p2start, p2end);
546  }
547  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
548  overrides_word1.push_back(p1);
549  overrides_word2.push_back(p2);
550  if (overrides_word1.size() == 1 ||
551  p1->rating() + p2->rating() < best_rating) {
552  best_rating = p1->rating() + p2->rating();
553  best_idx = overrides_word1.size() - 1;
554  }
555  }
556  }
557  }
558  if (!overrides_word1.empty()) {
559  // Excellent, we have some bigram matches.
561  *overrides_word1[best_idx]) &&
563  *overrides_word2[best_idx])) {
564  if (tessedit_bigram_debug > 1) {
565  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
566  "model.\n", orig_w1_str.c_str(), orig_w2_str.c_str());
567  }
568  continue;
569  }
570  const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571  const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572  if (new_w1_str != orig_w1_str) {
573  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
574  }
575  if (new_w2_str != orig_w2_str) {
576  w->ReplaceBestChoice(overrides_word2[best_idx]);
577  }
578  if (tessedit_bigram_debug > 0) {
579  STRING choices_description;
580  int num_bigram_choices
581  = overrides_word1.size() * overrides_word2.size();
582  if (num_bigram_choices == 1) {
583  choices_description = "This was the unique bigram choice.";
584  } else {
585  if (tessedit_bigram_debug > 1) {
586  STRING bigrams_list;
587  const int kMaxChoicesToPrint = 20;
588  for (int i = 0; i < overrides_word1.size() &&
589  i < kMaxChoicesToPrint; i++) {
590  if (i > 0) { bigrams_list += ", "; }
591  WERD_CHOICE *p1 = overrides_word1[i];
592  WERD_CHOICE *p2 = overrides_word2[i];
593  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
594  }
595  choices_description = "There were many choices: {";
596  choices_description += bigrams_list;
597  choices_description += "}";
598  } else {
599  choices_description.add_str_int("There were ", num_bigram_choices);
600  choices_description += " compatible bigrams.";
601  }
602  }
603  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
604  orig_w1_str.c_str(), orig_w2_str.c_str(),
605  new_w1_str.c_str(), new_w2_str.c_str(),
606  choices_description.c_str());
607  }
608  }
609  }
610 }
611 
613  ETEXT_DESC* monitor,
614  const TBOX* target_word_box,
615  const char* word_config) {
616  PAGE_RES_IT page_res_it(page_res);
617  // ****************** Pass 5 *******************
618  // Gather statistics on rejects.
619  int word_index = 0;
620  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
622  WERD_RES* word = page_res_it.word();
623  word_index++;
624  if (monitor != nullptr) {
625  monitor->ocr_alive = true;
626  monitor->progress = 95 + 5 * word_index / stats_.word_count;
627  }
628  if (word->rebuild_word == nullptr) {
629  // Word was not processed by tesseract.
630  page_res_it.forward();
631  continue;
632  }
633  check_debug_pt(word, 70);
634 
635  // changed by jetsoft
636  // specific to its needs to extract one word when needed
637  if (target_word_box &&
639  *target_word_box, word_config, 4)) {
640  page_res_it.forward();
641  continue;
642  }
643  // end jetsoft
644 
645  page_res_it.rej_stat_word();
646  const int chars_in_word = word->reject_map.length();
647  const int rejects_in_word = word->reject_map.reject_count();
648 
649  const int blob_quality = word_blob_quality(word);
650  stats_.doc_blob_quality += blob_quality;
651  const int outline_errs = word_outline_errs(word);
652  stats_.doc_outline_errs += outline_errs;
653  int16_t all_char_quality;
654  int16_t accepted_all_char_quality;
655  word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
656  stats_.doc_char_quality += all_char_quality;
657  const uint8_t permuter_type = word->best_choice->permuter();
658  if ((permuter_type == SYSTEM_DAWG_PERM) ||
659  (permuter_type == FREQ_DAWG_PERM) ||
660  (permuter_type == USER_DAWG_PERM)) {
661  stats_.good_char_count += chars_in_word - rejects_in_word;
662  stats_.doc_good_char_quality += accepted_all_char_quality;
663  }
664  check_debug_pt(word, 80);
666  (blob_quality == 0) && (outline_errs >= chars_in_word))
668  check_debug_pt(word, 90);
669  page_res_it.forward();
670  }
671 
673  tprintf
674  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
675  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
676  page_res->char_count, page_res->rej_count,
677  page_res->rej_count / static_cast<float>(page_res->char_count),
678  stats_.doc_blob_quality,
679  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
680  stats_.doc_outline_errs,
681  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
682  stats_.doc_char_quality,
683  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
684  stats_.doc_good_char_quality,
685  (stats_.good_char_count > 0) ?
686  (stats_.doc_good_char_quality /
687  static_cast<float>(stats_.good_char_count)) : 0.0);
688  }
689  bool good_quality_doc =
690  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
691  quality_rej_pc) &&
692  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
693  quality_blob_pc) &&
694  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
696  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
698 
699  // ****************** Pass 6 *******************
700  // Do whole document or whole block rejection pass
701  if (!tessedit_test_adaption) {
703  quality_based_rejection(page_res_it, good_quality_doc);
704  }
705 }
706 
707 #endif // ndef DISABLED_LEGACY_ENGINE
708 
710  if (!wordrec_run_blamer) return;
711  PAGE_RES_IT page_res_it(page_res);
712  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
713  page_res_it.forward()) {
714  WERD_RES *word = page_res_it.word();
717  }
718  tprintf("Blame reasons:\n");
719  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
721  static_cast<IncorrectResultReason>(bl)),
722  page_res->blame_reasons[bl]);
723  }
724  if (page_res->misadaption_log.size() > 0) {
725  tprintf("Misadaption log:\n");
726  for (int i = 0; i < page_res->misadaption_log.size(); ++i) {
727  tprintf("%s\n", page_res->misadaption_log[i].c_str());
728  }
729  }
730 }
731 
732 // Sets script positions and detects smallcaps on all output words.
734  PAGE_RES_IT page_res_it(page_res);
735  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
736  page_res_it.forward()) {
737  WERD_RES* word = page_res_it.word();
738  if (word->word->flag(W_REP_CHAR)) {
739  page_res_it.forward();
740  continue;
741  }
742  const float x_height = page_res_it.block()->block->x_height();
743  float word_x_height = word->x_height;
744  if (word_x_height < word->best_choice->min_x_height() ||
745  word_x_height > word->best_choice->max_x_height()) {
746  word_x_height = (word->best_choice->min_x_height() +
747  word->best_choice->max_x_height()) / 2.0f;
748  }
749  // Test for small caps. Word capheight must be close to block xheight,
750  // and word must contain no lower case letters, and at least one upper case.
751  const double small_cap_xheight = x_height * kXHeightCapRatio;
752  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
753  if (word->uch_set->script_has_xheight() &&
754  small_cap_xheight - small_cap_delta <= word_x_height &&
755  word_x_height <= small_cap_xheight + small_cap_delta) {
756  // Scan for upper/lower.
757  int num_upper = 0;
758  int num_lower = 0;
759  for (int i = 0; i < word->best_choice->length(); ++i) {
760  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
761  ++num_upper;
762  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
763  ++num_lower;
764  }
765  if (num_upper > 0 && num_lower == 0)
766  word->small_caps = true;
767  }
768  word->SetScriptPositions();
769  }
770 }
771 
772 // Helper finds the gap between the index word and the next.
773 static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
774  int* next_left) {
775  *right = -INT32_MAX;
776  *next_left = INT32_MAX;
777  if (index < words.size()) {
778  *right = words[index]->word->bounding_box().right();
779  if (index + 1 < words.size())
780  *next_left = words[index + 1]->word->bounding_box().left();
781  }
782 }
783 
784 // Factored helper computes the rating, certainty, badness and validity of
785 // the permuter of the words in [first_index, end_index).
786 static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
787  int first_index, int end_index, float* rating,
788  float* certainty, bool* bad,
789  bool* valid_permuter) {
790  if (end_index <= first_index) {
791  *bad = true;
792  *valid_permuter = false;
793  }
794  for (int index = first_index; index < end_index && index < words.size();
795  ++index) {
796  WERD_CHOICE* choice = words[index]->best_choice;
797  if (choice == nullptr) {
798  *bad = true;
799  } else {
800  *rating += choice->rating();
801  *certainty = std::min(*certainty, choice->certainty());
802  if (!Dict::valid_word_permuter(choice->permuter(), false))
803  *valid_permuter = false;
804  }
805  }
806 }
807 
808 // Helper chooses the best combination of words, transferring good ones from
809 // new_words to best_words. To win, a new word must have (better rating and
810 // certainty) or (better permuter status and rating within rating ratio and
811 // certainty within certainty margin) than current best.
812 // All the new_words are consumed (moved to best_words or deleted.)
813 // The return value is the number of new_words used minus the number of
814 // best_words that remain in the output.
815 static int SelectBestWords(double rating_ratio,
816  double certainty_margin,
817  bool debug,
818  PointerVector<WERD_RES>* new_words,
819  PointerVector<WERD_RES>* best_words) {
820  // Process the smallest groups of words that have an overlapping word
821  // boundary at the end.
822  GenericVector<WERD_RES*> out_words;
823  // Index into each word vector (best, new).
824  int b = 0, n = 0;
825  int num_best = 0, num_new = 0;
826  while (b < best_words->size() || n < new_words->size()) {
827  // Start of the current run in each.
828  int start_b = b, start_n = n;
829  while (b < best_words->size() || n < new_words->size()) {
830  int b_right = -INT32_MAX;
831  int next_b_left = INT32_MAX;
832  WordGap(*best_words, b, &b_right, &next_b_left);
833  int n_right = -INT32_MAX;
834  int next_n_left = INT32_MAX;
835  WordGap(*new_words, n, &n_right, &next_n_left);
836  if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
837  // The word breaks overlap. [start_b,b] and [start_n, n] match.
838  break;
839  }
840  // Keep searching for the matching word break.
841  if ((b_right < n_right && b < best_words->size()) ||
842  n == new_words->size())
843  ++b;
844  else
845  ++n;
846  }
847  // Rating of the current run in each.
848  float b_rating = 0.0f, n_rating = 0.0f;
849  // Certainty of the current run in each.
850  float b_certainty = 0.0f, n_certainty = 0.0f;
851  // True if any word is missing its best choice.
852  bool b_bad = false, n_bad = false;
853  // True if all words have a valid permuter.
854  bool b_valid_permuter = true, n_valid_permuter = true;
855  const int end_b = b < best_words->size() ? b + 1 : b;
856  const int end_n = n < new_words->size() ? n + 1 : n;
857  EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
858  &b_bad, &b_valid_permuter);
859  EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
860  &n_bad, &n_valid_permuter);
861  bool new_better = false;
862  if (!n_bad && (b_bad || (n_certainty > b_certainty &&
863  n_rating < b_rating) ||
864  (!b_valid_permuter && n_valid_permuter &&
865  n_rating < b_rating * rating_ratio &&
866  n_certainty > b_certainty - certainty_margin))) {
867  // New is better.
868  for (int i = start_n; i < end_n; ++i) {
869  out_words.push_back((*new_words)[i]);
870  (*new_words)[i] = nullptr;
871  ++num_new;
872  }
873  new_better = true;
874  } else if (!b_bad) {
875  // Current best is better.
876  for (int i = start_b; i < end_b; ++i) {
877  out_words.push_back((*best_words)[i]);
878  (*best_words)[i] = nullptr;
879  ++num_best;
880  }
881  }
882  if (debug) {
883  tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
884  " valid dict: %d v %d\n",
885  end_n - start_n, new_better ? "better" : "worse",
886  end_b - start_b, n_rating, b_rating,
887  n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
888  }
889  // Move on to the next group.
890  b = end_b;
891  n = end_n;
892  }
893  // Transfer from out_words to best_words.
894  best_words->clear();
895  for (int i = 0; i < out_words.size(); ++i)
896  best_words->push_back(out_words[i]);
897  return num_new - num_best;
898 }
899 
900 // Helper to recognize the word using the given (language-specific) tesseract.
901 // Returns positive if this recognizer found more new best words than the
902 // number kept from best_words.
904  WordRecognizer recognizer, bool debug,
905  WERD_RES** in_word,
906  PointerVector<WERD_RES>* best_words) {
907  if (debug) {
908  tprintf("Trying word using lang %s, oem %d\n",
909  lang.c_str(), static_cast<int>(tessedit_ocr_engine_mode));
910  }
911  // Run the recognizer on the word.
912  PointerVector<WERD_RES> new_words;
913  (this->*recognizer)(word_data, in_word, &new_words);
914  if (new_words.empty()) {
915  // Transfer input word to new_words, as the classifier must have put
916  // the result back in the input.
917  new_words.push_back(*in_word);
918  *in_word = nullptr;
919  }
920  if (debug) {
921  for (int i = 0; i < new_words.size(); ++i)
922  new_words[i]->DebugTopChoice("Lang result");
923  }
924  // Initial version is a bit of a hack based on better certainty and rating
925  // or a dictionary vs non-dictionary word.
926  return SelectBestWords(classify_max_rating_ratio,
928  debug, &new_words, best_words);
929 }
930 
931 // Helper returns true if all the words are acceptable.
932 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
933  for (int w = 0; w < words.size(); ++w) {
934  if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
935  }
936  return true;
937 }
938 
939 #ifndef DISABLED_LEGACY_ENGINE
940 
941 // Moves good-looking "noise"/diacritics from the reject list to the main
942 // blob list on the current word. Returns true if anything was done, and
943 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
945  bool* make_next_word_fuzzy) {
946  *make_next_word_fuzzy = false;
947  WERD* real_word = pr_it->word()->word;
948  if (real_word->rej_cblob_list()->empty() ||
949  real_word->cblob_list()->empty() ||
950  real_word->rej_cblob_list()->length() > noise_maxperword)
951  return false;
952  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
953  // Get the noise outlines into a vector with matching bool map.
954  GenericVector<C_OUTLINE*> outlines;
955  real_word->GetNoiseOutlines(&outlines);
956  GenericVector<bool> word_wanted;
957  GenericVector<bool> overlapped_any_blob;
958  GenericVector<C_BLOB*> target_blobs;
959  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
960  &word_wanted, &overlapped_any_blob,
961  &target_blobs);
962  // Filter the outlines that overlapped any blob and put them into the word
963  // now. This simplifies the remaining task and also makes it more accurate
964  // as it has more completed blobs to work on.
965  GenericVector<bool> wanted;
966  GenericVector<C_BLOB*> wanted_blobs;
967  GenericVector<C_OUTLINE*> wanted_outlines;
968  int num_overlapped = 0;
969  int num_overlapped_used = 0;
970  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
971  if (overlapped_any_blob[i]) {
972  ++num_overlapped;
973  if (word_wanted[i]) ++num_overlapped_used;
974  wanted.push_back(word_wanted[i]);
975  wanted_blobs.push_back(target_blobs[i]);
976  wanted_outlines.push_back(outlines[i]);
977  outlines[i] = nullptr;
978  }
979  }
980  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
981  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
982  &target_blobs);
983  int non_overlapped = 0;
984  int non_overlapped_used = 0;
985  for (int i = 0; i < word_wanted.size(); ++i) {
986  if (word_wanted[i]) ++non_overlapped_used;
987  if (outlines[i] != nullptr) ++non_overlapped_used;
988  }
989  if (debug_noise_removal) {
990  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
991  num_overlapped_used, num_overlapped, non_overlapped_used,
992  non_overlapped);
993  real_word->bounding_box().print();
994  }
995  // Now we have decided which outlines we want, put them into the real_word.
996  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
997  make_next_word_fuzzy)) {
998  pr_it->MakeCurrentWordFuzzy();
999  }
1000  // TODO(rays) Parts of combos have a deep copy of the real word, and need
1001  // to have their noise outlines moved/assigned in the same way!!
1002  return num_overlapped_used != 0 || non_overlapped_used != 0;
1003 }
1004 
1005 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
1006 // Input: a set of noisy outlines that probably belong to the real_word.
1007 // Output: word_wanted indicates which outlines are to be assigned to a blob,
1008 // target_blobs indicates which to assign to, and overlapped_any_blob is
1009 // true for all outlines that overlapped a blob.
1011  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1012  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1013  GenericVector<bool>* overlapped_any_blob,
1014  GenericVector<C_BLOB*>* target_blobs) {
1015  GenericVector<bool> blob_wanted;
1016  word_wanted->init_to_size(outlines.size(), false);
1017  overlapped_any_blob->init_to_size(outlines.size(), false);
1018  target_blobs->init_to_size(outlines.size(), nullptr);
1019  // For each real blob, find the outlines that seriously overlap it.
1020  // A single blob could be several merged characters, so there can be quite
1021  // a few outlines overlapping, and the full engine needs to be used to chop
1022  // and join to get a sensible result.
1023  C_BLOB_IT blob_it(real_word->cblob_list());
1024  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1025  C_BLOB* blob = blob_it.data();
1026  const TBOX blob_box = blob->bounding_box();
1027  blob_wanted.init_to_size(outlines.size(), false);
1028  int num_blob_outlines = 0;
1029  for (int i = 0; i < outlines.size(); ++i) {
1030  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1031  !(*word_wanted)[i]) {
1032  blob_wanted[i] = true;
1033  (*overlapped_any_blob)[i] = true;
1034  ++num_blob_outlines;
1035  }
1036  }
1037  if (debug_noise_removal) {
1038  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1039  blob_box.print();
1040  }
1041  // If any outlines overlap the blob, and not too many, classify the blob
1042  // (using the full engine, languages and all), and choose the maximal
1043  // combination of outlines that doesn't hurt the end-result classification
1044  // by too much. Mark them as wanted.
1045  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1046  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1047  outlines, num_blob_outlines,
1048  &blob_wanted)) {
1049  for (int i = 0; i < blob_wanted.size(); ++i) {
1050  if (blob_wanted[i]) {
1051  // Claim the outline and record where it is going.
1052  (*word_wanted)[i] = true;
1053  (*target_blobs)[i] = blob;
1054  }
1055  }
1056  }
1057  }
1058  }
1059 }
1060 
1061 // Attempts to assign non-overlapping outlines to their nearest blobs or
1062 // make new blobs out of them.
1064  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1065  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1066  GenericVector<C_BLOB*>* target_blobs) {
1067  GenericVector<bool> blob_wanted;
1068  word_wanted->init_to_size(outlines.size(), false);
1069  target_blobs->init_to_size(outlines.size(), nullptr);
1070  // Check for outlines that need to be turned into stand-alone blobs.
1071  for (int i = 0; i < outlines.size(); ++i) {
1072  if (outlines[i] == nullptr) continue;
1073  // Get a set of adjacent outlines that don't overlap any existing blob.
1074  blob_wanted.init_to_size(outlines.size(), false);
1075  int num_blob_outlines = 0;
1076  TBOX total_ol_box(outlines[i]->bounding_box());
1077  while (i < outlines.size() && outlines[i] != nullptr) {
1078  blob_wanted[i] = true;
1079  total_ol_box += outlines[i]->bounding_box();
1080  ++i;
1081  ++num_blob_outlines;
1082  }
1083  // Find the insertion point.
1084  C_BLOB_IT blob_it(real_word->cblob_list());
1085  while (!blob_it.at_last() &&
1086  blob_it.data_relative(1)->bounding_box().left() <=
1087  total_ol_box.left()) {
1088  blob_it.forward();
1089  }
1090  // Choose which combination of them we actually want and where to put
1091  // them.
1092  if (debug_noise_removal)
1093  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1094  C_BLOB* left_blob = blob_it.data();
1095  TBOX left_box = left_blob->bounding_box();
1096  C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1097  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1098  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1099  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1100  outlines, num_blob_outlines,
1101  &blob_wanted)) {
1102  if (debug_noise_removal) tprintf("Added to left blob\n");
1103  for (int j = 0; j < blob_wanted.size(); ++j) {
1104  if (blob_wanted[j]) {
1105  (*word_wanted)[j] = true;
1106  (*target_blobs)[j] = left_blob;
1107  }
1108  }
1109  } else if (right_blob != nullptr &&
1110  (!left_box.x_overlap(total_ol_box) ||
1111  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1113  right_blob, outlines,
1114  num_blob_outlines, &blob_wanted)) {
1115  if (debug_noise_removal) tprintf("Added to right blob\n");
1116  for (int j = 0; j < blob_wanted.size(); ++j) {
1117  if (blob_wanted[j]) {
1118  (*word_wanted)[j] = true;
1119  (*target_blobs)[j] = right_blob;
1120  }
1121  }
1122  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1123  outlines, num_blob_outlines,
1124  &blob_wanted)) {
1125  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1126  for (int j = 0; j < blob_wanted.size(); ++j) {
1127  if (blob_wanted[j]) {
1128  (*word_wanted)[j] = true;
1129  (*target_blobs)[j] = nullptr;
1130  }
1131  }
1132  }
1133  }
1134 }
1135 
1136 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1137 // chooses the optimal set (approximately) and returns true if any outlines
1138 // are desired, in which case ok_outlines indicates which ones.
1140  int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
1141  const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
1142  GenericVector<bool>* ok_outlines) {
1143  STRING best_str;
1144  float target_cert = certainty_threshold;
1145  if (blob != nullptr) {
1146  float target_c2;
1147  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1148  if (debug_noise_removal) {
1149  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(),
1150  target_cert, target_c2);
1151  blob->bounding_box().print();
1152  }
1153  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1154  }
1155  GenericVector<bool> test_outlines = *ok_outlines;
1156  // Start with all the outlines in.
1157  STRING all_str;
1158  GenericVector<bool> best_outlines = *ok_outlines;
1159  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1160  pr_it, blob, &all_str);
1161  if (debug_noise_removal) {
1162  TBOX ol_box;
1163  for (int i = 0; i < test_outlines.size(); ++i) {
1164  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1165  }
1166  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1167  all_str.c_str(), best_cert, best_cert - target_cert);
1168  ol_box.print();
1169  }
1170  // Iteratively zero out the bit that improves the certainty the most, until
1171  // we get past the threshold, have zero bits, or fail to improve.
1172  int best_index = 0; // To zero out.
1173  while (num_outlines > 1 && best_index >= 0 &&
1174  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1175  // Find the best bit to zero out.
1176  best_index = -1;
1177  for (int i = 0; i < outlines.size(); ++i) {
1178  if (test_outlines[i]) {
1179  test_outlines[i] = false;
1180  STRING str;
1181  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1182  pr_it, blob, &str);
1183  if (debug_noise_removal) {
1184  TBOX ol_box;
1185  for (int j = 0; j < outlines.size(); ++j) {
1186  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1187  tprintf("%d", test_outlines[j]);
1188  }
1189  tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(),
1190  cert, cert - target_cert);
1191  ol_box.print();
1192  }
1193  if (cert > best_cert) {
1194  best_cert = cert;
1195  best_index = i;
1196  best_outlines = test_outlines;
1197  }
1198  test_outlines[i] = true;
1199  }
1200  }
1201  if (best_index >= 0) {
1202  test_outlines[best_index] = false;
1203  --num_outlines;
1204  }
1205  }
1206  if (best_cert >= target_cert) {
1207  // Save the best combination.
1208  *ok_outlines = best_outlines;
1209  if (debug_noise_removal) {
1210  tprintf("%s noise combination ", blob ? "Adding" : "New");
1211  for (int i = 0; i < best_outlines.size(); ++i) {
1212  tprintf("%d", best_outlines[i]);
1213  }
1214  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1215  target_cert);
1216  }
1217  return true;
1218  }
1219 
1220  return false;
1221 }
1222 
1223 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1224 // the inclusion of the outlines, and returns the certainty of the raw choice.
1226  const GenericVector<bool>& ok_outlines,
1227  const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
1228  C_BLOB* blob, STRING* best_str) {
1229  C_OUTLINE_IT ol_it;
1230  C_OUTLINE* first_to_keep = nullptr;
1231  C_BLOB* local_blob = nullptr;
1232  if (blob != nullptr) {
1233  // Add the required outlines to the blob.
1234  ol_it.set_to_list(blob->out_list());
1235  first_to_keep = ol_it.data();
1236  }
1237  for (int i = 0; i < ok_outlines.size(); ++i) {
1238  if (ok_outlines[i]) {
1239  // This outline is to be added.
1240  if (blob == nullptr) {
1241  local_blob = new C_BLOB(outlines[i]);
1242  blob = local_blob;
1243  ol_it.set_to_list(blob->out_list());
1244  } else {
1245  ol_it.add_before_stay_put(outlines[i]);
1246  }
1247  }
1248  }
1249  float c2;
1250  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1251  ol_it.move_to_first();
1252  if (first_to_keep == nullptr) {
1253  // We created blob. Empty its outlines and delete it.
1254  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1255  delete local_blob;
1256  cert = -c2;
1257  } else {
1258  // Remove the outlines that we put in.
1259  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1260  ol_it.extract();
1261  }
1262  }
1263  return cert;
1264 }
1265 
1266 // Classifies the given blob (part of word_data->word->word) as an individual
1267 // word, using languages, chopper etc, returning only the certainty of the
1268 // best raw choice, and undoing all the work done to fake out the word.
1270  C_BLOB* blob, STRING* best_str, float* c2) {
1271  WERD* real_word = pr_it->word()->word;
1272  WERD* word = real_word->ConstructFromSingleBlob(
1273  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1274  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1275  // Get a new iterator that points to the new word.
1276  PAGE_RES_IT it(pr_it->page_res);
1277  while (it.word() != word_res && it.word() != nullptr) it.forward();
1278  ASSERT_HOST(it.word() == word_res);
1279  WordData wd(it);
1280  // Force full initialization.
1281  SetupWordPassN(1, &wd);
1282  classify_word_and_language(pass_n, &it, &wd);
1283  if (debug_noise_removal) {
1284  if (wd.word->raw_choice != nullptr) {
1285  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1286  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1287  wd.word->raw_choice->max_x_height());
1288  } else {
1289  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1290  wd.row->x_height());
1291  }
1292  }
1293  float cert = 0.0f;
1294  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1295  cert = wd.word->raw_choice->certainty();
1296  float rat = wd.word->raw_choice->rating();
1297  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1298  *best_str = wd.word->raw_choice->unichar_string();
1299  } else {
1300  *c2 = 0.0f;
1301  *best_str = "";
1302  }
1303  it.DeleteCurrentWord();
1304  pr_it->ResetWordIterator();
1305  return cert;
1306 }
1307 
1308 #endif // ndef DISABLED_LEGACY_ENGINE
1309 
1310 // Generic function for classifying a word. Can be used either for pass1 or
1311 // pass2 according to the function passed to recognizer.
1312 // word_data holds the word to be recognized, and its block and row, and
1313 // pr_it points to the word as well, in case we are running LSTM and it wants
1314 // to output multiple words.
1315 // Recognizes in the current language, and if successful that is all.
1316 // If recognition was not successful, tries all available languages until
1317 // it gets a successful result or runs out of languages. Keeps the best result.
1319  WordData* word_data) {
1320 #ifdef DISABLED_LEGACY_ENGINE
1322 #else
1323  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1325 #endif // def DISABLED_LEGACY_ENGINE
1326 
1327  // Best result so far.
1328  PointerVector<WERD_RES> best_words;
1329  // Points to the best result. May be word or in lang_words.
1330  const WERD_RES* word = word_data->word;
1331  clock_t start_t = clock();
1332  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1333  if (debug) {
1334  tprintf("%s word with lang %s at:",
1335  word->done ? "Already done" : "Processing",
1336  most_recently_used_->lang.c_str());
1337  word->word->bounding_box().print();
1338  }
1339  if (word->done) {
1340  // If done on pass1, leave it as-is.
1341  if (!word->tess_failed)
1342  most_recently_used_ = word->tesseract;
1343  return;
1344  }
1345  int sub = sub_langs_.size();
1346  if (most_recently_used_ != this) {
1347  // Get the index of the most_recently_used_.
1348  for (sub = 0; sub < sub_langs_.size() &&
1349  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1350  }
1351  most_recently_used_->RetryWithLanguage(
1352  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1353  Tesseract* best_lang_tess = most_recently_used_;
1354  if (!WordsAcceptable(best_words)) {
1355  // Try all the other languages to see if they are any better.
1356  if (most_recently_used_ != this &&
1357  this->RetryWithLanguage(*word_data, recognizer, debug,
1358  &word_data->lang_words[sub_langs_.size()],
1359  &best_words) > 0) {
1360  best_lang_tess = this;
1361  }
1362  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1363  ++i) {
1364  if (most_recently_used_ != sub_langs_[i] &&
1365  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1366  &word_data->lang_words[i],
1367  &best_words) > 0) {
1368  best_lang_tess = sub_langs_[i];
1369  }
1370  }
1371  }
1372  most_recently_used_ = best_lang_tess;
1373  if (!best_words.empty()) {
1374  if (best_words.size() == 1 && !best_words[0]->combination) {
1375  // Move the best single result to the main word.
1376  word_data->word->ConsumeWordResults(best_words[0]);
1377  } else {
1378  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1379  word_data->word = best_words.back();
1380  pr_it->ReplaceCurrentWord(&best_words);
1381  }
1382  ASSERT_HOST(word_data->word->box_word != nullptr);
1383  } else {
1384  tprintf("no best words!!\n");
1385  }
1386  clock_t ocr_t = clock();
1387  if (tessedit_timing_debug) {
1388  tprintf("%s (ocr took %.2f sec)\n",
1389  word_data->word->best_choice->unichar_string().c_str(),
1390  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1391  }
1392 }
1393 
1401  WERD_RES** in_word,
1402  PointerVector<WERD_RES>* out_words) {
1403  ROW* row = word_data.row;
1404  BLOCK* block = word_data.block;
1405  prev_word_best_choice_ = word_data.prev_word != nullptr
1406  ? word_data.prev_word->word->best_choice : nullptr;
1407 #ifndef ANDROID_BUILD
1408 #ifdef DISABLED_LEGACY_ENGINE
1410 #else
1413 #endif // def DISABLED_LEGACY_ENGINE
1414  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1415  LSTMRecognizeWord(*block, row, *in_word, out_words);
1416  if (!out_words->empty())
1417  return; // Successful lstm recognition.
1418  }
1420  // No fallback allowed, so use a fake.
1421  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1422  return;
1423  }
1424 
1425  #ifndef DISABLED_LEGACY_ENGINE
1426  // Fall back to tesseract for failed words or odd words.
1427  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1428  OEM_TESSERACT_ONLY, nullptr,
1431  poly_allow_detailed_fx, row, block);
1432 #endif // ndef DISABLED_LEGACY_ENGINE
1433  }
1434 #endif // ndef ANDROID_BUILD
1435 
1436 #ifndef DISABLED_LEGACY_ENGINE
1437  WERD_RES* word = *in_word;
1438  match_word_pass_n(1, word, row, block);
1439  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1440  word->tess_would_adapt = AdaptableWord(word);
1441  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1442 
1443  if (adapt_ok) {
1444  // Send word to adaptive classifier for training.
1445  word->BestChoiceToCorrectText();
1446  LearnWord(nullptr, word);
1447  // Mark misadaptions if running blamer.
1448  if (word->blamer_bundle != nullptr) {
1451  }
1452  }
1453 
1454  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1456  }
1457 #endif // ndef DISABLED_LEGACY_ENGINE
1458 }
1459 
1460 // Helper to report the result of the xheight fix.
1461 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
1462  WERD_RES* word, WERD_RES* new_word) {
1463  tprintf("New XHT Match:%s = %s ",
1464  word->best_choice->unichar_string().c_str(),
1465  word->best_choice->debug_string().c_str());
1466  word->reject_map.print(debug_fp);
1467  tprintf(" -> %s = %s ",
1468  new_word->best_choice->unichar_string().c_str(),
1469  new_word->best_choice->debug_string().c_str());
1470  new_word->reject_map.print(debug_fp);
1471  tprintf(" %s->%s %s %s\n",
1472  word->guessed_x_ht ? "GUESS" : "CERT",
1473  new_word->guessed_x_ht ? "GUESS" : "CERT",
1474  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1475  accept_new_word ? "ACCEPTED" : "");
1476 }
1477 
1478 #ifndef DISABLED_LEGACY_ENGINE
1479 
1480 // Run the x-height fix-up, based on min/max top/bottom information in
1481 // unicharset.
1482 // Returns true if the word was changed.
1483 // See the comment in fixxht.cpp for a description of the overall process.
1484 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
1485  int original_misfits = CountMisfitTops(word);
1486  if (original_misfits == 0)
1487  return false;
1488  float baseline_shift = 0.0f;
1489  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1490  if (baseline_shift != 0.0f) {
1491  // Try the shift on its own first.
1492  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1493  word, block, row))
1494  return false;
1495  original_misfits = CountMisfitTops(word);
1496  if (original_misfits > 0) {
1497  float new_baseline_shift;
1498  // Now recompute the new x_height.
1499  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1500  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1501  // No test of return value here, as we are definitely making a change
1502  // to the word by shifting the baseline.
1503  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1504  word, block, row);
1505  }
1506  }
1507  return true;
1508  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1509  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1510  word, block, row);
1511  } else {
1512  return false;
1513  }
1514 }
1515 
1516 // Runs recognition with the test baseline shift and x-height and returns true
1517 // if there was an improvement in recognition result.
1518 bool Tesseract::TestNewNormalization(int original_misfits,
1519  float baseline_shift, float new_x_ht,
1520  WERD_RES *word, BLOCK* block, ROW *row) {
1521  bool accept_new_x_ht = false;
1522  WERD_RES new_x_ht_word(word->word);
1523  if (word->blamer_bundle != nullptr) {
1524  new_x_ht_word.blamer_bundle = new BlamerBundle();
1525  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1526  }
1527  new_x_ht_word.x_height = new_x_ht;
1528  new_x_ht_word.baseline_shift = baseline_shift;
1529  new_x_ht_word.caps_height = 0.0;
1530  new_x_ht_word.SetupForRecognition(
1531  unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1533  poly_allow_detailed_fx, row, block);
1534  match_word_pass_n(2, &new_x_ht_word, row, block);
1535  if (!new_x_ht_word.tess_failed) {
1536  int new_misfits = CountMisfitTops(&new_x_ht_word);
1537  if (debug_x_ht_level >= 1) {
1538  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1539  original_misfits, word->x_height,
1540  new_misfits, new_x_ht);
1541  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1542  word->best_choice->rating(), word->best_choice->certainty(),
1543  new_x_ht_word.best_choice->rating(),
1544  new_x_ht_word.best_choice->certainty());
1545  }
1546  // The misfits must improve and either the rating or certainty.
1547  accept_new_x_ht = new_misfits < original_misfits &&
1548  (new_x_ht_word.best_choice->certainty() >
1549  word->best_choice->certainty() ||
1550  new_x_ht_word.best_choice->rating() <
1551  word->best_choice->rating());
1552  if (debug_x_ht_level >= 1) {
1553  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1554  }
1555  }
1556  if (accept_new_x_ht) {
1557  word->ConsumeWordResults(&new_x_ht_word);
1558  return true;
1559  }
1560  return false;
1561 }
1562 
1563 #endif // ndef DISABLED_LEGACY_ENGINE
1564 
1572  WERD_RES** in_word,
1573  PointerVector<WERD_RES>* out_words) {
1574  // Return if we do not want to run Tesseract.
1576  return;
1577  }
1578 #ifndef DISABLED_LEGACY_ENGINE
1579  ROW* row = word_data.row;
1580  BLOCK* block = word_data.block;
1581  WERD_RES* word = *in_word;
1582  prev_word_best_choice_ = word_data.prev_word != nullptr
1583  ? word_data.prev_word->word->best_choice : nullptr;
1584 
1586  check_debug_pt(word, 30);
1587  if (!word->done) {
1588  word->caps_height = 0.0;
1589  if (word->x_height == 0.0f)
1590  word->x_height = row->x_height();
1591  match_word_pass_n(2, word, row, block);
1592  check_debug_pt(word, 40);
1593  }
1594 
1595  SubAndSuperscriptFix(word);
1596 
1597  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1599  block->classify_rotation().y() == 0.0f) {
1600  // Use the tops and bottoms since they are available.
1601  TrainedXheightFix(word, block, row);
1602  }
1603 
1605  }
1606 #ifndef GRAPHICS_DISABLED
1608  if (fx_win == nullptr)
1609  create_fx_win();
1610  clear_fx_win();
1611  word->rebuild_word->plot(fx_win);
1612  TBOX wbox = word->rebuild_word->bounding_box();
1613  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1614  wbox.right(), wbox.bottom());
1616  }
1617 #endif
1619  check_debug_pt(word, 50);
1620 #endif // ndef DISABLED_LEGACY_ENGINE
1621 }
1622 
1623 #ifndef DISABLED_LEGACY_ENGINE
1624 
1630  ROW *row, BLOCK* block) {
1631  if (word->tess_failed) return;
1632  tess_segment_pass_n(pass_n, word);
1633 
1634  if (!word->tess_failed) {
1635  if (!word->word->flag (W_REP_CHAR)) {
1636  word->fix_quotes();
1638  word->fix_hyphens();
1639  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1640  if (word->best_choice->length() != word->box_word->length()) {
1641  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1642  " #Blobs=%d\n",
1643  word->best_choice->debug_string().c_str(),
1644  word->best_choice->length(),
1645  word->box_word->length());
1646 
1647  }
1648  word->tess_accepted = tess_acceptable_word(word);
1649 
1650  // Also sets word->done flag
1651  make_reject_map(word, row, pass_n);
1652  }
1653  }
1654  set_word_fonts(word);
1655 
1656  ASSERT_HOST(word->raw_choice != nullptr);
1657 }
1658 #endif // ndef DISABLED_LEGACY_ENGINE
1659 
1660 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1661 // the given char_id, or nullptr if none can be found.
1662 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1663  WERD_RES* word_res) {
1664  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1665  BLOB_CHOICE* best_choice = nullptr;
1666  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1667  BLOB_CHOICE* choice = FindMatchingChoice(char_id,
1668  word_res->GetBlobChoices(i));
1669  if (choice != nullptr) {
1670  if (best_choice == nullptr || choice->rating() < best_choice->rating())
1671  best_choice = choice;
1672  }
1673  }
1674  return best_choice;
1675 }
1676 
1677 // Helper to insert blob_choice in each location in the leader word if there is
1678 // no matching BLOB_CHOICE there already, and correct any incorrect results
1679 // in the best_choice.
1680 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1681  WERD_RES* word_res) {
1682  WERD_CHOICE* word = word_res->best_choice;
1683  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1684  BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1685  word_res->GetBlobChoices(i));
1686  if (choice == nullptr) {
1687  BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1688  choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1689  }
1690  }
1691  // Correct any incorrect results in word.
1692  for (int i = 0; i < word->length(); ++i) {
1693  if (word->unichar_id(i) != blob_choice->unichar_id())
1694  word->set_unichar_id(blob_choice->unichar_id(), i);
1695  }
1696 }
1697 
1706  WERD_RES *word_res = page_res_it->word();
1707  const WERD_CHOICE &word = *(word_res->best_choice);
1708 
1709  // Find the frequency of each unique character in the word.
1710  SortHelper<UNICHAR_ID> rep_ch(word.length());
1711  for (int i = 0; i < word.length(); ++i) {
1712  rep_ch.Add(word.unichar_id(i), 1);
1713  }
1714 
1715  // Find the most frequent result.
1716  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1717  int max_count = rep_ch.MaxCount(&maxch_id);
1718  // Find the best exemplar of a classifier result for maxch_id.
1719  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1720  if (best_choice == nullptr) {
1721  tprintf("Failed to find a choice for %s, occurring %d times\n",
1722  word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1723  return;
1724  }
1725  word_res->done = true;
1726 
1727  // Measure the mean space.
1728  int gap_count = 0;
1729  WERD* werd = word_res->word;
1730  C_BLOB_IT blob_it(werd->cblob_list());
1731  C_BLOB* prev_blob = blob_it.data();
1732  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1733  C_BLOB* blob = blob_it.data();
1734  int gap = blob->bounding_box().left();
1735  gap -= prev_blob->bounding_box().right();
1736  ++gap_count;
1737  prev_blob = blob;
1738  }
1739  // Just correct existing classification.
1740  CorrectRepcharChoices(best_choice, word_res);
1741  word_res->reject_map.initialise(word.length());
1742 }
1743 
1745  const UNICHARSET& char_set, const char *s, const char *lengths) {
1746  int i = 0;
1747  int offset = 0;
1748  int leading_punct_count;
1749  int upper_count = 0;
1750  int hyphen_pos = -1;
1752 
1753  if (strlen (lengths) > 20)
1754  return word_type;
1755 
1756  /* Single Leading punctuation char*/
1757 
1758  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1759  offset += lengths[i++];
1760  leading_punct_count = i;
1761 
1762  /* Initial cap */
1763  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1764  offset += lengths[i++];
1765  upper_count++;
1766  }
1767  if (upper_count > 1) {
1768  word_type = AC_UPPER_CASE;
1769  } else {
1770  /* Lower case word, possibly with an initial cap */
1771  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1772  offset += lengths[i++];
1773  }
1774  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1775  goto not_a_word;
1776  /*
1777  Allow a single hyphen in a lower case word
1778  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1779  */
1780  if (lengths[i] == 1 && s[offset] == '-') {
1781  hyphen_pos = i;
1782  offset += lengths[i++];
1783  if (s[offset] != '\0') {
1784  while ((s[offset] != '\0') &&
1785  char_set.get_islower(s + offset, lengths[i])) {
1786  offset += lengths[i++];
1787  }
1788  if (i < hyphen_pos + 3)
1789  goto not_a_word;
1790  }
1791  } else {
1792  /* Allow "'s" in NON hyphenated lower case words */
1793  if (lengths[i] == 1 && (s[offset] == '\'') &&
1794  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1795  offset += lengths[i++];
1796  offset += lengths[i++];
1797  }
1798  }
1799  if (upper_count > 0)
1800  word_type = AC_INITIAL_CAP;
1801  else
1802  word_type = AC_LOWER_CASE;
1803  }
1804 
1805  /* Up to two different, constrained trailing punctuation chars */
1806  if (lengths[i] == 1 && s[offset] != '\0' &&
1807  STRING(chs_trailing_punct1).contains(s[offset]))
1808  offset += lengths[i++];
1809  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1810  s[offset - lengths[i - 1]] != s[offset] &&
1811  STRING(chs_trailing_punct2).contains (s[offset]))
1812  offset += lengths[i++];
1813 
1814  if (s[offset] != '\0')
1815  word_type = AC_UNACCEPTABLE;
1816 
1817  not_a_word:
1818 
1819  if (word_type == AC_UNACCEPTABLE) {
1820  /* Look for abbreviation string */
1821  i = 0;
1822  offset = 0;
1823  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1824  word_type = AC_UC_ABBREV;
1825  while (s[offset] != '\0' &&
1826  char_set.get_isupper(s + offset, lengths[i]) &&
1827  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1828  offset += lengths[i++];
1829  offset += lengths[i++];
1830  }
1831  }
1832  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1833  word_type = AC_LC_ABBREV;
1834  while (s[offset] != '\0' &&
1835  char_set.get_islower(s + offset, lengths[i]) &&
1836  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1837  offset += lengths[i++];
1838  offset += lengths[i++];
1839  }
1840  }
1841  if (s[offset] != '\0')
1842  word_type = AC_UNACCEPTABLE;
1843  }
1844 
1845  return word_type;
1846 }
1847 
1848 bool Tesseract::check_debug_pt(WERD_RES* word, int location) {
1849  bool show_map_detail = false;
1850  int16_t i;
1851 
1852  if (!test_pt)
1853  return false;
1854 
1855  tessedit_rejection_debug.set_value (false);
1856  debug_x_ht_level.set_value(0);
1857 
1858  if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
1859  if (location < 0)
1860  return true; // For breakpoint use
1861  tessedit_rejection_debug.set_value(true);
1862  debug_x_ht_level.set_value(2);
1863  tprintf ("\n\nTESTWD::");
1864  switch (location) {
1865  case 0:
1866  tprintf ("classify_word_pass1 start\n");
1867  word->word->print();
1868  break;
1869  case 10:
1870  tprintf ("make_reject_map: initial map");
1871  break;
1872  case 20:
1873  tprintf ("make_reject_map: after NN");
1874  break;
1875  case 30:
1876  tprintf ("classify_word_pass2 - START");
1877  break;
1878  case 40:
1879  tprintf ("classify_word_pass2 - Pre Xht");
1880  break;
1881  case 50:
1882  tprintf ("classify_word_pass2 - END");
1883  show_map_detail = true;
1884  break;
1885  case 60:
1886  tprintf ("fixspace");
1887  break;
1888  case 70:
1889  tprintf ("MM pass START");
1890  break;
1891  case 80:
1892  tprintf ("MM pass END");
1893  break;
1894  case 90:
1895  tprintf ("After Poor quality rejection");
1896  break;
1897  case 100:
1898  tprintf ("unrej_good_quality_words - START");
1899  break;
1900  case 110:
1901  tprintf ("unrej_good_quality_words - END");
1902  break;
1903  case 120:
1904  tprintf ("Write results pass");
1905  show_map_detail = true;
1906  break;
1907  }
1908  if (word->best_choice != nullptr) {
1909  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1910  word->reject_map.print(debug_fp);
1911  tprintf("\n");
1912  if (show_map_detail) {
1913  tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1914  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1915  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1916  word->reject_map[i].full_print(debug_fp);
1917  }
1918  }
1919  } else {
1920  tprintf("null best choice\n");
1921  }
1922  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1923  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1924  return true;
1925  } else {
1926  return false;
1927  }
1928 }
1929 
1935 static void find_modal_font( // good chars in word
1936  STATS* fonts, // font stats
1937  int16_t* font_out, // output font
1938  int8_t* font_count // output count
1939 ) {
1940  int16_t font; //font index
1941  int32_t count; //pile count
1942 
1943  if (fonts->get_total () > 0) {
1944  font = static_cast<int16_t>(fonts->mode ());
1945  *font_out = font;
1946  count = fonts->pile_count (font);
1947  *font_count = count < INT8_MAX ? count : INT8_MAX;
1948  fonts->add (font, -*font_count);
1949  }
1950  else {
1951  *font_out = -1;
1952  *font_count = 0;
1953  }
1954 }
1955 
// Body of Tesseract::set_word_fonts(WERD_RES *word). The signature line is
// not part of this listing; the name comes from the cross-reference index.
//
// Sums the per-blob font scores over the word's best choice, then records
// the two highest-scoring fonts in word->fontinfo / word->fontinfo2 and
// their vote counts in word->fontinfo_id_count / word->fontinfo_id2_count.
// Returns early, leaving the word untouched, when the word has no
// chopped_word or the font table is empty. With DISABLED_LEGACY_ENGINE
// defined only the two leading checks remain.
1962  // Don't try to set the word fonts for an lstm word, as the configs
1963  // will be meaningless.
1964  if (word->chopped_word == nullptr) return;
1965  ASSERT_HOST(word->best_choice != nullptr);
1966 
1967 #ifndef DISABLED_LEGACY_ENGINE
1968  const int fontinfo_size = get_fontinfo_table().size();
1969  if (fontinfo_size == 0) return;
 // One accumulator per font table entry, all starting at zero.
1970  GenericVector<int> font_total_score;
1971  font_total_score.init_to_size(fontinfo_size, 0);
1972 
1973  // Compute the font scores for the word
1974  if (tessedit_debug_fonts) {
1975  tprintf("Examining fonts in %s\n",
1976  word->best_choice->debug_string().c_str());
1977  }
1978  for (int b = 0; b < word->best_choice->length(); ++b) {
1979  const BLOB_CHOICE* choice = word->GetBlobChoice(b);
 // Positions without a blob choice contribute nothing.
1980  if (choice == nullptr) continue;
1981  const GenericVector<ScoredFont>& fonts = choice->fonts();
1982  for (int f = 0; f < fonts.size(); ++f) {
1983  const int fontinfo_id = fonts[f].fontinfo_id;
 // Ids outside the font table are silently ignored.
1984  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1985  font_total_score[fontinfo_id] += fonts[f].score;
1986  }
1987  }
1988  }
1989  // Find the top and 2nd choice for the word.
 // Comparisons are strict, so among equal totals the lower font index keeps
 // the better slot; a font with zero total score is never selected.
1990  int score1 = 0, score2 = 0;
1991  int16_t font_id1 = -1, font_id2 = -1;
1992  for (int f = 0; f < fontinfo_size; ++f) {
1993  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1994  tprintf("Font %s, total score = %d\n",
1995  fontinfo_table_.get(f).name, font_total_score[f]);
1996  }
1997  if (font_total_score[f] > score1) {
 // New best: the previous best is demoted to second place.
1998  score2 = score1;
1999  font_id2 = font_id1;
2000  score1 = font_total_score[f];
2001  font_id1 = f;
2002  } else if (font_total_score[f] > score2) {
2003  score2 = font_total_score[f];
2004  font_id2 = f;
2005  }
2006  }
 // An id of -1 means no font scored above zero; the pointer is then null.
2007  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2008  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2009  // Each score has a limit of UINT16_MAX, so divide by that to get the number
2010  // of "votes" for that font, ie number of perfect scores.
 // NOTE(review): assuming ClipToRange(value, low, high), the first count has
 // a floor of 1 even when score1 is 0 and word->fontinfo is null, while the
 // second count may be 0 — confirm this asymmetry is intended.
2011  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2012  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
 // The remainder only produces debug output; it does not modify the word.
2013  if (score1 > 0) {
2014  const FontInfo fi = fontinfo_table_.get(font_id1);
2015  if (tessedit_debug_fonts) {
2016  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2017  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2018  fi.name, word->fontinfo_id_count,
2019  fontinfo_table_.get(font_id2).name,
2020  word->fontinfo_id2_count);
2021  } else {
2022  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2023  fi.name, word->fontinfo_id_count);
2024  }
2025  }
2026  }
2027 #endif // ndef DISABLED_LEGACY_ENGINE
2028 }
2029 
2030 #ifndef DISABLED_LEGACY_ENGINE
2031 
// Function body whose signature line is not part of this listing —
// presumably Tesseract::font_recognition_pass(PAGE_RES* page_res); confirm
// against the original source.
//
// Works in three sweeps over every word of page_res:
//   1. histogram the fonts already assigned to words (first and second
//      choice), weighted by their vote counts;
//   2. locate a FontInfo whose universal_id equals the modal font of that
//      histogram;
//   3. overwrite the font of every word whose own font vote count is too
//      low relative to its length.
// Returns without changing anything when the histogram yields no modal font.
2037  PAGE_RES_IT page_res_it(page_res);
2038  WERD_RES *word; // current word
2039  STATS doc_fonts(0, font_table_size_); // font counters
2040 
2041  // Gather font id statistics.
2042  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2043  page_res_it.forward()) {
2044  word = page_res_it.word();
 // Fonts are keyed by universal_id and weighted by the word's vote counts.
2045  if (word->fontinfo != nullptr) {
2046  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2047  }
2048  if (word->fontinfo2 != nullptr) {
2049  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2050  }
2051  }
2052  int16_t doc_font; // modal font
2053  int8_t doc_font_count; // modal font
 // find_modal_font reports a count of 0 when the histogram total is not
 // positive; there is then nothing to propagate.
2054  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2055  if (doc_font_count == 0)
2056  return;
2057  // Get the modal font pointer.
 // The histogram only yields an id, so search the words again for a
 // FontInfo carrying that universal_id; the first match is used.
2058  const FontInfo* modal_font = nullptr;
2059  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2060  page_res_it.forward()) {
2061  word = page_res_it.word();
2062  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2063  modal_font = word->fontinfo;
2064  break;
2065  }
2066  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2067  modal_font = word->fontinfo2;
2068  break;
2069  }
2070  }
 // The modal id came from these same words, so a match is expected.
2071  ASSERT_HOST(modal_font != nullptr);
2072 
2073  // Assign modal font to weak words.
2074  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2075  page_res_it.forward()) {
2076  word = page_res_it.word();
 // NOTE(review): best_choice is dereferenced without a null check here —
 // confirm every word reaching this pass has a best choice.
2077  const int length = word->best_choice->length();
2078 
2079  const int count = word->fontinfo_id_count;
 // A word keeps its own font only if its vote count equals its length, or,
 // for words longer than 3, reaches length * 3 / 4 (integer arithmetic).
2080  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2081  word->fontinfo = modal_font;
2082  // Counts only get 1 as it came from the doc.
2083  word->fontinfo_id_count = 1;
2084  }
2085  }
2086 }
2087 #endif // ndef DISABLED_LEGACY_ENGINE
2088 
2089 // If a word has multiple alternates check if the best choice is in the
2090 // dictionary. If not, replace it with an alternate that exists in the
2091 // dictionary.
// The signature line is not part of this listing — presumably
// Tesseract::dictionary_correction_pass(PAGE_RES* page_res); confirm against
// the original source. Each word is checked against the dictionary of its own
// word->tesseract instance, and at most one replacement is made per word.
2093  PAGE_RES_IT word_it(page_res);
2094  for (WERD_RES* word = word_it.word(); word != nullptr;
2095  word = word_it.forward()) {
2096  if (word->best_choices.singleton())
2097  continue; // There are no alternates.
2098 
 // NOTE(review): best is dereferenced below without a null check — confirm
 // best_choice is always set whenever best_choices is not a singleton.
2099  const WERD_CHOICE* best = word->best_choice;
2100  if (word->tesseract->getDict().valid_word(*best) != 0)
2101  continue; // The best choice is in the dictionary.
2102 
 // Walk the whole choice list and stop at the first entry the dictionary
 // accepts.
2103  WERD_CHOICE_IT choice_it(&word->best_choices);
2104  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2105  choice_it.forward()) {
2106  WERD_CHOICE* alternate = choice_it.data();
2107  if (word->tesseract->getDict().valid_word(*alternate)) {
2108  // The alternate choice is in the dictionary.
 // The message is printed before the replacement, while best still refers
 // to the old best choice.
2109  if (tessedit_bigram_debug) {
2110  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2111  best->unichar_string().c_str(),
2112  alternate->unichar_string().c_str());
2113  }
2114  // Replace the 'best' choice with a better choice.
2115  word->ReplaceBestChoice(alternate);
2116  break;
2117  }
2118  }
2119  }
2120 }
2121 
2122 } // namespace tesseract
tesseract::Tesseract::chs_leading_punct
char * chs_leading_punct
Definition: tesseractclass.h:873
WERD_RES::done
bool done
Definition: pageres.h:299
REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:332
tesseract::Tesseract::ProcessTargetWord
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
STATS::get_total
int32_t get_total() const
Definition: statistc.h:83
tesseract::ParamUtils::ReadParamsFile
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:133
tesseract::Tesseract::poly_allow_detailed_fx
bool poly_allow_detailed_fx
Definition: tesseractclass.h:1061
C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:247
WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1042
tesseract::Tesseract::quality_min_initial_alphas_reqd
int quality_min_initial_alphas_reqd
Definition: tesseractclass.h:881
clear_fx_win
void clear_fx_win()
Definition: drawfx.cpp:60
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
tesseract::Tesseract::output_pass
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:35
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
AC_LC_ABBREV
a.b.c.
Definition: control.h:33
WERD_CHOICE::shallow_copy
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:416
SUBLOC_NORM
#define SUBLOC_NORM
Definition: errcode.h:57
werdit.h
tesseract::Tesseract::recog_pseudo_word
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
pageres.h
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::TesseractStats::doc_blob_quality
int16_t doc_blob_quality
Definition: tesseractclass.h:128
tesseract::Tesseract::quality_char_pc
double quality_char_pc
Definition: tesseractclass.h:880
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
tessvars.h
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
tesseract::Textord::CleanupSingleRowResult
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:318
C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:69
tesseract::Tesseract::set_word_fonts
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1961
WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:920
ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:27
tesseract::Tesseract::tessedit_display_outwords
bool tessedit_display_outwords
Definition: tesseractclass.h:836
tesseract::Tesseract::textord_use_cjk_fp_model
bool textord_use_cjk_fp_model
Definition: tesseractclass.h:1059
tesseract::Tesseract::chs_trailing_punct1
char * chs_trailing_punct1
Definition: tesseractclass.h:874
W_REP_CHAR
repeated character
Definition: werd.h:52
tesseract::Tesseract::ComputeCompatibleXheight
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:117
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
tesseract::OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:268
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
tesseract::CCStruct::kXHeightCapRatio
static const double kXHeightCapRatio
Definition: ccstruct.h:37
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Classify::fontinfo_table_
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
fx_win
ScrollView * fx_win
Definition: drawfx.cpp:40
tesseract::Tesseract::tessedit_debug_quality_metrics
bool tessedit_debug_quality_metrics
Definition: tesseractclass.h:927
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Tesseract::CountMisfitTops
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:85
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
control.h
tesseract::Tesseract::tessedit_fix_fuzzy_spaces
bool tessedit_fix_fuzzy_spaces
Definition: tesseractclass.h:839
WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:755
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1010
ETEXT_DESC::progress_callback2
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:108
FCOORD::y
float y() const
Definition: points.h:209
FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:182
tesseract::Tesseract
Definition: tesseractclass.h:172
WERD::AddSelectedOutlines
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:523
tesseract::Tesseract::RetryWithLanguage
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:903
kBackUpConfigFile
const char *const kBackUpConfigFile
Definition: control.cpp:48
TBOX::print
void print() const
Definition: rect.h:277
tesseract::Tesseract::ReportXhtFixResult
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1461
tesseract::Classify::AdaptableWord
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
create_fx_win
void create_fx_win()
Definition: drawfx.cpp:48
PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:695
PAGE_RES::misadaption_log
GenericVector< STRING > misadaption_log
Definition: pageres.h:89
tesseract::PointerVector< WERD_RES >
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
PAGE_RES::blame_reasons
GenericVector< int > blame_reasons
Definition: pageres.h:84
tesseract::Tesseract::recog_interactive
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
BLOCK::x_height
int32_t x_height() const
return xheight
Definition: ocrblock.h:105
STRING
Definition: strngs.h:45
tesseract::TesseractStats::doc_outline_errs
int16_t doc_outline_errs
Definition: tesseractclass.h:129
WERD_RES::x_height
float x_height
Definition: pageres.h:310
BlamerBundle::misadaption_debug
const STRING & misadaption_debug() const
Definition: blamer.h:134
STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
tesseract::Tesseract::noise_maxperword
int noise_maxperword
Definition: tesseractclass.h:871
drawfx.h
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476
WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:303
WERD_RES
Definition: pageres.h:160
tesseract::PointerVector::truncate
void truncate(int size)
Definition: genericvector.h:457
tesseract::ParamUtils::PrintParams
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:168
tesseract::OEM_LSTM_ONLY
Definition: publictypes.h:267
tesseract::Classify::classify_max_certainty_margin
double classify_max_certainty_margin
Definition: classify.h:440
tesseract::Tesseract::classify_word_pass2
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1571
WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:761
tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
tesseract::TesseractStats::good_char_count
int16_t good_char_count
Definition: tesseractclass.h:131
LOC_FUZZY_SPACE
#define LOC_FUZZY_SPACE
Definition: errcode.h:48
C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
tesseract::Tesseract::quality_rej_pc
double quality_rej_pc
Definition: tesseractclass.h:876
tesseract::Tesseract::tessedit_enable_dict_correction
bool tessedit_enable_dict_correction
Definition: tesseractclass.h:850
tesseract::Tesseract::debug_noise_removal
int debug_noise_removal
Definition: tesseractclass.h:857
tesseract::Tesseract::tessedit_pageseg_mode
int tessedit_pageseg_mode
Definition: tesseractclass.h:799
tesseract::Tesseract::tess_add_doc_word
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:71
tesseract::Tesseract::LSTMRecognizeWord
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:228
EqualIgnoringCaseAndTerminalPunct
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:807
AC_UC_ABBREV
A.B.C.
Definition: control.h:34
WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:746
ETEXT_DESC
Definition: ocrclass.h:95
tesseract::Tesseract::noise_maxperblob
int noise_maxperblob
Definition: tesseractclass.h:870
ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:755
FCOORD
Definition: points.h:187
tesseract::LSTMRecognizer::GetUnicharset
const UNICHARSET & GetUnicharset() const
Definition: lstmrecognizer.h:132
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1744
tesseract::Tesseract::tessedit_bigram_debug
int tessedit_bigram_debug
Definition: tesseractclass.h:853
tesseract::TesseractStats::word_count
int32_t word_count
Definition: tesseractclass.h:133
tesseract::Tesseract::rejection_passes
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:612
tesseract::FontInfo::universal_id
int32_t universal_id
Definition: fontinfo.h:123
tesseract::Tesseract::script_pos_pass
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:733
tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
tesseract::WordData
Definition: tesseractclass.h:144
C_BLOB
Definition: stepblob.h:36
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
tesseract::SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:51
ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:103
GenericVector::back
T & back() const
Definition: genericvector.h:728
C_OUTLINE
Definition: coutln.h:71
WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:305
tesseract::WordData::row
ROW * row
Definition: tesseractclass.h:156
tesseract::Tesseract::test_pt_x
double test_pt_x
Definition: tesseractclass.h:888
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
tesseract::Tesseract::chs_trailing_punct2
char * chs_trailing_punct2
Definition: tesseractclass.h:875
tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:231
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1209
tesseract::WordData::block
BLOCK * block
Definition: tesseractclass.h:157
BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:204
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
set_global_subloc_code
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:30
WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:273
tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
tesseract::WordData::word
WERD_RES * word
Definition: tesseractclass.h:155
tesseract::Classify::get_fontinfo_table
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1469
tesseract::Tesseract::debug_x_ht_level
int debug_x_ht_level
Definition: tesseractclass.h:872
BlamerBundle::IncorrectReasonName
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:62
tesseract::Tesseract::TestNewNormalization
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1518
WERD::print
void print()
Definition: werd.cpp:252
kMinRefitXHeightFraction
const double kMinRefitXHeightFraction
Definition: control.cpp:51
tesseract::Tesseract::TrainedXheightFix
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1484
tesseract::Tesseract::tessedit_tess_adaption_mode
int tessedit_tess_adaption_mode
Definition: tesseractclass.h:883
REJMAP::rej_word_bad_quality
void rej_word_bad_quality()
Definition: rejctmap.cpp:414
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
C_BLOB::deep_copy
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
SortHelper::Add
void Add(T value, int count)
Definition: sorthelper.h:65
ETEXT_DESC::cancel_this
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:109
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
WERD_CHOICE::IsAllSpaces
bool IsAllSpaces() const
Definition: ratngs.h:509
tesseract::Tesseract::tessedit_enable_doc_dict
bool tessedit_enable_doc_dict
Definition: tesseractclass.h:844
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:713
LOC_DOC_BLK_REJ
#define LOC_DOC_BLK_REJ
Definition: errcode.h:51
PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:76
tesseract::Tesseract::SubAndSuperscriptFix
bool SubAndSuperscriptFix(WERD_RES *word_res)
Definition: superscript.cpp:100
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
ETEXT_DESC::progress
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:98
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
tesseract::Tesseract::tessedit_debug_fonts
bool tessedit_debug_fonts
Definition: tesseractclass.h:845
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:298
tesseract::Tesseract::test_pt_y
double test_pt_y
Definition: tesseractclass.h:889
TBOX::major_x_overlap
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:403
tesseract::Tesseract::tessedit_test_adaption
bool tessedit_test_adaption
Definition: tesseractclass.h:886
tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1026
tesseract::Tesseract::tessedit_reject_bad_qual_wds
bool tessedit_reject_bad_qual_wds
Definition: tesseractclass.h:924
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
tesseract::Tesseract::noise_cert_disjoint
double noise_cert_disjoint
Definition: tesseractclass.h:863
WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:304
AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
tesseract::Tesseract::PrerecAllWordsPar
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:38
tesseract::Tesseract::word_adaptable
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:50
WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:312
tesseract::Tesseract::tessedit_write_unlv
bool tessedit_write_unlv
Definition: tesseractclass.h:997
tesseract::Classify::StartBackupAdaptiveClassifier
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
set_global_loc_code
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
W_EOL
end of line
Definition: werd.h:47
tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:95
tesseract::Wordrec::wordrec_run_blamer
bool wordrec_run_blamer
Definition: wordrec.h:232
tesseract::Tesseract::quality_blob_pc
double quality_blob_pc
Definition: tesseractclass.h:877
PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1436
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::Classify::classify_max_rating_ratio
double classify_max_rating_ratio
Definition: classify.h:438
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:527
tesseract::Tesseract::noise_cert_basechar
double noise_cert_basechar
Definition: tesseractclass.h:860
UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:894
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:129
tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:564
PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:77
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::RecogAllWordsPassN
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:213
tesseract::Tesseract::AnyTessLang
bool AnyTessLang() const
Definition: tesseractclass.h:283
tesseract::Tesseract::tessedit_minimal_rej_pass1
bool tessedit_minimal_rej_pass1
Definition: tesseractclass.h:885
WERD_CHOICE::GetNonSuperscriptSpan
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:399
tesseract::Tesseract::tess_acceptable_word
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:61
tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:55
lstmrecognizer.h
BLOB_CHOICE::fonts
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
make_pseudo_word
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:33
REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:228
tesseract::Tesseract::bigram_correction_pass
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:467
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract::Tesseract::classify_word_pass1
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1400
tesseract
Definition: baseapi.h:65
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:134
tesseract::WordData::lang_words
PointerVector< WERD_RES > lang_words
Definition: tesseractclass.h:159
tesseract::Tesseract::AnyLSTMLang
bool AnyLSTMLang() const
Definition: tesseractclass.h:293
tesseract::Tesseract::tessedit_parallelize
int tessedit_parallelize
Definition: tesseractclass.h:1074
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:51
tesseract::Tesseract::ClassifyBlobAsWord
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1269
PAGE_RES
Definition: pageres.h:73
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:854
STATS
Definition: statistc.h:30
tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1318
callcpp.h
tesseract::Tesseract::AssignDiacriticsToNewBlobs
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1063
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::FontInfo
Definition: fontinfo.h:62
fixspace.h
AC_LOWER_CASE
ALL lower case.
Definition: control.h:30
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::Tesseract::noise_cert_factor
double noise_cert_factor
Definition: tesseractclass.h:869
WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:448
GenericVector
Definition: baseapi.h:40
tesseract::FontInfo::name
char * name
Definition: fontinfo.h:117
PAGE_RES_IT
Definition: pageres.h:668
WERD_RES::caps_height
float caps_height
Definition: pageres.h:311
STATS::mode
int32_t mode() const
Definition: statistc.cpp:100
IRR_NUM_REASONS
Definition: blamer.h:99
WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:298
WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1013
reject.h
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
TBOX::x_overlap
bool x_overlap(const TBOX &box) const
Definition: rect.h:393
tesseract::Tesseract::tessedit_dump_choices
bool tessedit_dump_choices
Definition: tesseractclass.h:837
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
BlamerBundle::SetMisAdaptionDebug
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:585
tesseract::Tesseract::tessedit_timing_debug
bool tessedit_timing_debug
Definition: tesseractclass.h:838
tesseract::Wordrec::wordrec_debug_blamer
bool wordrec_debug_blamer
Definition: wordrec.h:231
STRING::length
int32_t length() const
Definition: strngs.cpp:187
STRING::contains
bool contains(char c) const
Definition: strngs.cpp:183
WERD::ConstructFromSingleBlob
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:124
tesseract::Tesseract::tessedit_enable_bigram_correction
bool tessedit_enable_bigram_correction
Definition: tesseractclass.h:848
tesseract::Classify::AdaptiveClassifierIsEmpty
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
output.h
count
int count(LIST var_list)
Definition: oldlist.cpp:79
LOC_WRITE_RESULTS
#define LOC_WRITE_RESULTS
Definition: errcode.h:52
ocrclass.h
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
tesseract::Tesseract::match_word_pass_n
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1629
tesseract::Tesseract::multilang_debug_level
int multilang_debug_level
Definition: tesseractclass.h:890
WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:274
TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:362
WERD
Definition: werd.h:55
tesseract::Tesseract::font_recognition_pass
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2036
BLOCK_RES::block
BLOCK * block
Definition: pageres.h:113
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::TesseractStats::doc_good_char_quality
int16_t doc_good_char_quality
Definition: tesseractclass.h:132
ROW
Definition: ocrrow.h:35
tesseract::Tesseract::blamer_pass
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:709
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
ETEXT_DESC::cancel
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:105
debug_fp
FILE * debug_fp
Definition: tessvars.cpp:23
tesseract::Tesseract::dictionary_correction_pass
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2092
WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:306
TBOX::right
int16_t right() const
Definition: rect.h:78
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1329
tesseract::Tesseract::SetupAllWordsPassN
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:154
tesseract::Tesseract::SelectGoodDiacriticOutlines
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1139
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:73
tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
tesseract::TesseractStats::doc_char_quality
int16_t doc_char_quality
Definition: tesseractclass.h:130
sorthelper.h
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
BLOCK::classify_rotation
FCOORD classify_rotation() const
Definition: ocrblock.h:139
tesseract::Tesseract::tessedit_fix_hyphens
bool tessedit_fix_hyphens
Definition: tesseractclass.h:842
POLY_BLOCK
Definition: polyblk.h:26
tesseract::Tesseract::recog_all_words
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:302
tesseract::Tesseract::tessedit_word_for_word
bool tessedit_word_for_word
Definition: tesseractclass.h:1022
tesseract::Tesseract::fix_rep_char
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1705
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
TWERD::plot
void plot(ScrollView *window)
Definition: blobs.cpp:895
WERD_RES::word
WERD * word
Definition: pageres.h:180
TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:859
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
tesseract::Tesseract::tess_segment_pass_n
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:31
BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:121
REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:320
WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:791
LOC_MM_ADAPT
#define LOC_MM_ADAPT
Definition: errcode.h:50
WERD::GetNoiseOutlines
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:505
tesseract::Classify::classify_bln_numeric_mode
bool classify_bln_numeric_mode
Definition: classify.h:508
AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
tesseract::Tesseract::tessedit_minimal_rejection
bool tessedit_minimal_rejection
Definition: tesseractclass.h:1019
BlamerBundle
Definition: blamer.h:103
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::OEM_TESSERACT_ONLY
Definition: publictypes.h:266
tesseract::Tesseract::tessedit_ocr_engine_mode
int tessedit_ocr_engine_mode
Definition: tesseractclass.h:802
tesseract::Tesseract::noise_cert_punc
double noise_cert_punc
Definition: tesseractclass.h:866
WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:307
tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:887
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Tesseract::ClassifyBlobPlusOutlines
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1225
WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:89
tesseract::Classify::SwitchAdaptiveClassifier
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
tesseract::WordRecognizer
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
Definition: tesseractclass.h:170
tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:430
tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1518
WERD_RES::small_caps
bool small_caps
Definition: pageres.h:300
tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:64
W_BOL
start of line
Definition: werd.h:46
tesseract::Tesseract::ReassignDiacritics
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:944
tesseract::WordData::prev_word
WordData * prev_word
Definition: tesseractclass.h:158
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:76
tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1848
tesseract::Tesseract::right_to_left
bool right_to_left() const
Definition: tesseractclass.h:273
tesseract::Tesseract::quality_outline_pc
double quality_outline_pc
Definition: tesseractclass.h:879
TBOX
Definition: rect.h:33
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327
BlamerBundle::LastChanceBlame
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:558
tesseract::Classify::AdaptiveClassifierIsFull
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
SortHelper
Definition: sorthelper.h:36
docqual.h