tesseract  4.0.0-1-g2a2b
control.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: control.cpp (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 11:09:58 BST 1992
6  * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <cmath>
27 #include <cstdint> // for int16_t, int32_t
28 #include <cstdio> // for fclose, fopen, FILE
29 #include <ctime> // for clock
30 #include <cctype>
31 #include "callcpp.h"
32 #include "control.h"
33 #ifndef DISABLED_LEGACY_ENGINE
34 #include "docqual.h"
35 #include "drawfx.h"
36 #include "fixspace.h"
37 #endif
38 #include "globals.h"
39 #include "lstmrecognizer.h"
40 #include "ocrclass.h"
41 #include "output.h"
42 #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
43 #include "pgedit.h"
44 #include "reject.h"
45 #include "sorthelper.h"
46 #include "tesseractclass.h"
47 #include "tessvars.h"
48 #include "werdit.h"
49 
50 #define MIN_FONT_ROW_COUNT 8
51 #define MAX_XHEIGHT_DIFF 3
52 
53 const char* const kBackUpConfigFile = "tempconfigdata.config";
54 // Min believable x-height for any text when refitting as a fraction of
55 // original x-height
56 const double kMinRefitXHeightFraction = 0.5;
57 
58 
65 namespace tesseract {
66 
68  TBOX &selection_box) {
69  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
70  if (it != nullptr) {
72  it->DeleteCurrentWord();
73  delete it;
74  }
75 }
76 
83  int16_t char_qual;
84  int16_t good_char_qual;
85 
86  WordData word_data(*pr_it);
87  SetupWordPassN(2, &word_data);
88  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
89  if (lstm_recognizer_ == nullptr) {
90 #ifndef DISABLED_LEGACY_ENGINE
91  classify_word_and_language(2, pr_it, &word_data);
92 #endif // ndef DISABLED_LEGACY_ENGINE
93  } else {
94  classify_word_and_language(1, pr_it, &word_data);
95  }
96 #ifndef DISABLED_LEGACY_ENGINE
98  WERD_RES* word_res = pr_it->word();
99  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
100  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
101  "char_quality: %d; good_char_quality: %d\n",
102  word_res->reject_map.length(),
103  word_blob_quality(word_res, pr_it->row()->row),
104  word_outline_errs(word_res), char_qual, good_char_qual);
105  }
106 #endif // ndef DISABLED_LEGACY_ENGINE
107  return true;
108 }
109 
110 // Helper function to check for a target word and handle it appropriately.
111 // Inspired by Jetsoft's requirement to process only single words on pass2
112 // and beyond.
113 // If word_config is not null:
114 // If the word_box and target_word_box overlap, read the word_config file
115 // else reset to previous config data.
116 // return true.
117 // else
118 // If the word_box and target_word_box overlap or pass <= 1, return true.
119 // Note that this function uses a fixed temporary file for storing the previous
120 // configs, so it is neither thread-safe, nor process-safe, but the assumption
121 // is that it will only be used for one debug window at a time.
122 //
123 // Since this function is used for debugging (and not to change OCR results)
124 // set only debug params from the word config file.
125 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
126  const TBOX& target_word_box,
127  const char* word_config,
128  int pass) {
129  if (word_config != nullptr) {
130  if (word_box.major_overlap(target_word_box)) {
131  if (backup_config_file_ == nullptr) {  // No backup taken yet for this target.
132  backup_config_file_ = kBackUpConfigFile;
133  FILE* config_fp = fopen(backup_config_file_, "wb");  // Snapshot current params to the backup file.
134  if (config_fp == nullptr) {
135  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);  // Only reported; word_config is still read below.
136  } else {
137  ParamUtils::PrintParams(config_fp, params());
138  fclose(config_fp);
139  }
140  ParamUtils::ReadParamsFile(word_config,  // NOTE(review): listing line 141 is absent from this view; an argument may be missing here - confirm against upstream.
142  params());
143  }
144  } else {
145  if (backup_config_file_ != nullptr) {  // Not on the target word and a backup exists: restore it.
146  ParamUtils::ReadParamsFile(backup_config_file_,  // NOTE(review): listing line 147 is absent from this view; an argument may be missing here - confirm against upstream.
148  params());
149  backup_config_file_ = nullptr;  // Marks the backup as consumed so the next overlap takes a fresh one.
150  }
151  }
152  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {  // No word_config: after pass 1 only the target word is accepted.
153  return false;
154  }
155  return true;
156 }
157 
160  const TBOX* target_word_box,
161  const char* word_config,
162  PAGE_RES* page_res,
163  GenericVector<WordData>* words) {
164  // Prepare all the words.
165  PAGE_RES_IT page_res_it(page_res);
166  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
167  page_res_it.forward()) {
168  if (target_word_box == nullptr ||
169  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
170  *target_word_box, word_config, 1)) {
171  words->push_back(WordData(page_res_it));
172  }
173  }
174  // Setup all the words for recognition with polygonal approximation.
175  for (int w = 0; w < words->size(); ++w) {
176  SetupWordPassN(pass_n, &(*words)[w]);
177  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
178  }
179 }
180 
181 // Sets up the single word ready for whichever engine is to be run.
// pass_n: recognition pass number; only the values 1 and 2 get pass-specific
// handling on word->word.
// word: the WordData whose word and lang_words entries are (re)initialised.
// Nothing is done when pass_n != 1 and word->word->done is already set.
182 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
183  if (pass_n == 1 || !word->word->done) {
184  if (pass_n == 1) {
185  word->word->SetupForRecognition(unicharset, this, BestPix(),
186  tessedit_ocr_engine_mode, nullptr,  // NOTE(review): listing lines 187-189 are absent from this view; arguments may be missing here - confirm against upstream.
190  word->row, word->block);
191  } else if (pass_n == 2) {
192  // TODO(rays) Should we do this on pass1 too?
193  word->word->caps_height = 0.0;
194  if (word->word->x_height == 0.0f)
195  word->word->x_height = word->row->x_height();  // Fall back to the row x-height when the word has none.
196  }
197  word->lang_words.truncate(0);  // Presumably empties any results left from an earlier setup - confirm truncate() semantics.
198  for (int s = 0; s <= sub_langs_.size(); ++s) {  // Note <=: one extra iteration for this instance.
199  // The sub_langs_.size() entry is for the master language.
200  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
201  WERD_RES* word_res = new WERD_RES;
202  word_res->InitForRetryRecognition(*word->word);
203  word->lang_words.push_back(word_res);  // The new WERD_RES is stored in lang_words in every case.
204  // LSTM doesn't get setup for pass2.
205  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
206  word_res->SetupForRecognition(
207  lang_t->unicharset, lang_t, BestPix(),
208  lang_t->tessedit_ocr_engine_mode, nullptr,  // NOTE(review): listing line 209 is absent from this view; an argument may be missing here - confirm against upstream.
210  lang_t->textord_use_cjk_fp_model,
211  lang_t->poly_allow_detailed_fx, word->row, word->block);
212  }
213  }
214  }
215 }
216 
217 // Runs word recognition on all the words.
// pass_n: pass number forwarded to the per-word recognition calls.
// monitor: optional (may be nullptr) progress/cancel descriptor.
// pr_it: page iterator; restarted here and advanced to follow each WordData.
// words: the prepared words, processed in index order.
// Returns false if the monitor's deadline or cancel callback stopped the run
// (the remaining words get SetupFake), otherwise true.
218 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
219  PAGE_RES_IT* pr_it,
220  GenericVector<WordData>* words) {
221  // TODO(rays) Before this loop can be parallelized (it would yield a massive
222  // speed-up) all remaining member globals need to be converted to local/heap
223  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
224  // added. The results will be significantly different with adaption on, and
225  // deterioration will need investigation.
226  pr_it->restart_page();
227  for (int w = 0; w < words->size(); ++w) {
228  WordData* word = &(*words)[w];
229  if (w > 0) word->prev_word = &(*words)[w - 1];
230  if (monitor != nullptr) {
231  monitor->ocr_alive = TRUE;
232  if (pass_n == 1) {
233  monitor->progress = 70 * w / words->size();  // Pass 1 progress is scaled into [0, 70).
234  if (monitor->progress_callback2 != nullptr) {
235  TBOX box = pr_it->word()->word->bounding_box();  // pr_it has not been synced to `word` yet; that happens further down.
236  (*monitor->progress_callback2)(monitor, box.left(),
237  box.right(), box.top(), box.bottom());
238  }
239  } else {
240  monitor->progress = 70 + 30 * w / words->size();  // Other passes are scaled into [70, 100).
241  if (monitor->progress_callback2 != nullptr) {
242  (*monitor->progress_callback2)(monitor, 0, 0, 0, 0);
243  }
244  }
245  if (monitor->deadline_exceeded() ||
246  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
247  words->size()))) {
248  // Timeout. Fake out the rest of the words.
249  for (; w < words->size(); ++w) {  // Continues from the current w, so the current word is faked too.
250  (*words)[w].word->SetupFake(unicharset);
251  }
252  return false;
253  }
254  }
255  if (word->word->tess_failed) {
256  int s;
257  for (s = 0; s < word->lang_words.size() &&
258  word->lang_words[s]->tess_failed; ++s) {}
259  // If all are failed, skip it. Image words are skipped by this test.
260  if (s > word->lang_words.size()) continue;  // NOTE(review): the loop above exits with s <= lang_words.size(), so this `>` test can never be true; `==` looks intended - confirm.
261  }
262  // Sync pr_it with the wth WordData.
263  while (pr_it->word() != nullptr && pr_it->word() != word->word)
264  pr_it->forward();
265  ASSERT_HOST(pr_it->word() != nullptr);
266  bool make_next_word_fuzzy = false;
267  if (!AnyLSTMLang() &&
268  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
269  // Needs to be setup again to see the new outlines in the chopped_word.
270  SetupWordPassN(pass_n, word);
271  }
272 
273  classify_word_and_language(pass_n, pr_it, word);
275  tprintf("Pass%d: %s [%s]\n", pass_n,  // NOTE(review): listing lines 274 and 276 are absent from this view (the opening of the block closed below and one tprintf argument) - confirm against upstream.
277  word->word->best_choice->debug_string().string());
278  }
279  pr_it->forward();
280  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
281  pr_it->MakeCurrentWordFuzzy();  // Applied to the word that follows the one just recognized.
282  }
283  }
284  return true;
285 }
286 
309  ETEXT_DESC* monitor,
310  const TBOX* target_word_box,
311  const char* word_config,
312  int dopasses) {
313  PAGE_RES_IT page_res_it(page_res);
314 
316  tessedit_test_adaption.set_value (TRUE);
317  tessedit_minimal_rejection.set_value (TRUE);
318  }
319 
320  if (dopasses==0 || dopasses==1) {
321  page_res_it.restart_page();
322  // ****************** Pass 1 *******************
323 
324  #ifndef DISABLED_LEGACY_ENGINE
325  // If the adaptive classifier is full switch to one we prepared earlier,
326  // ie on the previous page. If the current adaptive classifier is non-empty,
327  // prepare a backup starting at this page, in case it fills up. Do all this
328  // independently for each language.
329  if (AdaptiveClassifierIsFull()) {
331  } else if (!AdaptiveClassifierIsEmpty()) {
333  }
334  // Now check the sub-langs as well.
335  for (int i = 0; i < sub_langs_.size(); ++i) {
336  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
337  sub_langs_[i]->SwitchAdaptiveClassifier();
338  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
339  sub_langs_[i]->StartBackupAdaptiveClassifier();
340  }
341  }
342 
343  #endif // ndef DISABLED_LEGACY_ENGINE
344 
345  // Set up all words ready for recognition, so that if parallelism is on
346  // all the input and output classes are ready to run the classifier.
348  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
349  #ifndef DISABLED_LEGACY_ENGINE
350  if (tessedit_parallelize) {
351  PrerecAllWordsPar(words);
352  }
353  #endif // ndef DISABLED_LEGACY_ENGINE
354 
355  stats_.word_count = words.size();
356 
357  stats_.dict_words = 0;
358  stats_.doc_blob_quality = 0;
359  stats_.doc_outline_errs = 0;
360  stats_.doc_char_quality = 0;
361  stats_.good_char_count = 0;
362  stats_.doc_good_char_quality = 0;
363 
364  most_recently_used_ = this;
365  // Run pass 1 word recognition.
366  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
367  // Pass 1 post-processing.
368  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
369  page_res_it.forward()) {
370  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
371  fix_rep_char(&page_res_it);
372  continue;
373  }
374 
375  // Count dict words.
376  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
377  ++(stats_.dict_words);
378 
379  // Update misadaption log (we only need to do it on pass 1, since
380  // adaption only happens on this pass).
381  if (page_res_it.word()->blamer_bundle != nullptr &&
382  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
383  page_res->misadaption_log.push_back(
384  page_res_it.word()->blamer_bundle->misadaption_debug());
385  }
386  }
387  }
388 
389  if (dopasses == 1) return true;
390 
391  #ifndef DISABLED_LEGACY_ENGINE
392 
393  // ****************** Pass 2 *******************
395  AnyTessLang()) {
396  page_res_it.restart_page();
398  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
399  if (tessedit_parallelize) {
400  PrerecAllWordsPar(words);
401  }
402  most_recently_used_ = this;
403  // Run pass 2 word recognition.
404  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
405  }
406 
407  // The next passes are only required for Tess-only.
408  if (AnyTessLang() && !AnyLSTMLang()) {
409  // ****************** Pass 3 *******************
410  // Fix fuzzy spaces.
412 
415  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
416 
417  // ****************** Pass 4 *******************
420 
421  // ****************** Pass 5,6 *******************
422  rejection_passes(page_res, monitor, target_word_box, word_config);
423 
424  // ****************** Pass 8 *******************
425  font_recognition_pass(page_res);
426 
427  // ****************** Pass 9 *******************
428  // Check the correctness of the final results.
429  blamer_pass(page_res);
430  script_pos_pass(page_res);
431  }
432 
433  #endif // ndef DISABLED_LEGACY_ENGINE
434 
435  // Write results pass.
437  // This is now redundant, but retained commented to show how to obtain
438  // bounding boxes and style information.
439 
440  #ifndef DISABLED_LEGACY_ENGINE
441  // changed by jetsoft
442  // needed for dll to output memory structure
443  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
444  output_pass(page_res_it, target_word_box);
445  // end jetsoft
446  #endif //ndef DISABLED_LEGACY_ENGINE
447 
448  const PageSegMode pageseg_mode = static_cast<PageSegMode>(
449  static_cast<int>(tessedit_pageseg_mode));
450  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
451 
452  // Remove empty words, as these mess up the result iterators.
453  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
454  page_res_it.forward()) {
455  const WERD_RES* word = page_res_it.word();
456  const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
457  ? page_res_it.block()->block->pdblk.poly_block()
458  : nullptr;
459  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
460  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
461  page_res_it.DeleteCurrentWord();
462  }
463  }
464 
465  if (monitor != nullptr) {
466  monitor->progress = 100;
467  }
468  return true;
469 }
470 
471 #ifndef DISABLED_LEGACY_ENGINE
472 
474  PAGE_RES_IT word_it(page_res);
475 
476  WERD_RES *w_prev = nullptr;
477  WERD_RES *w = word_it.word();
478  while (true) {
479  w_prev = w;
480  while (word_it.forward() != nullptr &&
481  (!word_it.word() || word_it.word()->part_of_combo)) {
482  // advance word_it, skipping over parts of combos
483  }
484  if (!word_it.word()) break;
485  w = word_it.word();
486  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
487  continue;
488  }
489  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
490  if (tessedit_bigram_debug) {
491  tprintf("Skipping because one of the words is W_REP_CHAR\n");
492  }
493  continue;
494  }
495  // Two words sharing the same language model, excellent!
496  GenericVector<WERD_CHOICE *> overrides_word1;
497  GenericVector<WERD_CHOICE *> overrides_word2;
498 
499  const STRING orig_w1_str = w_prev->best_choice->unichar_string();
500  const STRING orig_w2_str = w->best_choice->unichar_string();
501  WERD_CHOICE prev_best(w->uch_set);
502  {
503  int w1start, w1end;
504  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
505  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
506  }
507  WERD_CHOICE this_best(w->uch_set);
508  {
509  int w2start, w2end;
510  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
511  this_best = w->best_choice->shallow_copy(w2start, w2end);
512  }
513 
514  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
515  if (tessedit_bigram_debug) {
516  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
517  orig_w1_str.string(), orig_w2_str.string());
518  }
519  continue;
520  }
521  if (tessedit_bigram_debug > 2) {
522  tprintf("Examining alt choices for \"%s %s\".\n",
523  orig_w1_str.string(), orig_w2_str.string());
524  }
525  if (tessedit_bigram_debug > 1) {
526  if (!w_prev->best_choices.singleton()) {
527  w_prev->PrintBestChoices();
528  }
529  if (!w->best_choices.singleton()) {
530  w->PrintBestChoices();
531  }
532  }
533  float best_rating = 0.0;
534  int best_idx = 0;
535  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
536  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
537  WERD_CHOICE *p1 = prev_it.data();
538  WERD_CHOICE strip1(w->uch_set);
539  {
540  int p1start, p1end;
541  p1->GetNonSuperscriptSpan(&p1start, &p1end);
542  strip1 = p1->shallow_copy(p1start, p1end);
543  }
544  WERD_CHOICE_IT w_it(&w->best_choices);
545  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
546  WERD_CHOICE *p2 = w_it.data();
547  WERD_CHOICE strip2(w->uch_set);
548  {
549  int p2start, p2end;
550  p2->GetNonSuperscriptSpan(&p2start, &p2end);
551  strip2 = p2->shallow_copy(p2start, p2end);
552  }
553  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
554  overrides_word1.push_back(p1);
555  overrides_word2.push_back(p2);
556  if (overrides_word1.size() == 1 ||
557  p1->rating() + p2->rating() < best_rating) {
558  best_rating = p1->rating() + p2->rating();
559  best_idx = overrides_word1.size() - 1;
560  }
561  }
562  }
563  }
564  if (!overrides_word1.empty()) {
565  // Excellent, we have some bigram matches.
567  *overrides_word1[best_idx]) &&
569  *overrides_word2[best_idx])) {
570  if (tessedit_bigram_debug > 1) {
571  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
572  "model.\n", orig_w1_str.string(), orig_w2_str.string());
573  }
574  continue;
575  }
576  const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
577  const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
578  if (new_w1_str != orig_w1_str) {
579  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
580  }
581  if (new_w2_str != orig_w2_str) {
582  w->ReplaceBestChoice(overrides_word2[best_idx]);
583  }
584  if (tessedit_bigram_debug > 0) {
585  STRING choices_description;
586  int num_bigram_choices
587  = overrides_word1.size() * overrides_word2.size();
588  if (num_bigram_choices == 1) {
589  choices_description = "This was the unique bigram choice.";
590  } else {
591  if (tessedit_bigram_debug > 1) {
592  STRING bigrams_list;
593  const int kMaxChoicesToPrint = 20;
594  for (int i = 0; i < overrides_word1.size() &&
595  i < kMaxChoicesToPrint; i++) {
596  if (i > 0) { bigrams_list += ", "; }
597  WERD_CHOICE *p1 = overrides_word1[i];
598  WERD_CHOICE *p2 = overrides_word2[i];
599  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
600  }
601  choices_description = "There were many choices: {";
602  choices_description += bigrams_list;
603  choices_description += "}";
604  } else {
605  choices_description.add_str_int("There were ", num_bigram_choices);
606  choices_description += " compatible bigrams.";
607  }
608  }
609  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
610  orig_w1_str.string(), orig_w2_str.string(),
611  new_w1_str.string(), new_w2_str.string(),
612  choices_description.string());
613  }
614  }
615  }
616 }
617 
619  ETEXT_DESC* monitor,
620  const TBOX* target_word_box,
621  const char* word_config) {
622  PAGE_RES_IT page_res_it(page_res);
623  // ****************** Pass 5 *******************
624  // Gather statistics on rejects.
625  int word_index = 0;
626  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
628  WERD_RES* word = page_res_it.word();
629  word_index++;
630  if (monitor != nullptr) {
631  monitor->ocr_alive = TRUE;
632  monitor->progress = 95 + 5 * word_index / stats_.word_count;
633  }
634  if (word->rebuild_word == nullptr) {
635  // Word was not processed by tesseract.
636  page_res_it.forward();
637  continue;
638  }
639  check_debug_pt(word, 70);
640 
641  // changed by jetsoft
642  // specific to its needs to extract one word when need
643  if (target_word_box &&
645  *target_word_box, word_config, 4)) {
646  page_res_it.forward();
647  continue;
648  }
649  // end jetsoft
650 
651  page_res_it.rej_stat_word();
652  const int chars_in_word = word->reject_map.length();
653  const int rejects_in_word = word->reject_map.reject_count();
654 
655  const int blob_quality = word_blob_quality(word, page_res_it.row()->row);
656  stats_.doc_blob_quality += blob_quality;
657  const int outline_errs = word_outline_errs(word);
658  stats_.doc_outline_errs += outline_errs;
659  int16_t all_char_quality;
660  int16_t accepted_all_char_quality;
661  word_char_quality(word, page_res_it.row()->row,
662  &all_char_quality, &accepted_all_char_quality);
663  stats_.doc_char_quality += all_char_quality;
664  const uint8_t permuter_type = word->best_choice->permuter();
665  if ((permuter_type == SYSTEM_DAWG_PERM) ||
666  (permuter_type == FREQ_DAWG_PERM) ||
667  (permuter_type == USER_DAWG_PERM)) {
668  stats_.good_char_count += chars_in_word - rejects_in_word;
669  stats_.doc_good_char_quality += accepted_all_char_quality;
670  }
671  check_debug_pt(word, 80);
673  (blob_quality == 0) && (outline_errs >= chars_in_word))
675  check_debug_pt(word, 90);
676  page_res_it.forward();
677  }
678 
680  tprintf
681  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
682  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
683  page_res->char_count, page_res->rej_count,
684  page_res->rej_count / static_cast<float>(page_res->char_count),
685  stats_.doc_blob_quality,
686  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
687  stats_.doc_outline_errs,
688  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
689  stats_.doc_char_quality,
690  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
691  stats_.doc_good_char_quality,
692  (stats_.good_char_count > 0) ?
693  (stats_.doc_good_char_quality /
694  static_cast<float>(stats_.good_char_count)) : 0.0);
695  }
696  bool good_quality_doc =
697  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
698  quality_rej_pc) &&
699  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
700  quality_blob_pc) &&
701  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
703  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
705 
706  // ****************** Pass 6 *******************
707  // Do whole document or whole block rejection pass
708  if (!tessedit_test_adaption) {
710  quality_based_rejection(page_res_it, good_quality_doc);
711  }
712 }
713 
714 #endif // ndef DISABLED_LEGACY_ENGINE
715 
717  if (!wordrec_run_blamer) return;
718  PAGE_RES_IT page_res_it(page_res);
719  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
720  page_res_it.forward()) {
721  WERD_RES *word = page_res_it.word();
724  }
725  tprintf("Blame reasons:\n");
726  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
728  static_cast<IncorrectResultReason>(bl)),
729  page_res->blame_reasons[bl]);
730  }
731  if (page_res->misadaption_log.length() > 0) {
732  tprintf("Misadaption log:\n");
733  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
734  tprintf("%s\n", page_res->misadaption_log[i].string());
735  }
736  }
737 }
738 
739 // Sets script positions and detects smallcaps on all output words.
741  PAGE_RES_IT page_res_it(page_res);
742  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
743  page_res_it.forward()) {
744  WERD_RES* word = page_res_it.word();
745  if (word->word->flag(W_REP_CHAR)) {
746  page_res_it.forward();
747  continue;
748  }
749  const float x_height = page_res_it.block()->block->x_height();
750  float word_x_height = word->x_height;
751  if (word_x_height < word->best_choice->min_x_height() ||
752  word_x_height > word->best_choice->max_x_height()) {
753  word_x_height = (word->best_choice->min_x_height() +
754  word->best_choice->max_x_height()) / 2.0f;
755  }
756  // Test for small caps. Word capheight must be close to block xheight,
757  // and word must contain no lower case letters, and at least one upper case.
758  const double small_cap_xheight = x_height * kXHeightCapRatio;
759  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
760  if (word->uch_set->script_has_xheight() &&
761  small_cap_xheight - small_cap_delta <= word_x_height &&
762  word_x_height <= small_cap_xheight + small_cap_delta) {
763  // Scan for upper/lower.
764  int num_upper = 0;
765  int num_lower = 0;
766  for (int i = 0; i < word->best_choice->length(); ++i) {
767  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
768  ++num_upper;
769  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
770  ++num_lower;
771  }
772  if (num_upper > 0 && num_lower == 0)
773  word->small_caps = true;
774  }
775  word->SetScriptPositions();
776  }
777 }
778 
779 // Helper finds the gap between the index word and the next.
780 static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
781  int* next_left) {
782  *right = -INT32_MAX;
783  *next_left = INT32_MAX;
784  if (index < words.size()) {
785  *right = words[index]->word->bounding_box().right();
786  if (index + 1 < words.size())
787  *next_left = words[index + 1]->word->bounding_box().left();
788  }
789 }
790 
791 // Factored helper computes the rating, certainty, badness and validity of
792 // the permuter of the words in [first_index, end_index).
793 static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
794  int first_index, int end_index, float* rating,
795  float* certainty, bool* bad,
796  bool* valid_permuter) {
797  if (end_index <= first_index) {
798  *bad = true;
799  *valid_permuter = false;
800  }
801  for (int index = first_index; index < end_index && index < words.size();
802  ++index) {
803  WERD_CHOICE* choice = words[index]->best_choice;
804  if (choice == nullptr) {
805  *bad = true;
806  } else {
807  *rating += choice->rating();
808  *certainty = std::min(*certainty, choice->certainty());
809  if (!Dict::valid_word_permuter(choice->permuter(), false))
810  *valid_permuter = false;
811  }
812  }
813 }
814 
815 // Helper chooses the best combination of words, transferring good ones from
816 // new_words to best_words. To win, a new word must have (better rating and
817 // certainty) or (better permuter status and rating within rating ratio and
818 // certainty within certainty margin) than current best.
819 // All the new_words are consumed (moved to best_words or deleted.)
820 // The return value is the number of new_words used minus the number of
821 // best_words that remain in the output.
822 static int SelectBestWords(double rating_ratio,
823  double certainty_margin,
824  bool debug,
825  PointerVector<WERD_RES>* new_words,
826  PointerVector<WERD_RES>* best_words) {
827  // Process the smallest groups of words that have an overlapping word
828  // boundary at the end.
829  GenericVector<WERD_RES*> out_words;
830  // Index into each word vector (best, new).
831  int b = 0, n = 0;
832  int num_best = 0, num_new = 0;
833  while (b < best_words->size() || n < new_words->size()) {
834  // Start of the current run in each.
835  int start_b = b, start_n = n;
836  while (b < best_words->size() || n < new_words->size()) {
837  int b_right = -INT32_MAX;
838  int next_b_left = INT32_MAX;
839  WordGap(*best_words, b, &b_right, &next_b_left);
840  int n_right = -INT32_MAX;
841  int next_n_left = INT32_MAX;
842  WordGap(*new_words, n, &n_right, &next_n_left);
843  if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
844  // The word breaks overlap. [start_b,b] and [start_n, n] match.
845  break;
846  }
847  // Keep searching for the matching word break.
848  if ((b_right < n_right && b < best_words->size()) ||
849  n == new_words->size())
850  ++b;
851  else
852  ++n;
853  }
854  // Rating of the current run in each.
855  float b_rating = 0.0f, n_rating = 0.0f;
856  // Certainty of the current run in each.
857  float b_certainty = 0.0f, n_certainty = 0.0f;
858  // True if any word is missing its best choice.
859  bool b_bad = false, n_bad = false;
860  // True if all words have a valid permuter.
861  bool b_valid_permuter = true, n_valid_permuter = true;
862  const int end_b = b < best_words->size() ? b + 1 : b;
863  const int end_n = n < new_words->size() ? n + 1 : n;
864  EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
865  &b_bad, &b_valid_permuter);
866  EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
867  &n_bad, &n_valid_permuter);
868  bool new_better = false;
869  if (!n_bad && (b_bad || (n_certainty > b_certainty &&
870  n_rating < b_rating) ||
871  (!b_valid_permuter && n_valid_permuter &&
872  n_rating < b_rating * rating_ratio &&
873  n_certainty > b_certainty - certainty_margin))) {
874  // New is better.
875  for (int i = start_n; i < end_n; ++i) {
876  out_words.push_back((*new_words)[i]);
877  (*new_words)[i] = nullptr;
878  ++num_new;
879  }
880  new_better = true;
881  } else if (!b_bad) {
882  // Current best is better.
883  for (int i = start_b; i < end_b; ++i) {
884  out_words.push_back((*best_words)[i]);
885  (*best_words)[i] = nullptr;
886  ++num_best;
887  }
888  }
889  if (debug) {
890  tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
891  " valid dict: %d v %d\n",
892  end_n - start_n, new_better ? "better" : "worse",
893  end_b - start_b, n_rating, b_rating,
894  n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
895  }
896  // Move on to the next group.
897  b = end_b;
898  n = end_n;
899  }
900  // Transfer from out_words to best_words.
901  best_words->clear();
902  for (int i = 0; i < out_words.size(); ++i)
903  best_words->push_back(out_words[i]);
904  return num_new - num_best;
905 }
906 
907 // Helper to recognize the word using the given (language-specific) tesseract.
908 // Returns positive if this recognizer found more new best words than the
909 // number kept from best_words.
911  WordRecognizer recognizer, bool debug,
912  WERD_RES** in_word,
913  PointerVector<WERD_RES>* best_words) {
914  if (debug) {
915  tprintf("Trying word using lang %s, oem %d\n",
916  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
917  }
918  // Run the recognizer on the word.
919  PointerVector<WERD_RES> new_words;
920  (this->*recognizer)(word_data, in_word, &new_words);
921  if (new_words.empty()) {
922  // Transfer input word to new_words, as the classifier must have put
923  // the result back in the input.
924  new_words.push_back(*in_word);
925  *in_word = nullptr;
926  }
927  if (debug) {
928  for (int i = 0; i < new_words.size(); ++i)
929  new_words[i]->DebugTopChoice("Lang result");
930  }
931  // Initial version is a bit of a hack based on better certainty and rating
932  // or a dictionary vs non-dictionary word.
933  return SelectBestWords(classify_max_rating_ratio,
935  debug, &new_words, best_words);
936 }
937 
938 // Helper returns true if all the words are acceptable.
939 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
940  for (int w = 0; w < words.size(); ++w) {
941  if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
942  }
943  return true;
944 }
945 
946 // Moves good-looking "noise"/diacritics from the reject list to the main
947 // blob list on the current word. Returns true if anything was done, and
948 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
950  bool* make_next_word_fuzzy) {
951 #ifdef DISABLED_LEGACY_ENGINE
952  return false;
953 #else
954  *make_next_word_fuzzy = false;
955  WERD* real_word = pr_it->word()->word;
956  if (real_word->rej_cblob_list()->empty() ||
957  real_word->cblob_list()->empty() ||
958  real_word->rej_cblob_list()->length() > noise_maxperword)
959  return false;
960  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
961  // Get the noise outlines into a vector with matching bool map.
962  GenericVector<C_OUTLINE*> outlines;
963  real_word->GetNoiseOutlines(&outlines);
964  GenericVector<bool> word_wanted;
965  GenericVector<bool> overlapped_any_blob;
966  GenericVector<C_BLOB*> target_blobs;
967  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
968  &word_wanted, &overlapped_any_blob,
969  &target_blobs);
970  // Filter the outlines that overlapped any blob and put them into the word
971  // now. This simplifies the remaining task and also makes it more accurate
972  // as it has more completed blobs to work on.
973  GenericVector<bool> wanted;
974  GenericVector<C_BLOB*> wanted_blobs;
975  GenericVector<C_OUTLINE*> wanted_outlines;
976  int num_overlapped = 0;
977  int num_overlapped_used = 0;
978  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
979  if (overlapped_any_blob[i]) {
980  ++num_overlapped;
981  if (word_wanted[i]) ++num_overlapped_used;
982  wanted.push_back(word_wanted[i]);
983  wanted_blobs.push_back(target_blobs[i]);
984  wanted_outlines.push_back(outlines[i]);
985  outlines[i] = nullptr;
986  }
987  }
988  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
989  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
990  &target_blobs);
991  int non_overlapped = 0;
992  int non_overlapped_used = 0;
993  for (int i = 0; i < word_wanted.size(); ++i) {
994  if (word_wanted[i]) ++non_overlapped_used;
995  if (outlines[i] != nullptr) ++non_overlapped_used;
996  }
997  if (debug_noise_removal) {
998  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
999  num_overlapped_used, num_overlapped, non_overlapped_used,
1000  non_overlapped);
1001  real_word->bounding_box().print();
1002  }
1003  // Now we have decided which outlines we want, put them into the real_word.
1004  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
1005  make_next_word_fuzzy)) {
1006  pr_it->MakeCurrentWordFuzzy();
1007  }
1008  // TODO(rays) Parts of combos have a deep copy of the real word, and need
1009  // to have their noise outlines moved/assigned in the same way!!
1010  return num_overlapped_used != 0 || non_overlapped_used != 0;
1011 #endif // ndef DISABLED_LEGACY_ENGINE
1012 }
1013 
1014 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
1015 // Input: a set of noisy outlines that probably belong to the real_word.
1016 // Output: word_wanted indicates which outlines are to be assigned to a blob,
1017 // target_blobs indicates which to assign to, and overlapped_any_blob is
1018 // true for all outlines that overlapped a blob.
1020  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1021  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1022  GenericVector<bool>* overlapped_any_blob,
1023  GenericVector<C_BLOB*>* target_blobs) {
1024 #ifndef DISABLED_LEGACY_ENGINE
1025  GenericVector<bool> blob_wanted;
1026  word_wanted->init_to_size(outlines.size(), false);
1027  overlapped_any_blob->init_to_size(outlines.size(), false);
1028  target_blobs->init_to_size(outlines.size(), nullptr);
1029  // For each real blob, find the outlines that seriously overlap it.
1030  // A single blob could be several merged characters, so there can be quite
1031  // a few outlines overlapping, and the full engine needs to be used to chop
1032  // and join to get a sensible result.
1033  C_BLOB_IT blob_it(real_word->cblob_list());
1034  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1035  C_BLOB* blob = blob_it.data();
1036  const TBOX blob_box = blob->bounding_box();
1037  blob_wanted.init_to_size(outlines.size(), false);
1038  int num_blob_outlines = 0;
1039  for (int i = 0; i < outlines.size(); ++i) {
1040  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1041  !(*word_wanted)[i]) {
1042  blob_wanted[i] = true;
1043  (*overlapped_any_blob)[i] = true;
1044  ++num_blob_outlines;
1045  }
1046  }
1047  if (debug_noise_removal) {
1048  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1049  blob_box.print();
1050  }
1051  // If any outlines overlap the blob, and not too many, classify the blob
1052  // (using the full engine, languages and all), and choose the maximal
1053  // combination of outlines that doesn't hurt the end-result classification
1054  // by too much. Mark them as wanted.
1055  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1056  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1057  outlines, num_blob_outlines,
1058  &blob_wanted)) {
1059  for (int i = 0; i < blob_wanted.size(); ++i) {
1060  if (blob_wanted[i]) {
1061  // Claim the outline and record where it is going.
1062  (*word_wanted)[i] = true;
1063  (*target_blobs)[i] = blob;
1064  }
1065  }
1066  }
1067  }
1068  }
1069 #endif // ndef DISABLED_LEGACY_ENGINE
1070 }
1071 
1072 // Attempts to assign non-overlapping outlines to their nearest blobs or
1073 // make new blobs out of them.
1075  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1076  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1077  GenericVector<C_BLOB*>* target_blobs) {
1078 #ifndef DISABLED_LEGACY_ENGINE
1079  GenericVector<bool> blob_wanted;
1080  word_wanted->init_to_size(outlines.size(), false);
1081  target_blobs->init_to_size(outlines.size(), nullptr);
1082  // Check for outlines that need to be turned into stand-alone blobs.
1083  for (int i = 0; i < outlines.size(); ++i) {
1084  if (outlines[i] == nullptr) continue;
1085  // Get a set of adjacent outlines that don't overlap any existing blob.
1086  blob_wanted.init_to_size(outlines.size(), false);
1087  int num_blob_outlines = 0;
1088  TBOX total_ol_box(outlines[i]->bounding_box());
1089  while (i < outlines.size() && outlines[i] != nullptr) {
1090  blob_wanted[i] = true;
1091  total_ol_box += outlines[i]->bounding_box();
1092  ++i;
1093  ++num_blob_outlines;
1094  }
1095  // Find the insertion point.
1096  C_BLOB_IT blob_it(real_word->cblob_list());
1097  while (!blob_it.at_last() &&
1098  blob_it.data_relative(1)->bounding_box().left() <=
1099  total_ol_box.left()) {
1100  blob_it.forward();
1101  }
1102  // Choose which combination of them we actually want and where to put
1103  // them.
1104  if (debug_noise_removal)
1105  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1106  C_BLOB* left_blob = blob_it.data();
1107  TBOX left_box = left_blob->bounding_box();
1108  C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1109  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1110  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1111  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1112  outlines, num_blob_outlines,
1113  &blob_wanted)) {
1114  if (debug_noise_removal) tprintf("Added to left blob\n");
1115  for (int j = 0; j < blob_wanted.size(); ++j) {
1116  if (blob_wanted[j]) {
1117  (*word_wanted)[j] = true;
1118  (*target_blobs)[j] = left_blob;
1119  }
1120  }
1121  } else if (right_blob != nullptr &&
1122  (!left_box.x_overlap(total_ol_box) ||
1123  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1125  right_blob, outlines,
1126  num_blob_outlines, &blob_wanted)) {
1127  if (debug_noise_removal) tprintf("Added to right blob\n");
1128  for (int j = 0; j < blob_wanted.size(); ++j) {
1129  if (blob_wanted[j]) {
1130  (*word_wanted)[j] = true;
1131  (*target_blobs)[j] = right_blob;
1132  }
1133  }
1134  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1135  outlines, num_blob_outlines,
1136  &blob_wanted)) {
1137  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1138  for (int j = 0; j < blob_wanted.size(); ++j) {
1139  if (blob_wanted[j]) {
1140  (*word_wanted)[j] = true;
1141  (*target_blobs)[j] = nullptr;
1142  }
1143  }
1144  }
1145  }
1146 #endif // ndef DISABLED_LEGACY_ENGINE
1147 }
1148 
1149 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1150 // chooses the optimal set (approximately) and returns true if any outlines
1151 // are desired, in which case ok_outlines indicates which ones.
1153  int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
1154  const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
1155  GenericVector<bool>* ok_outlines) {
1156 #ifndef DISABLED_LEGACY_ENGINE
1157  STRING best_str;
1158  float target_cert = certainty_threshold;
1159  if (blob != nullptr) {
1160  float target_c2;
1161  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1162  if (debug_noise_removal) {
1163  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1164  target_cert, target_c2);
1165  blob->bounding_box().print();
1166  }
1167  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1168  }
1169  GenericVector<bool> test_outlines = *ok_outlines;
1170  // Start with all the outlines in.
1171  STRING all_str;
1172  GenericVector<bool> best_outlines = *ok_outlines;
1173  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1174  pr_it, blob, &all_str);
1175  if (debug_noise_removal) {
1176  TBOX ol_box;
1177  for (int i = 0; i < test_outlines.size(); ++i) {
1178  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1179  }
1180  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1181  all_str.string(), best_cert, best_cert - target_cert);
1182  ol_box.print();
1183  }
1184  // Iteratively zero out the bit that improves the certainty the most, until
1185  // we get past the threshold, have zero bits, or fail to improve.
1186  int best_index = 0; // To zero out.
1187  while (num_outlines > 1 && best_index >= 0 &&
1188  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1189  // Find the best bit to zero out.
1190  best_index = -1;
1191  for (int i = 0; i < outlines.size(); ++i) {
1192  if (test_outlines[i]) {
1193  test_outlines[i] = false;
1194  STRING str;
1195  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1196  pr_it, blob, &str);
1197  if (debug_noise_removal) {
1198  TBOX ol_box;
1199  for (int j = 0; j < outlines.size(); ++j) {
1200  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1201  tprintf("%d", test_outlines[j]);
1202  }
1203  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1204  cert, cert - target_cert);
1205  ol_box.print();
1206  }
1207  if (cert > best_cert) {
1208  best_cert = cert;
1209  best_index = i;
1210  best_outlines = test_outlines;
1211  }
1212  test_outlines[i] = true;
1213  }
1214  }
1215  if (best_index >= 0) {
1216  test_outlines[best_index] = false;
1217  --num_outlines;
1218  }
1219  }
1220  if (best_cert >= target_cert) {
1221  // Save the best combination.
1222  *ok_outlines = best_outlines;
1223  if (debug_noise_removal) {
1224  tprintf("%s noise combination ", blob ? "Adding" : "New");
1225  for (int i = 0; i < best_outlines.size(); ++i) {
1226  tprintf("%d", best_outlines[i]);
1227  }
1228  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1229  target_cert);
1230  }
1231  return true;
1232  }
1233 #endif // ndef DISABLED_LEGACY_ENGINE
1234  return false;
1235 }
1236 
1237 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1238 // the inclusion of the outlines, and returns the certainty of the raw choice.
1240  const GenericVector<bool>& ok_outlines,
1241  const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
1242  C_BLOB* blob, STRING* best_str) {
1243 #ifndef DISABLED_LEGACY_ENGINE
1244  C_OUTLINE_IT ol_it;
1245  C_OUTLINE* first_to_keep = nullptr;
1246  C_BLOB* local_blob = nullptr;
1247  if (blob != nullptr) {
1248  // Add the required outlines to the blob.
1249  ol_it.set_to_list(blob->out_list());
1250  first_to_keep = ol_it.data();
1251  }
1252  for (int i = 0; i < ok_outlines.size(); ++i) {
1253  if (ok_outlines[i]) {
1254  // This outline is to be added.
1255  if (blob == nullptr) {
1256  local_blob = new C_BLOB(outlines[i]);
1257  blob = local_blob;
1258  ol_it.set_to_list(blob->out_list());
1259  } else {
1260  ol_it.add_before_stay_put(outlines[i]);
1261  }
1262  }
1263  }
1264  float c2;
1265  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1266  ol_it.move_to_first();
1267  if (first_to_keep == nullptr) {
1268  // We created blob. Empty its outlines and delete it.
1269  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1270  delete local_blob;
1271  cert = -c2;
1272  } else {
1273  // Remove the outlines that we put in.
1274  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1275  ol_it.extract();
1276  }
1277  }
1278  return cert;
1279 #else
1280  return 0.1;
1281 #endif // ndef DISABLED_LEGACY_ENGINE
1282 }
1283 
1284 // Classifies the given blob (part of word_data->word->word) as an individual
1285 // word, using languages, chopper etc, returning only the certainty of the
1286 // best raw choice, and undoing all the work done to fake out the word.
1288  C_BLOB* blob, STRING* best_str, float* c2) {
1289 #ifndef DISABLED_LEGACY_ENGINE
1290  WERD* real_word = pr_it->word()->word;
1291  WERD* word = real_word->ConstructFromSingleBlob(
1292  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1293  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1294  // Get a new iterator that points to the new word.
1295  PAGE_RES_IT it(pr_it->page_res);
1296  while (it.word() != word_res && it.word() != nullptr) it.forward();
1297  ASSERT_HOST(it.word() == word_res);
1298  WordData wd(it);
1299  // Force full initialization.
1300  SetupWordPassN(1, &wd);
1301  classify_word_and_language(pass_n, &it, &wd);
1302  if (debug_noise_removal) {
1303  if (wd.word->raw_choice != NULL) {
1304  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1305  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1306  wd.word->raw_choice->max_x_height());
1307  } else {
1308  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1309  wd.row->x_height());
1310  }
1311  }
1312  float cert = 0.0f;
1313  if (wd.word->raw_choice != NULL) { // This probably shouldn't happen, but...
1314  cert = wd.word->raw_choice->certainty();
1315  float rat = wd.word->raw_choice->rating();
1316  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1317  *best_str = wd.word->raw_choice->unichar_string();
1318  } else {
1319  *c2 = 0.0f;
1320  *best_str = "";
1321  }
1322  it.DeleteCurrentWord();
1323  pr_it->ResetWordIterator();
1324  return cert;
1325 #else
1326  return 0.1;
1327 #endif // ndef DISABLED_LEGACY_ENGINE
1328 }
1329 
1330 // Generic function for classifying a word. Can be used either for pass1 or
1331 // pass2 according to the function passed to recognizer.
1332 // word_data holds the word to be recognized, and its block and row, and
1333 // pr_it points to the word as well, in case we are running LSTM and it wants
1334 // to output multiple words.
1335 // Recognizes in the current language, and if successful that is all.
1336 // If recognition was not successful, tries all available languages until
1337 // it gets a successful result or runs out of languages. Keeps the best result.
1339  WordData* word_data) {
1340 #ifdef DISABLED_LEGACY_ENGINE
1342 #else
1343  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1345 #endif // def DISABLED_LEGACY_ENGINE
1346 
1347  // Best result so far.
1348  PointerVector<WERD_RES> best_words;
1349  // Points to the best result. May be word or in lang_words.
1350  const WERD_RES* word = word_data->word;
1351  clock_t start_t = clock();
1352  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1353  if (debug) {
1354  tprintf("%s word with lang %s at:",
1355  word->done ? "Already done" : "Processing",
1356  most_recently_used_->lang.string());
1357  word->word->bounding_box().print();
1358  }
1359  if (word->done) {
1360  // If done on pass1, leave it as-is.
1361  if (!word->tess_failed)
1362  most_recently_used_ = word->tesseract;
1363  return;
1364  }
1365  int sub = sub_langs_.size();
1366  if (most_recently_used_ != this) {
1367  // Get the index of the most_recently_used_.
1368  for (sub = 0; sub < sub_langs_.size() &&
1369  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1370  }
1371  most_recently_used_->RetryWithLanguage(
1372  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1373  Tesseract* best_lang_tess = most_recently_used_;
1374  if (!WordsAcceptable(best_words)) {
1375  // Try all the other languages to see if they are any better.
1376  if (most_recently_used_ != this &&
1377  this->RetryWithLanguage(*word_data, recognizer, debug,
1378  &word_data->lang_words[sub_langs_.size()],
1379  &best_words) > 0) {
1380  best_lang_tess = this;
1381  }
1382  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1383  ++i) {
1384  if (most_recently_used_ != sub_langs_[i] &&
1385  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1386  &word_data->lang_words[i],
1387  &best_words) > 0) {
1388  best_lang_tess = sub_langs_[i];
1389  }
1390  }
1391  }
1392  most_recently_used_ = best_lang_tess;
1393  if (!best_words.empty()) {
1394  if (best_words.size() == 1 && !best_words[0]->combination) {
1395  // Move the best single result to the main word.
1396  word_data->word->ConsumeWordResults(best_words[0]);
1397  } else {
1398  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1399  word_data->word = best_words.back();
1400  pr_it->ReplaceCurrentWord(&best_words);
1401  }
1402  ASSERT_HOST(word_data->word->box_word != nullptr);
1403  } else {
1404  tprintf("no best words!!\n");
1405  }
1406  clock_t ocr_t = clock();
1407  if (tessedit_timing_debug) {
1408  tprintf("%s (ocr took %.2f sec)\n",
1409  word->best_choice->unichar_string().string(),
1410  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1411  }
1412 }
1413 
1421  WERD_RES** in_word,
1422  PointerVector<WERD_RES>* out_words) {
1423  ROW* row = word_data.row;
1424  BLOCK* block = word_data.block;
1425  prev_word_best_choice_ = word_data.prev_word != nullptr
1426  ? word_data.prev_word->word->best_choice : nullptr;
1427 #ifndef ANDROID_BUILD
1428 #ifdef DISABLED_LEGACY_ENGINE
1430 #else
1433 #endif // def DISABLED_LEGACY_ENGINE
1434  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1435  LSTMRecognizeWord(*block, row, *in_word, out_words);
1436  if (!out_words->empty())
1437  return; // Successful lstm recognition.
1438  }
1440  // No fallback allowed, so use a fake.
1441  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1442  return;
1443  }
1444 
1445  #ifndef DISABLED_LEGACY_ENGINE
1446  // Fall back to tesseract for failed words or odd words.
1447  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1448  OEM_TESSERACT_ONLY, nullptr,
1451  poly_allow_detailed_fx, row, block);
1452 #endif // ndef DISABLED_LEGACY_ENGINE
1453  }
1454 #endif // ndef ANDROID_BUILD
1455 
1456 #ifndef DISABLED_LEGACY_ENGINE
1457  WERD_RES* word = *in_word;
1458  match_word_pass_n(1, word, row, block);
1459  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1460  word->tess_would_adapt = AdaptableWord(word);
1461  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1462 
1463  if (adapt_ok) {
1464  // Send word to adaptive classifier for training.
1465  word->BestChoiceToCorrectText();
1466  LearnWord(nullptr, word);
1467  // Mark misadaptions if running blamer.
1468  if (word->blamer_bundle != nullptr) {
1471  }
1472  }
1473 
1474  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1476  }
1477 #endif // ndef DISABLED_LEGACY_ENGINE
1478 }
1479 
1480 // Helper to report the result of the xheight fix.
1481 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
1482  WERD_RES* word, WERD_RES* new_word) {
1483  tprintf("New XHT Match:%s = %s ",
1484  word->best_choice->unichar_string().string(),
1485  word->best_choice->debug_string().string());
1486  word->reject_map.print(debug_fp);
1487  tprintf(" -> %s = %s ",
1488  new_word->best_choice->unichar_string().string(),
1489  new_word->best_choice->debug_string().string());
1490  new_word->reject_map.print(debug_fp);
1491  tprintf(" %s->%s %s %s\n",
1492  word->guessed_x_ht ? "GUESS" : "CERT",
1493  new_word->guessed_x_ht ? "GUESS" : "CERT",
1494  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1495  accept_new_word ? "ACCEPTED" : "");
1496 }
1497 
1498 #ifndef DISABLED_LEGACY_ENGINE
1499 
1500 // Run the x-height fix-up, based on min/max top/bottom information in
1501 // unicharset.
1502 // Returns true if the word was changed.
1503 // See the comment in fixxht.cpp for a description of the overall process.
1504 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
1505  int original_misfits = CountMisfitTops(word);
1506  if (original_misfits == 0)
1507  return false;
1508  float baseline_shift = 0.0f;
1509  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1510  if (baseline_shift != 0.0f) {
1511  // Try the shift on its own first.
1512  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1513  word, block, row))
1514  return false;
1515  original_misfits = CountMisfitTops(word);
1516  if (original_misfits > 0) {
1517  float new_baseline_shift;
1518  // Now recompute the new x_height.
1519  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1520  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1521  // No test of return value here, as we are definitely making a change
1522  // to the word by shifting the baseline.
1523  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1524  word, block, row);
1525  }
1526  }
1527  return true;
1528  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1529  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1530  word, block, row);
1531  } else {
1532  return false;
1533  }
1534 }
1535 
1536 // Runs recognition with the test baseline shift and x-height and returns true
1537 // if there was an improvement in recognition result.
1538 bool Tesseract::TestNewNormalization(int original_misfits,
1539  float baseline_shift, float new_x_ht,
1540  WERD_RES *word, BLOCK* block, ROW *row) {
1541  bool accept_new_x_ht = false;
1542  WERD_RES new_x_ht_word(word->word);
1543  if (word->blamer_bundle != nullptr) {
1544  new_x_ht_word.blamer_bundle = new BlamerBundle();
1545  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1546  }
1547  new_x_ht_word.x_height = new_x_ht;
1548  new_x_ht_word.baseline_shift = baseline_shift;
1549  new_x_ht_word.caps_height = 0.0;
1550  new_x_ht_word.SetupForRecognition(
1551  unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1553  poly_allow_detailed_fx, row, block);
1554  match_word_pass_n(2, &new_x_ht_word, row, block);
1555  if (!new_x_ht_word.tess_failed) {
1556  int new_misfits = CountMisfitTops(&new_x_ht_word);
1557  if (debug_x_ht_level >= 1) {
1558  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1559  original_misfits, word->x_height,
1560  new_misfits, new_x_ht);
1561  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1562  word->best_choice->rating(), word->best_choice->certainty(),
1563  new_x_ht_word.best_choice->rating(),
1564  new_x_ht_word.best_choice->certainty());
1565  }
1566  // The misfits must improve and either the rating or certainty.
1567  accept_new_x_ht = new_misfits < original_misfits &&
1568  (new_x_ht_word.best_choice->certainty() >
1569  word->best_choice->certainty() ||
1570  new_x_ht_word.best_choice->rating() <
1571  word->best_choice->rating());
1572  if (debug_x_ht_level >= 1) {
1573  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1574  }
1575  }
1576  if (accept_new_x_ht) {
1577  word->ConsumeWordResults(&new_x_ht_word);
1578  return true;
1579  }
1580  return false;
1581 }
1582 
1583 #endif // ndef DISABLED_LEGACY_ENGINE
1584 
1592  WERD_RES** in_word,
1593  PointerVector<WERD_RES>* out_words) {
1594  // Return if we do not want to run Tesseract.
1596  return;
1597  }
1598 #ifndef DISABLED_LEGACY_ENGINE
1599  ROW* row = word_data.row;
1600  BLOCK* block = word_data.block;
1601  WERD_RES* word = *in_word;
1602  prev_word_best_choice_ = word_data.prev_word != nullptr
1603  ? word_data.prev_word->word->best_choice : nullptr;
1604 
1606  check_debug_pt(word, 30);
1607  if (!word->done) {
1608  word->caps_height = 0.0;
1609  if (word->x_height == 0.0f)
1610  word->x_height = row->x_height();
1611  match_word_pass_n(2, word, row, block);
1612  check_debug_pt(word, 40);
1613  }
1614 
1615  SubAndSuperscriptFix(word);
1616 
1617  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1619  block->classify_rotation().y() == 0.0f) {
1620  // Use the tops and bottoms since they are available.
1621  TrainedXheightFix(word, block, row);
1622  }
1623 
1625  }
1626 #ifndef GRAPHICS_DISABLED
1628  if (fx_win == nullptr)
1629  create_fx_win();
1630  clear_fx_win();
1631  word->rebuild_word->plot(fx_win);
1632  TBOX wbox = word->rebuild_word->bounding_box();
1633  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1634  wbox.right(), wbox.bottom());
1636  }
1637 #endif
1639  check_debug_pt(word, 50);
1640 #endif // ndef DISABLED_LEGACY_ENGINE
1641 }
1642 
1643 #ifndef DISABLED_LEGACY_ENGINE
1644 
1650  ROW *row, BLOCK* block) {
1651  if (word->tess_failed) return;
1652  tess_segment_pass_n(pass_n, word);
1653 
1654  if (!word->tess_failed) {
1655  if (!word->word->flag (W_REP_CHAR)) {
1656  word->fix_quotes();
1658  word->fix_hyphens();
1659  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1660  if (word->best_choice->length() != word->box_word->length()) {
1661  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1662  " #Blobs=%d\n",
1663  word->best_choice->debug_string().string(),
1664  word->best_choice->length(),
1665  word->box_word->length());
1666 
1667  }
1668  word->tess_accepted = tess_acceptable_word(word);
1669 
1670  // Also sets word->done flag
1671  make_reject_map(word, row, pass_n);
1672  }
1673  }
1674  set_word_fonts(word);
1675 
1676  ASSERT_HOST(word->raw_choice != nullptr);
1677 }
1678 #endif // ndef DISABLED_LEGACY_ENGINE
1679 
1680 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1681 // the given char_id, or nullptr if none can be found.
1682 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1683  WERD_RES* word_res) {
1684  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1685  BLOB_CHOICE* best_choice = nullptr;
1686  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1687  BLOB_CHOICE* choice = FindMatchingChoice(char_id,
1688  word_res->GetBlobChoices(i));
1689  if (choice != nullptr) {
1690  if (best_choice == nullptr || choice->rating() < best_choice->rating())
1691  best_choice = choice;
1692  }
1693  }
1694  return best_choice;
1695 }
1696 
1697 // Helper to insert blob_choice in each location in the leader word if there is
1698 // no matching BLOB_CHOICE there already, and correct any incorrect results
1699 // in the best_choice.
1700 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1701  WERD_RES* word_res) {
1702  WERD_CHOICE* word = word_res->best_choice;
1703  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1704  BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1705  word_res->GetBlobChoices(i));
1706  if (choice == nullptr) {
1707  BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1708  choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1709  }
1710  }
1711  // Correct any incorrect results in word.
1712  for (int i = 0; i < word->length(); ++i) {
1713  if (word->unichar_id(i) != blob_choice->unichar_id())
1714  word->set_unichar_id(blob_choice->unichar_id(), i);
1715  }
1716 }
1717 
1726  WERD_RES *word_res = page_res_it->word();
1727  const WERD_CHOICE &word = *(word_res->best_choice);
1728 
1729  // Find the frequency of each unique character in the word.
1730  SortHelper<UNICHAR_ID> rep_ch(word.length());
1731  for (int i = 0; i < word.length(); ++i) {
1732  rep_ch.Add(word.unichar_id(i), 1);
1733  }
1734 
1735  // Find the most frequent result.
1736  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1737  int max_count = rep_ch.MaxCount(&maxch_id);
1738  // Find the best exemplar of a classifier result for maxch_id.
1739  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1740  if (best_choice == nullptr) {
1741  tprintf("Failed to find a choice for %s, occurring %d times\n",
1742  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1743  return;
1744  }
1745  word_res->done = TRUE;
1746 
1747  // Measure the mean space.
1748  int gap_count = 0;
1749  WERD* werd = word_res->word;
1750  C_BLOB_IT blob_it(werd->cblob_list());
1751  C_BLOB* prev_blob = blob_it.data();
1752  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1753  C_BLOB* blob = blob_it.data();
1754  int gap = blob->bounding_box().left();
1755  gap -= prev_blob->bounding_box().right();
1756  ++gap_count;
1757  prev_blob = blob;
1758  }
1759  // Just correct existing classification.
1760  CorrectRepcharChoices(best_choice, word_res);
1761  word_res->reject_map.initialise(word.length());
1762 }
1763 
1765  const UNICHARSET& char_set, const char *s, const char *lengths) {
1766  int i = 0;
1767  int offset = 0;
1768  int leading_punct_count;
1769  int upper_count = 0;
1770  int hyphen_pos = -1;
1772 
1773  if (strlen (lengths) > 20)
1774  return word_type;
1775 
1776  /* Single Leading punctuation char*/
1777 
1778  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1779  offset += lengths[i++];
1780  leading_punct_count = i;
1781 
1782  /* Initial cap */
1783  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1784  offset += lengths[i++];
1785  upper_count++;
1786  }
1787  if (upper_count > 1) {
1788  word_type = AC_UPPER_CASE;
1789  } else {
1790  /* Lower case word, possibly with an initial cap */
1791  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1792  offset += lengths[i++];
1793  }
1794  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1795  goto not_a_word;
1796  /*
1797  Allow a single hyphen in a lower case word
1798  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1799  */
1800  if (lengths[i] == 1 && s[offset] == '-') {
1801  hyphen_pos = i;
1802  offset += lengths[i++];
1803  if (s[offset] != '\0') {
1804  while ((s[offset] != '\0') &&
1805  char_set.get_islower(s + offset, lengths[i])) {
1806  offset += lengths[i++];
1807  }
1808  if (i < hyphen_pos + 3)
1809  goto not_a_word;
1810  }
1811  } else {
1812  /* Allow "'s" in NON hyphenated lower case words */
1813  if (lengths[i] == 1 && (s[offset] == '\'') &&
1814  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1815  offset += lengths[i++];
1816  offset += lengths[i++];
1817  }
1818  }
1819  if (upper_count > 0)
1820  word_type = AC_INITIAL_CAP;
1821  else
1822  word_type = AC_LOWER_CASE;
1823  }
1824 
1825  /* Up to two different, constrained trailing punctuation chars */
1826  if (lengths[i] == 1 && s[offset] != '\0' &&
1827  STRING(chs_trailing_punct1).contains(s[offset]))
1828  offset += lengths[i++];
1829  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1830  s[offset - lengths[i - 1]] != s[offset] &&
1831  STRING(chs_trailing_punct2).contains (s[offset]))
1832  offset += lengths[i++];
1833 
1834  if (s[offset] != '\0')
1835  word_type = AC_UNACCEPTABLE;
1836 
1837  not_a_word:
1838 
1839  if (word_type == AC_UNACCEPTABLE) {
1840  /* Look for abbreviation string */
1841  i = 0;
1842  offset = 0;
1843  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1844  word_type = AC_UC_ABBREV;
1845  while (s[offset] != '\0' &&
1846  char_set.get_isupper(s + offset, lengths[i]) &&
1847  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1848  offset += lengths[i++];
1849  offset += lengths[i++];
1850  }
1851  }
1852  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1853  word_type = AC_LC_ABBREV;
1854  while (s[offset] != '\0' &&
1855  char_set.get_islower(s + offset, lengths[i]) &&
1856  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1857  offset += lengths[i++];
1858  offset += lengths[i++];
1859  }
1860  }
1861  if (s[offset] != '\0')
1862  word_type = AC_UNACCEPTABLE;
1863  }
1864 
1865  return word_type;
1866 }
1867 
1868 bool Tesseract::check_debug_pt(WERD_RES* word, int location) {
1869  bool show_map_detail = false;
1870  int16_t i;
1871 
1872  if (!test_pt)
1873  return false;
1874 
1875  tessedit_rejection_debug.set_value (FALSE);
1876  debug_x_ht_level.set_value(0);
1877 
1878  if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
1879  if (location < 0)
1880  return true; // For breakpoint use
1881  tessedit_rejection_debug.set_value(TRUE);
1882  debug_x_ht_level.set_value(2);
1883  tprintf ("\n\nTESTWD::");
1884  switch (location) {
1885  case 0:
1886  tprintf ("classify_word_pass1 start\n");
1887  word->word->print();
1888  break;
1889  case 10:
1890  tprintf ("make_reject_map: initial map");
1891  break;
1892  case 20:
1893  tprintf ("make_reject_map: after NN");
1894  break;
1895  case 30:
1896  tprintf ("classify_word_pass2 - START");
1897  break;
1898  case 40:
1899  tprintf ("classify_word_pass2 - Pre Xht");
1900  break;
1901  case 50:
1902  tprintf ("classify_word_pass2 - END");
1903  show_map_detail = true;
1904  break;
1905  case 60:
1906  tprintf ("fixspace");
1907  break;
1908  case 70:
1909  tprintf ("MM pass START");
1910  break;
1911  case 80:
1912  tprintf ("MM pass END");
1913  break;
1914  case 90:
1915  tprintf ("After Poor quality rejection");
1916  break;
1917  case 100:
1918  tprintf ("unrej_good_quality_words - START");
1919  break;
1920  case 110:
1921  tprintf ("unrej_good_quality_words - END");
1922  break;
1923  case 120:
1924  tprintf ("Write results pass");
1925  show_map_detail = true;
1926  break;
1927  }
1928  if (word->best_choice != nullptr) {
1929  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1930  word->reject_map.print(debug_fp);
1931  tprintf("\n");
1932  if (show_map_detail) {
1933  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1934  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1935  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1936  word->reject_map[i].full_print(debug_fp);
1937  }
1938  }
1939  } else {
1940  tprintf("null best choice\n");
1941  }
1942  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1943  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1944  return true;
1945  } else {
1946  return false;
1947  }
1948 }
1949 
1955 static void find_modal_font( // good chars in word
1956  STATS* fonts, // font stats
1957  int16_t* font_out, // output font
1958  int8_t* font_count // output count
1959 ) {
1960  int16_t font; //font index
1961  int32_t count; //pile count
1962 
1963  if (fonts->get_total () > 0) {
1964  font = (int16_t) fonts->mode ();
1965  *font_out = font;
1966  count = fonts->pile_count (font);
1967  *font_count = count < INT8_MAX ? count : INT8_MAX;
1968  fonts->add (font, -*font_count);
1969  }
1970  else {
1971  *font_out = -1;
1972  *font_count = 0;
1973  }
1974 }
1975 
1982  // Don't try to set the word fonts for an lstm word, as the configs
1983  // will be meaningless.
1984  if (word->chopped_word == nullptr) return;
1985  ASSERT_HOST(word->best_choice != nullptr);
1986 
1987 #ifndef DISABLED_LEGACY_ENGINE
1988  const int fontinfo_size = get_fontinfo_table().size();
1989  if (fontinfo_size == 0) return;
1990  GenericVector<int> font_total_score;
1991  font_total_score.init_to_size(fontinfo_size, 0);
1992 
1993  word->italic = 0;
1994  word->bold = 0;
1995  // Compute the font scores for the word
1996  if (tessedit_debug_fonts) {
1997  tprintf("Examining fonts in %s\n",
1998  word->best_choice->debug_string().string());
1999  }
2000  for (int b = 0; b < word->best_choice->length(); ++b) {
2001  const BLOB_CHOICE* choice = word->GetBlobChoice(b);
2002  if (choice == nullptr) continue;
2003  const GenericVector<ScoredFont>& fonts = choice->fonts();
2004  for (int f = 0; f < fonts.size(); ++f) {
2005  const int fontinfo_id = fonts[f].fontinfo_id;
2006  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
2007  font_total_score[fontinfo_id] += fonts[f].score;
2008  }
2009  }
2010  }
2011  // Find the top and 2nd choice for the word.
2012  int score1 = 0, score2 = 0;
2013  int16_t font_id1 = -1, font_id2 = -1;
2014  for (int f = 0; f < fontinfo_size; ++f) {
2015  if (tessedit_debug_fonts && font_total_score[f] > 0) {
2016  tprintf("Font %s, total score = %d\n",
2017  fontinfo_table_.get(f).name, font_total_score[f]);
2018  }
2019  if (font_total_score[f] > score1) {
2020  score2 = score1;
2021  font_id2 = font_id1;
2022  score1 = font_total_score[f];
2023  font_id1 = f;
2024  } else if (font_total_score[f] > score2) {
2025  score2 = font_total_score[f];
2026  font_id2 = f;
2027  }
2028  }
2029  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2030  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2031  // Each score has a limit of UINT16_MAX, so divide by that to get the number
2032  // of "votes" for that font, ie number of perfect scores.
2033  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2034  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2035  if (score1 > 0) {
2036  const FontInfo fi = fontinfo_table_.get(font_id1);
2037  if (tessedit_debug_fonts) {
2038  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2039  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2040  fi.name, word->fontinfo_id_count,
2041  fontinfo_table_.get(font_id2).name,
2042  word->fontinfo_id2_count);
2043  } else {
2044  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2045  fi.name, word->fontinfo_id_count);
2046  }
2047  }
2048  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
2049  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
2050  }
2051 #endif // ndef DISABLED_LEGACY_ENGINE
2052 }
2053 
2054 
2061  PAGE_RES_IT page_res_it(page_res);
2062  WERD_RES *word; // current word
2063  STATS doc_fonts(0, font_table_size_); // font counters
2064 
2065  // Gather font id statistics.
2066  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2067  page_res_it.forward()) {
2068  word = page_res_it.word();
2069  if (word->fontinfo != nullptr) {
2070  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2071  }
2072  if (word->fontinfo2 != nullptr) {
2073  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2074  }
2075  }
2076  int16_t doc_font; // modal font
2077  int8_t doc_font_count; // modal font
2078  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2079  if (doc_font_count == 0)
2080  return;
2081  // Get the modal font pointer.
2082  const FontInfo* modal_font = nullptr;
2083  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2084  page_res_it.forward()) {
2085  word = page_res_it.word();
2086  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2087  modal_font = word->fontinfo;
2088  break;
2089  }
2090  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2091  modal_font = word->fontinfo2;
2092  break;
2093  }
2094  }
2095  ASSERT_HOST(modal_font != nullptr);
2096 
2097  // Assign modal font to weak words.
2098  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2099  page_res_it.forward()) {
2100  word = page_res_it.word();
2101  const int length = word->best_choice->length();
2102 
2103  const int count = word->fontinfo_id_count;
2104  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2105  word->fontinfo = modal_font;
2106  // Counts only get 1 as it came from the doc.
2107  word->fontinfo_id_count = 1;
2108  word->italic = modal_font->is_italic() ? 1 : -1;
2109  word->bold = modal_font->is_bold() ? 1 : -1;
2110  }
2111  }
2112 }
2113 
2114 // If a word has multiple alternates check if the best choice is in the
2115 // dictionary. If not, replace it with an alternate that exists in the
2116 // dictionary.
2118  PAGE_RES_IT word_it(page_res);
2119  for (WERD_RES* word = word_it.word(); word != nullptr;
2120  word = word_it.forward()) {
2121  if (word->best_choices.singleton())
2122  continue; // There are no alternates.
2123 
2124  const WERD_CHOICE* best = word->best_choice;
2125  if (word->tesseract->getDict().valid_word(*best) != 0)
2126  continue; // The best choice is in the dictionary.
2127 
2128  WERD_CHOICE_IT choice_it(&word->best_choices);
2129  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2130  choice_it.forward()) {
2131  WERD_CHOICE* alternate = choice_it.data();
2132  if (word->tesseract->getDict().valid_word(*alternate)) {
2133  // The alternate choice is in the dictionary.
2134  if (tessedit_bigram_debug) {
2135  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2136  best->unichar_string().string(),
2137  alternate->unichar_string().string());
2138  }
2139  // Replace the 'best' choice with a better choice.
2140  word->ReplaceBestChoice(alternate);
2141  break;
2142  }
2143  }
2144  }
2145 }
2146 
2147 } // namespace tesseract
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:159
const UNICHARSET & GetUnicharset() const
BLOCK_RES * block() const
Definition: pageres.h:757
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1322
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1287
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:771
double classify_max_certainty_margin
Definition: classify.h:445
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2060
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
bool tess_failed
Definition: pageres.h:288
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
int32_t pile_count(int32_t value) const
Definition: statistc.h:78
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
bool guessed_x_ht
Definition: pageres.h:308
int size() const
Definition: genericvector.h:71
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1591
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1074
#define TRUE
Definition: capi.h:51
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:171
int32_t rej_count
Definition: pageres.h:80
GenericVector< STRING > misadaption_log
Definition: pageres.h:92
ROW_RES * row() const
Definition: pageres.h:754
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
Dict & getDict() override
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1239
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
FILE * debug_fp
Definition: tessvars.cpp:24
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:222
void print() const
Definition: rect.h:278
void Add(T value, int count)
Definition: sorthelper.h:65
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
int8_t italic
Definition: pageres.h:301
REJMAP reject_map
Definition: pageres.h:287
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:132
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:760
const char * string() const
Definition: strngs.cpp:196
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:136
int count(LIST var_list)
Definition: oldlist.cpp:98
void clear_fx_win()
Definition: drawfx.cpp:72
void full_print(FILE *fp)
Definition: rejctmap.cpp:335
TBOX bounding_box() const
Definition: werd.cpp:159
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:765
uint8_t permuter() const
Definition: ratngs.h:346
bool classify_bln_numeric_mode
Definition: classify.h:541
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:529
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1420
void fix_hyphens()
Definition: pageres.cpp:1053
bool x_overlap(const TBOX &box) const
Definition: rect.h:401
TBOX bounding_box() const
Definition: blobs.cpp:871
Definition: rect.h:34
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:910
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:823
void create_fx_win()
Definition: drawfx.cpp:59
int32_t mode() const
Definition: statistc.cpp:114
bool tessedit_enable_bigram_correction
const FontInfo * fontinfo
Definition: pageres.h:304
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1981
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:787
bool script_has_xheight() const
Definition: unicharset.h:898
Definition: statistc.h:33
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:199
bool is_bold() const
Definition: fontinfo.h:112
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:127
bool SubAndSuperscriptFix(WERD_RES *word_res)
float baseline_shift
Definition: pageres.h:313
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:414
static void Update()
Definition: scrollview.cpp:711
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:579
T & back() const
Pix * BestPix() const
void SetScriptPositions()
Definition: pageres.cpp:864
int8_t fontinfo_id2_count
Definition: pageres.h:307
bool small_caps
Definition: pageres.h:299
PointerVector< WERD_RES > lang_words
void ResetWordIterator()
Definition: pageres.cpp:1533
bool IsAmbiguous()
Definition: pageres.cpp:458
float max_x_height() const
Definition: ratngs.h:339
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1649
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61
WERD_RES * restart_page()
Definition: pageres.h:698
void print()
Definition: werd.cpp:265
BLOCK * block
Definition: pageres.h:117
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:547
bool tess_would_adapt
Definition: pageres.h:297
int16_t left() const
Definition: rect.h:72
void rej_stat_word()
Definition: pageres.cpp:1674
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
a.b.c.
Definition: control.h:34
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:125
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:397
int16_t top() const
Definition: rect.h:58
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1538
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1481
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
float x_height() const
Definition: ocrrow.h:64
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:35
UNICHARSET unicharset
Definition: ccutil.h:68
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:740
void fix_quotes()
Definition: pageres.cpp:1024
int16_t reject_count()
Definition: rejctmap.h:229
void PrintBestChoices() const
Definition: pageres.cpp:723
void init_to_size(int size, const T &t)
STRING lang
Definition: ccutil.h:66
bool is_italic() const
Definition: fontinfo.h:111
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
#define FALSE
Definition: capi.h:52
A.B.C.
Definition: control.h:35
int8_t bold
Definition: pageres.h:302
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:308
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
bool deadline_exceeded() const
Definition: ocrclass.h:164
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
bool right_to_left() const
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
bool tess_accepted
Definition: pageres.h:296
const double kMinRefitXHeightFraction
Definition: control.cpp:56
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:92
double classify_max_rating_ratio
Definition: classify.h:443
FCOORD classify_rotation() const
Definition: ocrblock.h:142
int length() const
Definition: genericvector.h:85
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
void plot(ScrollView *window)
Definition: blobs.cpp:907
WERD_RES * word() const
Definition: pageres.h:751
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
void BestChoiceToCorrectText()
Definition: pageres.cpp:929
Definition: werd.h:59
ParamsVectors * params()
Definition: ccutil.h:62
int8_t fontinfo_id_count
Definition: pageres.h:306
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
Definition: ocrrow.h:36
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73
int32_t char_count
Definition: pageres.h:79
bool IsText() const
Definition: polyblk.h:49
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
Definition: ocrblock.h:30
int length() const
Definition: ratngs.h:303
PAGE_RES * page_res
Definition: pageres.h:677
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
float caps_height
Definition: pageres.h:312
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:808
ALL but initial lc.
Definition: control.h:33
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
float min_x_height() const
Definition: ratngs.h:336
int push_back(T object)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:618
void add_str_int(const char *str, int number)
Definition: strngs.cpp:379
bool done
Definition: pageres.h:298
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1725
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:67
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1152
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
float rating() const
Definition: ratngs.h:80
tesseract::Tesseract * tesseract
Definition: pageres.h:282
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:50
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:473
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
float x_height
Definition: pageres.h:311
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
Definition: strngs.h:45
const STRING & misadaption_debug() const
Definition: blamer.h:131
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
bool contains(const char c) const
Definition: strngs.cpp:187
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:78
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:80
const STRING debug_string() const
Definition: ratngs.h:505
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:218
TBOX bounding_box() const
Definition: stepblob.cpp:255
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
void DeleteCurrentWord()
Definition: pageres.cpp:1450
GenericVector< int > blame_reasons
Definition: pageres.h:87
const char *const kBackUpConfigFile
Definition: control.cpp:53
bool contains(const FCOORD pt) const
Definition: rect.h:333
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1269
const UNICHARSET * uch_set
Definition: pageres.h:206
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
static const double kXHeightCapRatio
Definition: ccstruct.h:37
int length() const
Definition: boxword.h:83
BlamerBundle * blamer_bundle
Definition: pageres.h:246
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:129
Definition: points.h:189
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:283
ALL upper case.
Definition: control.h:32
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
const STRING & unichar_string() const
Definition: ratngs.h:541
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:131
bool part_of_combo
Definition: pageres.h:335
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
bool AnyTessLang() const
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
int32_t universal_id
Definition: fontinfo.h:123
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
bool wordrec_run_blamer
Definition: wordrec.h:237
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:139
bool IsAllSpaces() const
Definition: ratngs.h:521
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180
WERD_RES * forward()
Definition: pageres.h:731
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:552
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
WERD_CHOICE * raw_choice
Definition: pageres.h:240
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:949
bool wordrec_debug_blamer
Definition: wordrec.h:236
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:322
TWERD * chopped_word
Definition: pageres.h:215
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:801
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:716
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2117
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
#define SUBLOC_NORM
Definition: errcode.h:59
int16_t bottom() const
Definition: rect.h:65
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
bool AnyLSTMLang() const
bool top_bottom_useful() const
Definition: unicharset.h:532
Unacceptable word.
Definition: control.h:30
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:122
#define LOC_MM_ADAPT
Definition: errcode.h:52
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
PDBLK pdblk
Definition: ocrblock.h:192
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::BoxWord * box_word
Definition: pageres.h:266
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
int32_t get_total() const
Definition: statistc.h:86
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1019
float y() const
Definition: points.h:211
void print(FILE *fp)
Definition: rejctmap.cpp:323
void rej_word_bad_quality()
Definition: rejctmap.cpp:417
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:756
void initialise(int16_t length)
Definition: rejctmap.cpp:275
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:43
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1504
ROW * row
Definition: pageres.h:143
int32_t x_height() const
return xheight
Definition: ocrblock.h:108
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo2
Definition: pageres.h:305
ALL lower case.
Definition: control.h:31
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189