tesseract  5.0.0-alpha-619-ge9db
reject.cpp File Reference
#include "tessvars.h"
#include <cctype>
#include <cerrno>
#include <cstring>
#include <tesseract/genericvector.h>
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include <tesseract/helpers.h>
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 50 of file reject.cpp.

59  {
60 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61  word->done = word->tess_accepted &&
62  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
63  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
65  word->best_choice->permuter() == FREQ_DAWG_PERM ||
67  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68  one_ell_conflict(word, false)) {
69  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70  word->done = false;
71  }
72  if (word->done && ((!word_from_dict &&
73  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75  word->done = false;
76  }
77  if (tessedit_rejection_debug) {
78  tprintf("set_done(): done=%d\n", word->done);
79  word->best_choice->print("");
80  }
81 }
82 
83 
84 /*************************************************************************
85  * make_reject_map()
86  *
87  * Sets the done flag to indicate whether the result is acceptable.
88  *
89  * Sets a reject map for the word.
90  *************************************************************************/
91 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92  int i;
93  int offset;
94 
95  flip_0O(word);
96  check_debug_pt(word, -1); // For trap only
97  set_done(word, pass); // Set acceptance
99  reject_blanks(word);
100  /*
101  0: Ray's original heuristic - the baseline
102  */
103  if (tessedit_reject_mode == 0) {
104  if (!word->done)
105  reject_poor_matches(word);
106  } else if (tessedit_reject_mode == 5) {
107  /*
108  5: Reject I/1/l from words where there is no strong contextual confirmation;
109  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110  and the whole of any words which are very small
111  */
112  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
114  } else {
115  one_ell_conflict(word, true);
116  /*
117  Originally the code here just used the done flag. Now I have duplicated
118  and unpacked the conditions for setting the done flag so that each
119  mechanism can be turned on or off independently. This works WITHOUT
120  affecting the done flag setting.
121  */
122  if (rej_use_tess_accepted && !word->tess_accepted)
124 
125  if (rej_use_tess_blanks &&
126  (strchr (word->best_choice->unichar_string().c_str(), ' ') != nullptr))
128 
129  WERD_CHOICE* best_choice = word->best_choice;
130  if (rej_use_good_perm) {
131  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132  best_choice->permuter() == FREQ_DAWG_PERM ||
133  best_choice->permuter() == USER_DAWG_PERM) &&
134  (!rej_use_sensible_wd ||
135  acceptable_word_string(*word->uch_set,
136  best_choice->unichar_string().c_str(),
137  best_choice->unichar_lengths().c_str()) !=
138  AC_UNACCEPTABLE)) {
139  // PASSED TEST
140  } else if (best_choice->permuter() == NUMBER_PERM) {
141  if (rej_alphas_in_number_perm) {
142  for (i = 0, offset = 0;
143  best_choice->unichar_string()[offset] != '\0';
144  offset += best_choice->unichar_lengths()[i++]) {
145  if (word->reject_map[i].accepted() &&
146  word->uch_set->get_isalpha(
147  best_choice->unichar_string().c_str() + offset,
148  best_choice->unichar_lengths()[i]))
149  word->reject_map[i].setrej_bad_permuter();
150  // rej alpha
151  }
152  }
153  } else {
155  }
156  }
157  /* Ambig word rejection was here once !!*/
158  }
159  } else {
160  tprintf("BAD tessedit_reject_mode\n");
161  ASSERT_HOST("Fatal error encountered!" == nullptr);
162  }
163 
164  if (tessedit_image_border > -1)
165  reject_edge_blobs(word);
166 
167  check_debug_pt (word, 10);
168  if (tessedit_rejection_debug) {
169  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170  tprintf("Certainty: %f Rating: %f\n",
171  word->best_choice->certainty (), word->best_choice->rating ());
172  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173  }
174 
175  flip_hyphens(word);

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word)

Definition at line 225 of file reject.cpp.

229  {
230  float threshold; // rejection threshold
231  float bestgap = 0.0f; // biggest gap
232  float gapstart; // bottom of gap
233 
234  int blob_count = word->length();
235  GenericVector<float> ratings;
236  ratings.resize_no_init(blob_count);
237  for (int i = 0; i < blob_count; ++i) {
238  ratings[i] = word->certainty(i);
239  }
240  ratings.sort();
241  gapstart = ratings[0] - 1; // all reject if none better
242  if (blob_count >= 3) {
243  for (int index = 0; index < blob_count - 1; index++) {
244  if (ratings[index + 1] - ratings[index] > bestgap) {
245  bestgap = ratings[index + 1] - ratings[index];
246  // find biggest
247  gapstart = ratings[index];
248  }
249  }
250  }

◆ reject_blanks()

void reject_blanks ( WERD_RES * word)

Definition at line 178 of file reject.cpp.

181  {
182  int16_t i;
183  int16_t offset;
184 
185  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186  offset += word->best_choice->unichar_lengths()[i], i += 1) {
187  if (word->best_choice->unichar_string()[offset] == ' ')
188  //rej unrecognised blobs

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word)

Definition at line 207 of file reject.cpp.

210  {
211  float threshold = compute_reject_threshold(word->best_choice);
212  for (int i = 0; i < word->best_choice->length(); ++i) {
213  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214  word->reject_map[i].setrej_tess_failure();
215  else if (word->best_choice->certainty(i) < threshold)
WERD_RES::done
bool done
Definition: pageres.h:299
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
flip_hyphens
void flip_hyphens(WERD_RES *word)
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
flip_0O
void flip_0O(WERD_RES *word)
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:195
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
WERD_RES
Definition: pageres.h:160
reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:369
compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:225
UNICHAR_SPACE
Definition: unicharset.h:34
DENORM::y_scale
float y_scale() const
Definition: normalis.h:269
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:360
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
GenericVector< float >
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
STRING::length
int32_t length() const
Definition: strngs.cpp:187
REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:378
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:342
ROW
Definition: ocrrow.h:35
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
GenericVector::sort
void sort()
Definition: genericvector.h:1102
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
FREQ_DAWG_PERM
Definition: ratngs.h:242
NUMBER_PERM
Definition: ratngs.h:237
USER_DAWG_PERM
Definition: ratngs.h:241