tesseract  4.0.0-1-g2a2b
reject.cpp File Reference
#include "tessvars.h"
#include <cctype>
#include <cerrno>
#include <cstring>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 55 of file reject.cpp.

63  {
64 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
65  word->done = word->tess_accepted &&
66  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
67  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
68  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
69  word->best_choice->permuter() == FREQ_DAWG_PERM ||
71  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
72  one_ell_conflict(word, false)) {
73  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
74  word->done = FALSE;
75  }
76  if (word->done && ((!word_from_dict &&
77  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
79  word->done = FALSE;
80  }
81  if (tessedit_rejection_debug) {
82  tprintf("set_done(): done=%d\n", word->done);
83  word->best_choice->print("");
84  }
85 }
86 
87 
88 /*************************************************************************
89  * make_reject_map()
90  *
91  * Sets the done flag to indicate whether the resylt is acceptable.
92  *
93  * Sets a reject map for the word.
94  *************************************************************************/
95 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
96  int i;
97  int offset;
98 
99  flip_0O(word);
100  check_debug_pt(word, -1); // For trap only
101  set_done(word, pass); // Set acceptance
103  reject_blanks(word);
104  /*
105  0: Rays original heuristic - the baseline
106  */
107  if (tessedit_reject_mode == 0) {
108  if (!word->done)
109  reject_poor_matches(word);
110  } else if (tessedit_reject_mode == 5) {
111  /*
112  5: Reject I/1/l from words where there is no strong contextual confirmation;
113  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
114  and the whole of any words which are very small
115  */
116  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
118  } else {
119  one_ell_conflict(word, true);
120  /*
121  Originally the code here just used the done flag. Now I have duplicated
122  and unpacked the conditions for setting the done flag so that each
123  mechanism can be turned on or off independently. This works WITHOUT
124  affecting the done flag setting.
125  */
126  if (rej_use_tess_accepted && !word->tess_accepted)
128 
129  if (rej_use_tess_blanks &&
130  (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
132 
133  WERD_CHOICE* best_choice = word->best_choice;
134  if (rej_use_good_perm) {
135  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
136  best_choice->permuter() == FREQ_DAWG_PERM ||
137  best_choice->permuter() == USER_DAWG_PERM) &&
138  (!rej_use_sensible_wd ||
139  acceptable_word_string(*word->uch_set,
140  best_choice->unichar_string().string(),
141  best_choice->unichar_lengths().string()) !=
142  AC_UNACCEPTABLE)) {
143  // PASSED TEST
144  } else if (best_choice->permuter() == NUMBER_PERM) {
145  if (rej_alphas_in_number_perm) {
146  for (i = 0, offset = 0;
147  best_choice->unichar_string()[offset] != '\0';
148  offset += best_choice->unichar_lengths()[i++]) {
149  if (word->reject_map[i].accepted() &&
150  word->uch_set->get_isalpha(
151  best_choice->unichar_string().string() + offset,
152  best_choice->unichar_lengths()[i]))
153  word->reject_map[i].setrej_bad_permuter();
154  // rej alpha
155  }
156  }
157  } else {
159  }
160  }
161  /* Ambig word rejection was here once !!*/
162  }
163  } else {
164  tprintf("BAD tessedit_reject_mode\n");
165  err_exit();
166  }
167 
168  if (tessedit_image_border > -1)
169  reject_edge_blobs(word);
170 
171  check_debug_pt (word, 10);
172  if (tessedit_rejection_debug) {
173  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
174  tprintf("Certainty: %f Rating: %f\n",
175  word->best_choice->certainty (), word->best_choice->rating ());
176  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
177  }
178 
179  flip_hyphens(word);
180  check_debug_pt(word, 20);
181 }
182 } // namespace tesseract
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
void print() const
Definition: ratngs.h:580
uint8_t permuter() const
Definition: ratngs.h:346
float y_scale() const
Definition: normalis.h:270
const int kBlnXHeight
Definition: normalis.h:24
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool dangerous_ambig_found() const
Definition: ratngs.h:363
const STRING & unichar_lengths() const
Definition: ratngs.h:548
DENORM denorm
Definition: pageres.h:204
void flip_0O(WERD_RES *word)
#define FALSE
Definition: capi.h:52
bool tess_accepted
Definition: pageres.h:296
void flip_hyphens(WERD_RES *word)
void rej_word_bad_permuter()
Definition: rejctmap.cpp:381
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:185
Definition: ocrrow.h:36
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool done
Definition: pageres.h:298
void rej_word_small_xht()
Definition: rejctmap.cpp:345
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:214
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:363
void rej_word_contains_blanks()
Definition: rejctmap.cpp:372
void err_exit()
Definition: globaloc.cpp:75
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
void initialise(int16_t length)
Definition: rejctmap.cpp:275

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word)

Definition at line 233 of file reject.cpp.

233  {
234  float threshold; // rejection threshold
235  float bestgap = 0.0f; // biggest gap
236  float gapstart; // bottom of gap
237 
238  int blob_count = word->length();
239  GenericVector<float> ratings;
240  ratings.resize_no_init(blob_count);
241  for (int i = 0; i < blob_count; ++i) {
242  ratings[i] = word->certainty(i);
243  }
244  ratings.sort();
245  gapstart = ratings[0] - 1; // all reject if none better
246  if (blob_count >= 3) {
247  for (int index = 0; index < blob_count - 1; index++) {
248  if (ratings[index + 1] - ratings[index] > bestgap) {
249  bestgap = ratings[index + 1] - ratings[index];
250  // find biggest
251  gapstart = ratings[index];
252  }
253  }
254  }
255  threshold = gapstart + bestgap / 2;
256 
257  return threshold;
258 }
void resize_no_init(int size)
Definition: genericvector.h:65
float certainty() const
Definition: ratngs.h:330
int length() const
Definition: ratngs.h:303

◆ reject_blanks()

void reject_blanks ( WERD_RES * word)

Definition at line 185 of file reject.cpp.

185  {
186  int16_t i;
187  int16_t offset;
188 
189  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
190  offset += word->best_choice->unichar_lengths()[i], i += 1) {
191  if (word->best_choice->unichar_string()[offset] == ' ')
192  //rej unrecognised blobs
193  word->reject_map[i].setrej_tess_failure ();
194  }
195 }
REJMAP reject_map
Definition: pageres.h:287
const STRING & unichar_lengths() const
Definition: ratngs.h:548
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word)

Definition at line 214 of file reject.cpp.

214  {
215  float threshold = compute_reject_threshold(word->best_choice);
216  for (int i = 0; i < word->best_choice->length(); ++i) {
217  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
218  word->reject_map[i].setrej_tess_failure();
219  else if (word->best_choice->certainty(i) < threshold)
220  word->reject_map[i].setrej_poor_match();
221  }
222 }
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:233
REJMAP reject_map
Definition: pageres.h:287
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
WERD_CHOICE * best_choice
Definition: pageres.h:235