tesseract  5.0.0-alpha-619-ge9db
reject.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: reject.cpp (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #ifdef DISABLED_LEGACY_ENGINE
25 
26 #include "tesseractclass.h"
27 
28 namespace tesseract {
29 
30 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
31  const WERD_CHOICE &word = *werd_res->best_choice;
32  int dict_word_type = werd_res->tesseract->dict_word(word);
33  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
34 }
35 } // namespace tesseract
36 
37 #else
38 
39 #include "tessvars.h"
40 #include <cctype>
41 #include <cerrno>
42 #include <cstring>
44 #include "reject.h"
45 #include "control.h"
46 #include "docqual.h"
47 #include <tesseract/helpers.h>
48 
49 #include "tesseractclass.h"
50 
52 
53 /*************************************************************************
54  * set_done()
55  *
56  * Set the done flag based on the word acceptability criteria
57  *************************************************************************/
58 
59 namespace tesseract {
// Sets word->done. A word starts out done when Tesseract accepted it and its
// best-choice string contains no blank (' '); the two checks below can then
// clear the flag again.
//   word - result whose done flag is written.
//   pass - the one_ell_conflict() check only runs when pass == 1.
// NOTE(review): this listing skips source lines 66 and 77, so the
// word_from_dict expression below is unterminated and the opening of the
// trailing debug-print block is not shown.
60 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61  word->done = word->tess_accepted &&
62  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
63  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
65  word->best_choice->permuter() == FREQ_DAWG_PERM ||
   // Pass 1 only: a non-dictionary or ambiguous word with a 1/I/l conflict is
   // not done. update_map is false, so the reject map is left untouched here.
67  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68  one_ell_conflict(word, false)) {
69  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70  word->done = false;
71  }
   // Any pass: ambiguous words, and words that are neither from a dictionary
   // nor produced by the number permuter, are not done.
72  if (word->done && ((!word_from_dict &&
73  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75  word->done = false;
76  }
78  tprintf("set_done(): done=%d\n", word->done);
79  word->best_choice->print("");
80  }
81 }
82 
83 
84 /*************************************************************************
85  * make_reject_map()
86  *
87  * Sets the done flag to indicate whether the result is acceptable.
88  *
89  * Sets a reject map for the word.
90  *************************************************************************/
// Builds the reject map for `word`: runs the 0/O flip, sets the done flag via
// set_done(), rejects blanks, applies the heuristic selected by
// tessedit_reject_mode (0 or 5; anything else hits the ASSERT_HOST below),
// optionally rejects blobs near the image edge, and finally runs the hyphen
// flip.
//   word - result whose reject_map / done flag are updated.
//   row  - not referenced in the lines visible in this listing.
//   pass - forwarded to set_done().
// NOTE(review): this listing skips source lines 98, 113, 123, 127, 134-135,
// 141, 154 and 168. Several `if` statements below therefore appear without
// their bodies, and one condition is missing its first operands.
91 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92  int i;
93  int offset;
94 
95  flip_0O(word);
96  check_debug_pt(word, -1); // For trap only
97  set_done(word, pass); // Set acceptance
99  reject_blanks(word);
100  /*
101  0: Rays original heuristic - the baseline
102  */
103  if (tessedit_reject_mode == 0) {
104  if (!word->done)
105  reject_poor_matches(word);
106  } else if (tessedit_reject_mode == 5) {
107  /*
108  5: Reject I/1/l from words where there is no strong contextual confirmation;
109  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110  and the whole of any words which are very small
111  */
112  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
114  } else {
   // update_map is true here, so conflicts are recorded in the reject map.
115  one_ell_conflict(word, true);
116  /*
117  Originally the code here just used the done flag. Now I have duplicated
118  and unpacked the conditions for setting the done flag so that each
119  mechanism can be turned on or off independently. This works WITHOUT
120  affecting the done flag setting.
121  */
122  if (rej_use_tess_accepted && !word->tess_accepted)
124 
125  if (rej_use_tess_blanks &&
126  (strchr (word->best_choice->unichar_string().c_str(), ' ') != nullptr))
128 
129  WERD_CHOICE* best_choice = word->best_choice;
130  if (rej_use_good_perm) {
131  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132  best_choice->permuter() == FREQ_DAWG_PERM ||
133  best_choice->permuter() == USER_DAWG_PERM) &&
136  best_choice->unichar_string().c_str(),
137  best_choice->unichar_lengths().c_str()) !=
138  AC_UNACCEPTABLE)) {
139  // PASSED TEST
140  } else if (best_choice->permuter() == NUMBER_PERM) {
   // Walk the characters (i = character index, offset = byte offset into the
   // string) and reject accepted alphabetic characters in a numeric word.
142  for (i = 0, offset = 0;
143  best_choice->unichar_string()[offset] != '\0';
144  offset += best_choice->unichar_lengths()[i++]) {
145  if (word->reject_map[i].accepted() &&
146  word->uch_set->get_isalpha(
147  best_choice->unichar_string().c_str() + offset,
148  best_choice->unichar_lengths()[i]))
149  word->reject_map[i].setrej_bad_permuter();
150  // rej alpha
151  }
152  }
153  } else {
155  }
156  }
157  /* Ambig word rejection was here once !!*/
158  }
159  } else {
160  tprintf("BAD tessedit_reject_mode\n");
   // Comparing a string literal with nullptr is always false, so this assert
   // always fires when reached.
161  ASSERT_HOST("Fatal error encountered!" == nullptr);
162  }
163 
   // Edge-blob rejection is skipped when tessedit_image_border is -1 or lower.
164  if (tessedit_image_border > -1)
165  reject_edge_blobs(word);
166 
167  check_debug_pt (word, 10);
169  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170  tprintf("Certainty: %f Rating: %f\n",
171  word->best_choice->certainty (), word->best_choice->rating ());
172  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173  }
174 
175  flip_hyphens(word);
176  check_debug_pt(word, 20);
177 }
178 } // namespace tesseract
179 
180 
181 void reject_blanks(WERD_RES *word) {
182  int16_t i;
183  int16_t offset;
184 
185  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186  offset += word->best_choice->unichar_lengths()[i], i += 1) {
187  if (word->best_choice->unichar_string()[offset] == ' ')
188  //rej unrecognised blobs
189  word->reject_map[i].setrej_tess_failure ();
190  }
191 }
192 
193 namespace tesseract {
// Walks the characters of the word's best choice (i = character index,
// offset = byte offset) and marks each one that passes the contains() test
// below as a 1/I/l conflict in the reject map.
// NOTE(review): source line 200 (the start of the `if` condition, i.e. the
// object contains() is called on) is missing from this listing.
194 void Tesseract::reject_I_1_L(WERD_RES *word) {
195  int16_t i;
196  int16_t offset;
197 
198  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  contains (word->best_choice->unichar_string()[offset])) {
202  //rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict ();
204  }
205  }
206 }
207 } // namespace tesseract
208 
209 
210 void reject_poor_matches(WERD_RES *word) {
211  float threshold = compute_reject_threshold(word->best_choice);
212  for (int i = 0; i < word->best_choice->length(); ++i) {
213  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214  word->reject_map[i].setrej_tess_failure();
215  else if (word->best_choice->certainty(i) < threshold)
216  word->reject_map[i].setrej_poor_match();
217  }
218 }
219 
220 
221 /**********************************************************************
222  * compute_reject_threshold
223  *
224  * Set a rejection threshold for this word.
225  * Initially this is a trivial function which looks for the largest
226  * gap in the certainty value.
227  **********************************************************************/
228 
// NOTE(review): source line 229 (the function header) is missing from this
// listing; the index at the end of the page gives the signature as
// `float compute_reject_threshold(WERD_CHOICE *word)`.
// Returns the certainty threshold for `word`: the midpoint of the largest gap
// between adjacent sorted per-blob certainties, or (lowest certainty - 1)
// when no gap search is performed.
230  float threshold; // rejection threshold
231  float bestgap = 0.0f; // biggest gap
232  float gapstart; // bottom of gap
233 
   // Copy the per-blob certainties so they can be sorted independently of
   // the word.
234  int blob_count = word->length();
235  GenericVector<float> ratings;
236  ratings.resize_no_init(blob_count);
237  for (int i = 0; i < blob_count; ++i) {
238  ratings[i] = word->certainty(i);
239  }
240  ratings.sort();
   // NOTE(review): ratings[0] is read unconditionally, including when
   // blob_count is 0 - presumably callers never pass an empty word; confirm.
241  gapstart = ratings[0] - 1; // all reject if none better
   // The gap search only runs for words of three or more blobs; shorter words
   // keep bestgap == 0, so the threshold stays at ratings[0] - 1.
242  if (blob_count >= 3) {
243  for (int index = 0; index < blob_count - 1; index++) {
244  if (ratings[index + 1] - ratings[index] > bestgap) {
245  bestgap = ratings[index + 1] - ratings[index];
246  // find biggest
247  gapstart = ratings[index];
248  }
249  }
250  }
   // Threshold sits halfway across the chosen gap.
251  threshold = gapstart + bestgap / 2;
252 
253  return threshold;
254 }
255 
256 
257 /*************************************************************************
258  * reject_edge_blobs()
259  *
260  * If the word is perilously close to the edge of the image, reject those blobs
261  * in the word which are too close to the edge as they could be clipped.
262  *************************************************************************/
263 namespace tesseract {
// NOTE(review): source line 264 (the function header) is missing from this
// listing; the body below operates on a `word` whose members match WERD_RES.
// If the word's bounding box comes within tessedit_image_border of any image
// edge, every blob whose own box is that close to an edge is marked as an
// edge character in the reject map.
265  TBOX word_box = word->word->bounding_box();
266  // Use the box_word as it is already denormed back to image coordinates.
267  int blobcount = word->box_word->length();
268 
269  if (word_box.left() < tessedit_image_border ||
270  word_box.bottom() < tessedit_image_border ||
271  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
   // The reject map is indexed by blob below, so the two lengths must agree.
273  ASSERT_HOST(word->reject_map.length() == blobcount);
274  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275  TBOX blob_box = word->box_word->BlobBox(blobindex);
276  if (blob_box.left() < tessedit_image_border ||
277  blob_box.bottom() < tessedit_image_border ||
278  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280  word->reject_map[blobindex].setrej_edge_char();
281  // Close to edge
282  }
283  }
284  }
285 }
286 
287 /**********************************************************************
288  * one_ell_conflict()
289  *
290  * Identify words where there is a potential I/l/1 error.
291  * - A bundle of contextual heuristics!
292  **********************************************************************/
// Decides whether the word's best choice has a potential 1/I/l confusion.
//   word_res   - word to examine; its best-choice string is temporarily
//                edited in place while alternative spellings are tried.
//   update_map - when true, conflicting characters are also flagged in
//                word_res->reject_map.
// Returns true when a conflict is detected, false otherwise.
// NOTE(review): source line 344 is missing from this listing, so the
// dict_perm_type expression below is incomplete as shown.
293 bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
294  const char *word;
295  const char *lengths;
296  int16_t word_len; //its length
297  int16_t first_alphanum_index_;
298  int16_t first_alphanum_offset_;
299  int16_t i;
300  int16_t offset;
301  bool non_conflict_set_char; //non conf set a/n?
302  bool conflict = false;
303  bool allow_1s;
304  ACCEPTABLE_WERD_TYPE word_type;
305  bool dict_perm_type;
306  bool dict_word_ok;
307  int dict_word_type;
308 
   // `lengths` holds one entry per character, so its strlen is the character
   // count of the word.
309  word = word_res->best_choice->unichar_string().c_str();
310  lengths = word_res->best_choice->unichar_lengths().c_str();
311  word_len = strlen(lengths);
312  /*
313  If there are no occurrences of the conflict set characters then the word
314  is OK.
315  */
316  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr)
317  return false;
318 
319  /*
320  There is a conflict if there are NO other (confirmed) alphanumerics apart
321  from those in the conflict set.
322  */
323 
324  for (i = 0, offset = 0, non_conflict_set_char = false;
325  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326  non_conflict_set_char =
327  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329  !STRING (conflict_set_I_l_1).contains (word[offset]);
330  if (!non_conflict_set_char) {
331  if (update_map)
332  reject_I_1_L(word_res);
333  return true;
334  }
335 
336  /*
337  If the word is accepted by a dawg permuter, and the first alpha character
338  is "I" or "l", check to see if the alternative is also a dawg word. If it
339  is, then there is a potential error otherwise the word is ok.
340  */
341 
342  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347  dict_word_type = dict_word(*(word_res->best_choice));
348  dict_word_ok = (dict_word_type > 0) &&
349  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350 
351  if ((rej_1Il_use_dict_word && dict_word_ok) ||
352  (rej_1Il_trust_permuter_type && dict_perm_type) ||
353  (dict_perm_type && dict_word_ok)) {
354  first_alphanum_index_ = first_alphanum_index (word, lengths);
355  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
   // Leading single-byte 'I': try the 'l' spelling, then restore the original
   // character on both outcomes.
356  if (lengths[first_alphanum_index_] == 1 &&
357  word[first_alphanum_offset_] == 'I') {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359  if (safe_dict_word(word_res) > 0) {
360  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361  if (update_map)
362  word_res->reject_map[first_alphanum_index_].
363  setrej_1Il_conflict();
364  return true;
365  }
366  else {
367  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368  return false;
369  }
370  }
371 
   // Leading single-byte 'l': the mirror image of the case above.
372  if (lengths[first_alphanum_index_] == 1 &&
373  word[first_alphanum_offset_] == 'l') {
374  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375  if (safe_dict_word(word_res) > 0) {
376  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377  if (update_map)
378  word_res->reject_map[first_alphanum_index_].
379  setrej_1Il_conflict();
380  return true;
381  }
382  else {
383  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384  return false;
385  }
386  }
387  return false;
388  }
389 
390  /*
391  NEW 1Il code. The old code relied on permuter types too much. In fact,
392  tess will use TOP_CHOICE permute for good things like "palette".
393  In this code the string is examined independently to see if it looks like
394  a well formed word.
395  */
396 
397  /*
398  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399  dictionary word.
400  */
   // NOTE(review): on the early `return false` paths below the flipped
   // character is NOT restored, unlike the dictionary-permuter branch above -
   // confirm this is intended.
401  first_alphanum_index_ = first_alphanum_index (word, lengths);
402  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403  if (lengths[first_alphanum_index_] == 1 &&
404  word[first_alphanum_offset_] == 'l') {
405  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406  if (safe_dict_word(word_res) > 0)
407  return false;
408  else
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410  }
411  else if (lengths[first_alphanum_index_] == 1 &&
412  word[first_alphanum_offset_] == 'I') {
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  if (safe_dict_word(word_res) > 0)
415  return false;
416  else
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418  }
419  /*
420  For strings containing digits:
421  If there are no alphas OR the numeric permuter liked the word,
422  reject any non 1 conflict chs
423  Else reject all conflict chs
424  */
425  if (word_contains_non_1_digit (word, lengths)) {
426  allow_1s = (alpha_count (word, lengths) == 0) ||
427  (word_res->best_choice->permuter () == NUMBER_PERM);
428 
   // This declaration shadows the function-level `offset` declared above.
429  int16_t offset;
430  conflict = false;
431  for (i = 0, offset = 0; word[offset] != '\0';
432  offset += word_res->best_choice->unichar_lengths()[i++]) {
433  if ((!allow_1s || (word[offset] != '1')) &&
434  STRING (conflict_set_I_l_1).contains (word[offset])) {
435  if (update_map)
436  word_res->reject_map[i].setrej_1Il_conflict ();
437  conflict = true;
438  }
439  }
440  return conflict;
441  }
442  /*
443  For anything else. See if it conforms to an acceptable word type. If so,
444  treat accordingly.
445  */
446  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
   // Lower-case / initial-cap words: only the first alphanumeric character
   // is checked against the conflict set.
448  first_alphanum_index_ = first_alphanum_index (word, lengths);
449  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451  if (update_map)
452  word_res->reject_map[first_alphanum_index_].
453  setrej_1Il_conflict ();
454  return true;
455  }
456  else
457  return false;
458  }
459  else if (word_type == AC_UPPER_CASE) {
460  return false;
461  }
462  else {
   // Any other word type: every conflict-set character is rejected when
   // update_map is set.
463  if (update_map)
464  reject_I_1_L(word_res);
465  return true;
466  }
467 }
468 
469 
470 int16_t Tesseract::first_alphanum_index(const char *word,
471  const char *word_lengths) {
472  int16_t i;
473  int16_t offset;
474 
475  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477  unicharset.get_isdigit(word + offset, word_lengths[i]))
478  return i;
479  }
480  return -1;
481 }
482 
483 int16_t Tesseract::first_alphanum_offset(const char *word,
484  const char *word_lengths) {
485  int16_t i;
486  int16_t offset;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490  unicharset.get_isdigit(word + offset, word_lengths[i]))
491  return offset;
492  }
493  return -1;
494 }
495 
496 int16_t Tesseract::alpha_count(const char *word,
497  const char *word_lengths) {
498  int16_t i;
499  int16_t offset;
500  int16_t count = 0;
501 
502  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504  count++;
505  }
506  return count;
507 }
508 
509 
510 bool Tesseract::word_contains_non_1_digit(const char* word,
511  const char* word_lengths) {
512  int16_t i;
513  int16_t offset;
514 
515  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517  (word_lengths[i] != 1 || word[offset] != '1'))
518  return true;
519  }
520  return false;
521 }
522 
523 /*************************************************************************
524  * dont_allow_1Il()
525  * Don't unreject LONE accepted 1Il conflict set chars
526  *************************************************************************/
// NOTE(review): source line 527 (the function header) is missing from this
// listing; the index at the end of the page gives the signature as
// `void dont_allow_1Il(WERD_RES *word)`.
// If the only accepted alphanumeric characters in the word belong to the
// 1/I/l conflict set, all of those accepted conflict characters are rejected.
528  int i = 0;
529  int offset;
530  int word_len = word->reject_map.length();
531  const char *s = word->best_choice->unichar_string().c_str();
532  const char *lengths = word->best_choice->unichar_lengths().c_str();
533  bool accepted_1Il = false;
534 
   // First scan: look for an accepted conflict-set character, and bail out as
   // soon as any other accepted alphanumeric is seen.
535  for (i = 0, offset = 0; i < word_len;
536  offset += word->best_choice->unichar_lengths()[i++]) {
537  if (word->reject_map[i].accepted()) {
538  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539  accepted_1Il = true;
540  } else {
541  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542  word->uch_set->get_isdigit(s + offset, lengths[i]))
543  return; // >=1 non 1Il ch accepted
544  }
545  }
546  }
547  if (!accepted_1Il)
548  return; //Nothing to worry about
549 
   // Second scan: reject every accepted conflict-set character.
550  for (i = 0, offset = 0; i < word_len;
551  offset += word->best_choice->unichar_lengths()[i++]) {
552  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553  word->reject_map[i].accepted())
554  word->reject_map[i].setrej_postNN_1Il();
555  }
556 }
557 
558 
559 int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
560  int count = 0;
561  const WERD_CHOICE *best_choice = word_res->best_choice;
562  for (int i = 0; i < word_res->reject_map.length(); ++i) {
563  if ((word_res->reject_map[i].accepted()) &&
564  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566  count++;
567  }
568  }
569  return count;
570 }
571 
572 
573 // reject all if most rejected.
// NOTE(review): source lines 574 (the function header), 578 and 579 are
// missing from this listing, so only the left-hand side of the comparison is
// visible: the fraction of rejected characters in word->reject_map. The limit
// it is compared against and the action taken are not shown.
575  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576 
577  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
580 }
581 
582 
// NOTE(review): source lines 583 (the function header) and 590 (the start of
// the second `if` condition, i.e. the object contains() is called on) are
// missing from this listing.
// Returns true only for a word of two or more characters that are all the
// same unichar, passes the contains() test on its first character, and whose
// character count equals both quality counts from word_char_quality().
584  int16_t char_quality;
585  int16_t accepted_char_quality;
586 
   // Single-character (or empty) words never qualify.
587  if (word->best_choice->unichar_lengths().length() <= 1)
588  return false;
589 
591  contains(word->best_choice->unichar_string()[0]))
592  return false;
593 
   // Every character must have the same unichar id as the first one.
594  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595  for (int i = 1; i < word->best_choice->length(); ++i) {
596  if (word->best_choice->unichar_id(i) != uch_id) return false;
597  }
598 
599  word_char_quality(word, &char_quality, &accepted_char_quality);
600 
601  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602  (char_quality == accepted_char_quality))
603  return true;
604  else
605  return false;
606 }
607 
608 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
609  const WERD_CHOICE &word = *werd_res->best_choice;
610  int dict_word_type = werd_res->tesseract->dict_word(word);
611  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
612 }
613 
614 // Note: After running this function word_res->ratings
615 // might not contain the right BLOB_CHOICE corresponding to each character
616 // in word_res->best_choice.
// Uses blob aspect ratios to reconcile '.' and '-' in the word's best choice:
// a wide enough '.' is changed to '-', and reject-map flags are adjusted for
// certain and suspected hyphens.
// NOTE(review): source line 625 (the condition guarding the early `return`
// below) is missing from this listing.
617 void Tesseract::flip_hyphens(WERD_RES *word_res) {
618  WERD_CHOICE *best_choice = word_res->best_choice;
619  int i;
620  int prev_right = -9999;
621  int next_left;
622  TBOX out_box;
623  float aspect_ratio;
624 
626  return;
627 
628  int num_blobs = word_res->rebuild_word->NumBlobs();
629  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
   // Characters and blobs are walked in lock step; the loop stops at the
   // shorter of the two.
630  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631  TBLOB* blob = word_res->rebuild_word->blobs[i];
632  out_box = blob->bounding_box();
   // The last blob has no right-hand neighbour, so use a sentinel.
633  if (i + 1 == num_blobs)
634  next_left = 9999;
635  else
636  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637  // Don't touch small or touching blobs - it is too dangerous.
638  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
   // The '.' is only rewritten when '-' exists in the unicharset and is
   // enabled.
642  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643  word_res->uch_set->contains_unichar_id(unichar_dash) &&
644  word_res->uch_set->get_enabled(unichar_dash)) {
645  /* Certain HYPHEN */
646  best_choice->set_unichar_id(unichar_dash, i);
647  if (word_res->reject_map[i].rejected())
648  word_res->reject_map[i].setrej_hyphen_accept();
649  }
650  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651  word_res->reject_map[i].accepted())
652  //Suspected HYPHEN
653  word_res->reject_map[i].setrej_hyphen ();
654  }
655  else if (best_choice->unichar_id(i) == unichar_dash) {
656  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657  (word_res->reject_map[i].rejected()))
658  word_res->reject_map[i].setrej_hyphen_accept();
659  //Certain HYPHEN
660 
661  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662  (word_res->reject_map[i].accepted()))
663  //Suspected HYPHEN
664  word_res->reject_map[i].setrej_hyphen();
665  }
666  }
667  prev_right = out_box.right();
668  }
669 }
670 
671 // Note: After running this function word_res->ratings
672 // might not contain the right BLOB_CHOICE corresponding to each character
673 // in word_res->best_choice.
// Rewrites characters in the word's best choice between the digit "0" and
// the letter "O" based on their neighbours (the pattern handled by each rule
// is named in the comment above it). Does nothing when tessedit_flip_0O is
// off, when an upper-case or digit blob fails the vertical-extent check
// below, or when either "0" or "O" is missing from or disabled in the
// unicharset.
674 void Tesseract::flip_0O(WERD_RES *word_res) {
675  WERD_CHOICE *best_choice = word_res->best_choice;
676  int i;
677  TBOX out_box;
678 
679  if (!tessedit_flip_0O)
680  return;
681 
   // Pre-check: abandon the whole word if any upper-case or digit character's
   // blob has an unexpected vertical extent.
682  int num_blobs = word_res->rebuild_word->NumBlobs();
683  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684  TBLOB* blob = word_res->rebuild_word->blobs[i];
685  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687  out_box = blob->bounding_box();
688  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690  return; //Beware words with sub/superscripts
691  }
692  }
693  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695  if (unichar_0 == INVALID_UNICHAR_ID ||
696  !word_res->uch_set->get_enabled(unichar_0) ||
697  unichar_O == INVALID_UNICHAR_ID ||
698  !word_res->uch_set->get_enabled(unichar_O)) {
699  return; // 0 or O are not present/enabled in unicharset
700  }
   // The scan starts at index 1 because every rule looks at the previous
   // character. Several rules advance i themselves, so later rules in the
   // same iteration see the updated index - statement order matters here.
701  for (i = 1; i < best_choice->length(); ++i) {
702  if (best_choice->unichar_id(i) == unichar_0 ||
703  best_choice->unichar_id(i) == unichar_O) {
704  /* A0A */
705  if ((i+1) < best_choice->length() &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708  best_choice->set_unichar_id(unichar_O, i);
709  }
710  /* A00A */
711  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712  (i+1) < best_choice->length() &&
713  (best_choice->unichar_id(i+1) == unichar_0 ||
714  best_choice->unichar_id(i+1) == unichar_O) &&
715  (i+2) < best_choice->length() &&
716  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717  best_choice->set_unichar_id(unichar_O, i);
718  i++;
719  }
720  /* AA0<non digit or end of word> */
721  if ((i > 1) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724  (((i+1) < best_choice->length() &&
725  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728  (i == best_choice->length() - 1))) {
729  best_choice->set_unichar_id(unichar_O, i);
730  }
731  /* 9O9 */
732  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733  (i+1) < best_choice->length() &&
734  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735  best_choice->set_unichar_id(unichar_0, i);
736  }
737  /* 9OOO */
738  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739  (i+2) < best_choice->length() &&
740  (best_choice->unichar_id(i+1) == unichar_0 ||
741  best_choice->unichar_id(i+1) == unichar_O) &&
742  (best_choice->unichar_id(i+2) == unichar_0 ||
743  best_choice->unichar_id(i+2) == unichar_O)) {
744  best_choice->set_unichar_id(unichar_0, i);
745  best_choice->set_unichar_id(unichar_0, i+1);
746  best_choice->set_unichar_id(unichar_0, i+2);
747  i += 2;
748  }
749  /* 9OO<non upper> */
750  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751  (i+2) < best_choice->length() &&
752  (best_choice->unichar_id(i+1) == unichar_0 ||
753  best_choice->unichar_id(i+1) == unichar_O) &&
754  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755  best_choice->set_unichar_id(unichar_0, i);
756  best_choice->set_unichar_id(unichar_0, i+1);
757  i++;
758  }
759  /* 9O<non upper> */
760  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761  (i+1) < best_choice->length() &&
762  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763  best_choice->set_unichar_id(unichar_0, i);
764  }
765  /* 9[.,]OOO.. */
766  if ((i > 1) &&
767  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770  best_choice->unichar_id(i-2) == unichar_O)) {
771  if (best_choice->unichar_id(i-2) == unichar_O) {
772  best_choice->set_unichar_id(unichar_0, i-2);
773  }
   // Convert the whole run of 0/O after the separator; the i-- afterwards
   // compensates for the outer loop's ++i.
774  while (i < best_choice->length() &&
775  (best_choice->unichar_id(i) == unichar_O ||
776  best_choice->unichar_id(i) == unichar_0)) {
777  best_choice->set_unichar_id(unichar_0, i);
778  i++;
779  }
780  i--;
781  }
782  }
783  }
784 }
785 
786 bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
787  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
788 }
789 
790 bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
791  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
792 }
793 } // namespace tesseract
794 
795 #endif // def DISABLED_LEGACY_ENGINE
WERD_RES::done
bool done
Definition: pageres.h:299
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::Tesseract::first_alphanum_index
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:468
reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
tesseract::Tesseract::min_sane_x_ht_pixels
int min_sane_x_ht_pixels
Definition: tesseractclass.h:1042
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
tessvars.h
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:27
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
tesseractclass.h
tesseract::Tesseract::flip_0O
void flip_0O(WERD_RES *word)
Definition: reject.cpp:671
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:195
control.h
tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:103
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
TBOX::top
int16_t top() const
Definition: rect.h:57
STRING
Definition: strngs.h:45
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::Tesseract::rej_1Il_trust_permuter_type
bool rej_1Il_trust_permuter_type
Definition: tesseractclass.h:1032
WERD_RES
Definition: pageres.h:160
tesseract::Tesseract::rej_1Il_use_dict_word
bool rej_1Il_use_dict_word
Definition: tesseractclass.h:1031
tesseract::Tesseract::reject_I_1_L
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191
UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
tesseract::Tesseract::tessedit_image_border
int tessedit_image_border
Definition: tesseractclass.h:1039
tesseract::Tesseract::tessedit_flip_0O
bool tessedit_flip_0O
Definition: tesseractclass.h:1027
tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds
char * ok_repeated_ch_non_alphanum_wds
Definition: tesseractclass.h:1040
tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1744
reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
TBOX::height
int16_t height() const
Definition: rect.h:107
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
genericvector.h
tesseract::Tesseract::count_alphanums
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:375
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
tesseract::Tesseract::rej_use_tess_blanks
bool rej_use_tess_blanks
Definition: tesseractclass.h:1034
tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:83
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:605
tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1026
tesseract::Tesseract::dont_allow_1Il
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:524
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:369
UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868
tesseract::Tesseract::non_O_upper
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:783
tesseract::Tesseract::rej_trust_doc_dawg
bool rej_trust_doc_dawg
Definition: tesseractclass.h:1030
tesseract::Tesseract::tessedit_reject_mode
int tessedit_reject_mode
Definition: tesseractclass.h:1025
CLISTIZEH
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:50
AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:225
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::Tesseract::rej_use_good_perm
bool rej_use_good_perm
Definition: tesseractclass.h:1035
UNICHAR_SPACE
Definition: unicharset.h:34
DENORM::y_scale
float y_scale() const
Definition: normalis.h:269
tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:95
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
TBOX::width
int16_t width() const
Definition: rect.h:114
UNICHARSET
Definition: unicharset.h:145
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:360
tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:253
REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:228
tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract
double rej_whole_of_mostly_reject_word_fract
Definition: tesseractclass.h:1038
tesseract::Tesseract::non_0_digit
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:787
CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
helpers.h
tesseract
Definition: baseapi.h:65
tesseract::Tesseract::tessedit_upper_flip_hyphen
double tessedit_upper_flip_hyphen
Definition: tesseractclass.h:1029
tesseract::Tesseract::tessedit_lower_flip_hyphen
double tessedit_lower_flip_hyphen
Definition: tesseractclass.h:1028
AC_LOWER_CASE
ALL lower case.
Definition: control.h:30
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::Tesseract::flip_hyphens
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:614
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
GenericVector< float >
tesseract::Tesseract::first_alphanum_offset
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:481
reject.h
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
tesseract::Tesseract::rej_use_tess_accepted
bool rej_use_tess_accepted
Definition: tesseractclass.h:1033
STRING::length
int32_t length() const
Definition: strngs.cpp:187
REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:378
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
STRING::contains
bool contains(char c) const
Definition: strngs.cpp:183
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:342
count
int count(LIST var_list)
Definition: oldlist.cpp:79
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
TBLOB
Definition: blobs.h:282
WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:274
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::Tesseract::repeated_nonalphanum_wd
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:580
tesseract::Tesseract::reject_mostly_rejects
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:571
ROW
Definition: ocrrow.h:35
DENORM::x_scale
float x_scale() const
Definition: normalis.h:266
tesseract::Tesseract::one_ell_conflict
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:291
TBOX::right
int16_t right() const
Definition: rect.h:78
tesseract::Tesseract::set_done
void set_done(WERD_RES *word, int16_t pass)
tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1041
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:250
tesseract::Tesseract::reject_edge_blobs
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:263
tesseract::Tesseract::rej_use_sensible_wd
bool rej_use_sensible_wd
Definition: tesseractclass.h:1036
WERD_RES::word
WERD * word
Definition: pageres.h:180
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
REJMAP::rej_word_mostly_rej
void rej_word_mostly_rej()
Definition: rejctmap.cpp:405
tesseract::Tesseract::rej_alphas_in_number_perm
bool rej_alphas_in_number_perm
Definition: tesseractclass.h:1037
GenericVector::sort
void sort()
Definition: genericvector.h:1102
DOC_DAWG_PERM
Definition: ratngs.h:240
AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
FREQ_DAWG_PERM
Definition: ratngs.h:242
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
tesseract::Tesseract::word_contains_non_1_digit
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:508
NUMBER_PERM
Definition: ratngs.h:237
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Tesseract::alpha_count
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:494
tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1848
TBOX
Definition: rect.h:33
docqual.h