tesseract  4.0.0-1-g2a2b
reject.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: reject.cpp (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author: Phil Cheatle
5  * Created: Wed Sep 23 16:50:21 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef DISABLED_LEGACY_ENGINE
26 
27 #include "tesseractclass.h"
28 
29 namespace tesseract {
30 
31 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
32  const WERD_CHOICE &word = *werd_res->best_choice;
33  int dict_word_type = werd_res->tesseract->dict_word(word);
34  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
35 }
36 } // namespace tesseract
37 
38 #else
39 
40 #include "tessvars.h"
41 #include <cctype>
42 #include <cerrno>
43 #include <cstring>
44 #include "genericvector.h"
45 #include "reject.h"
46 #include "control.h"
47 #include "docqual.h"
48 #include "globaloc.h" // For err_exit.
49 #include "globals.h"
50 #include "helpers.h"
51 
52 #include "tesseractclass.h"
53 
54 
56 
57 /*************************************************************************
58  * set_done()
59  *
60  * Set the done flag based on the word acceptability criteria
61  *************************************************************************/
62 
63 namespace tesseract {
64 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
65  word->done = word->tess_accepted &&
66  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
67  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
68  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
69  word->best_choice->permuter() == FREQ_DAWG_PERM ||
71  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
72  one_ell_conflict(word, false)) {
73  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
74  word->done = FALSE;
75  }
76  if (word->done && ((!word_from_dict &&
77  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
79  word->done = FALSE;
80  }
82  tprintf("set_done(): done=%d\n", word->done);
83  word->best_choice->print("");
84  }
85 }
86 
87 
88 /*************************************************************************
89  * make_reject_map()
90  *
91  * Sets the done flag to indicate whether the result is acceptable.
92  *
93  * Sets a reject map for the word.
94  *************************************************************************/
95 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
96  int i;
97  int offset;
98 
99  flip_0O(word);
100  check_debug_pt(word, -1); // For trap only
101  set_done(word, pass); // Set acceptance
103  reject_blanks(word);
104  /*
105  0: Rays original heuristic - the baseline
106  */
107  if (tessedit_reject_mode == 0) {
108  if (!word->done)
109  reject_poor_matches(word);
110  } else if (tessedit_reject_mode == 5) {
111  /*
112  5: Reject I/1/l from words where there is no strong contextual confirmation;
113  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
114  and the whole of any words which are very small
115  */
116  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
118  } else {
119  one_ell_conflict(word, true);
120  /*
121  Originally the code here just used the done flag. Now I have duplicated
122  and unpacked the conditions for setting the done flag so that each
123  mechanism can be turned on or off independently. This works WITHOUT
124  affecting the done flag setting.
125  */
126  if (rej_use_tess_accepted && !word->tess_accepted)
128 
129  if (rej_use_tess_blanks &&
130  (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
132 
133  WERD_CHOICE* best_choice = word->best_choice;
134  if (rej_use_good_perm) {
135  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
136  best_choice->permuter() == FREQ_DAWG_PERM ||
137  best_choice->permuter() == USER_DAWG_PERM) &&
140  best_choice->unichar_string().string(),
141  best_choice->unichar_lengths().string()) !=
142  AC_UNACCEPTABLE)) {
143  // PASSED TEST
144  } else if (best_choice->permuter() == NUMBER_PERM) {
146  for (i = 0, offset = 0;
147  best_choice->unichar_string()[offset] != '\0';
148  offset += best_choice->unichar_lengths()[i++]) {
149  if (word->reject_map[i].accepted() &&
150  word->uch_set->get_isalpha(
151  best_choice->unichar_string().string() + offset,
152  best_choice->unichar_lengths()[i]))
153  word->reject_map[i].setrej_bad_permuter();
154  // rej alpha
155  }
156  }
157  } else {
159  }
160  }
161  /* Ambig word rejection was here once !!*/
162  }
163  } else {
164  tprintf("BAD tessedit_reject_mode\n");
165  err_exit();
166  }
167 
168  if (tessedit_image_border > -1)
169  reject_edge_blobs(word);
170 
171  check_debug_pt (word, 10);
173  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
174  tprintf("Certainty: %f Rating: %f\n",
175  word->best_choice->certainty (), word->best_choice->rating ());
176  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
177  }
178 
179  flip_hyphens(word);
180  check_debug_pt(word, 20);
181 }
182 } // namespace tesseract
183 
184 
185 void reject_blanks(WERD_RES *word) {
186  int16_t i;
187  int16_t offset;
188 
189  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
190  offset += word->best_choice->unichar_lengths()[i], i += 1) {
191  if (word->best_choice->unichar_string()[offset] == ' ')
192  //rej unrecognised blobs
193  word->reject_map[i].setrej_tess_failure ();
194  }
195 }
196 
197 namespace tesseract {
199  int16_t i;
200  int16_t offset;
201 
202  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
203  offset += word->best_choice->unichar_lengths()[i], i += 1) {
205  contains (word->best_choice->unichar_string()[offset])) {
206  //rej 1Il conflict
207  word->reject_map[i].setrej_1Il_conflict ();
208  }
209  }
210 }
211 } // namespace tesseract
212 
213 
215  float threshold = compute_reject_threshold(word->best_choice);
216  for (int i = 0; i < word->best_choice->length(); ++i) {
217  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
218  word->reject_map[i].setrej_tess_failure();
219  else if (word->best_choice->certainty(i) < threshold)
220  word->reject_map[i].setrej_poor_match();
221  }
222 }
223 
224 
225 /**********************************************************************
226  * compute_reject_threshold
227  *
228  * Set a rejection threshold for this word.
229  * Initially this is a trivial function which looks for the largest
230  * gap in the certainty value.
231  **********************************************************************/
232 
234  float threshold; // rejection threshold
235  float bestgap = 0.0f; // biggest gap
236  float gapstart; // bottom of gap
237 
238  int blob_count = word->length();
239  GenericVector<float> ratings;
240  ratings.resize_no_init(blob_count);
241  for (int i = 0; i < blob_count; ++i) {
242  ratings[i] = word->certainty(i);
243  }
244  ratings.sort();
245  gapstart = ratings[0] - 1; // all reject if none better
246  if (blob_count >= 3) {
247  for (int index = 0; index < blob_count - 1; index++) {
248  if (ratings[index + 1] - ratings[index] > bestgap) {
249  bestgap = ratings[index + 1] - ratings[index];
250  // find biggest
251  gapstart = ratings[index];
252  }
253  }
254  }
255  threshold = gapstart + bestgap / 2;
256 
257  return threshold;
258 }
259 
260 
261 /*************************************************************************
262  * reject_edge_blobs()
263  *
264  * If the word is perilously close to the edge of the image, reject those blobs
265  * in the word which are too close to the edge as they could be clipped.
266  *************************************************************************/
267 namespace tesseract {
269  TBOX word_box = word->word->bounding_box();
270  // Use the box_word as it is already denormed back to image coordinates.
271  int blobcount = word->box_word->length();
272 
273  if (word_box.left() < tessedit_image_border ||
274  word_box.bottom() < tessedit_image_border ||
275  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
276  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
277  ASSERT_HOST(word->reject_map.length() == blobcount);
278  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
279  TBOX blob_box = word->box_word->BlobBox(blobindex);
280  if (blob_box.left() < tessedit_image_border ||
281  blob_box.bottom() < tessedit_image_border ||
282  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
283  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
284  word->reject_map[blobindex].setrej_edge_char();
285  // Close to edge
286  }
287  }
288  }
289 }
290 
291 /**********************************************************************
292  * one_ell_conflict()
293  *
294  * Identify words where there is a potential I/l/1 error.
295  * - A bundle of contextual heuristics!
296  **********************************************************************/
297 bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
298  const char *word;
299  const char *lengths;
300  int16_t word_len; //its length
301  int16_t first_alphanum_index_;
302  int16_t first_alphanum_offset_;
303  int16_t i;
304  int16_t offset;
305  bool non_conflict_set_char; //non conf set a/n?
306  bool conflict = false;
307  bool allow_1s;
308  ACCEPTABLE_WERD_TYPE word_type;
309  bool dict_perm_type;
310  bool dict_word_ok;
311  int dict_word_type;
312 
313  word = word_res->best_choice->unichar_string().string ();
314  lengths = word_res->best_choice->unichar_lengths().string();
315  word_len = strlen(lengths);
316  /*
317  If there are no occurrences of the conflict set characters then the word
318  is OK.
319  */
320  if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
321  return false;
322 
323  /*
324  There is a conflict if there are NO other (confirmed) alphanumerics apart
325  from those in the conflict set.
326  */
327 
328  for (i = 0, offset = 0, non_conflict_set_char = false;
329  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
330  non_conflict_set_char =
331  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
332  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
333  !STRING (conflict_set_I_l_1).contains (word[offset]);
334  if (!non_conflict_set_char) {
335  if (update_map)
336  reject_I_1_L(word_res);
337  return true;
338  }
339 
340  /*
341  If the word is accepted by a dawg permuter, and the first alpha character
342  is "I" or "l", check to see if the alternative is also a dawg word. If it
343  is, then there is a potential error otherwise the word is ok.
344  */
345 
346  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
347  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
349  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
350  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
351  dict_word_type = dict_word(*(word_res->best_choice));
352  dict_word_ok = (dict_word_type > 0) &&
353  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
354 
355  if ((rej_1Il_use_dict_word && dict_word_ok) ||
356  (rej_1Il_trust_permuter_type && dict_perm_type) ||
357  (dict_perm_type && dict_word_ok)) {
358  first_alphanum_index_ = first_alphanum_index (word, lengths);
359  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
360  if (lengths[first_alphanum_index_] == 1 &&
361  word[first_alphanum_offset_] == 'I') {
362  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
363  if (safe_dict_word(word_res) > 0) {
364  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365  if (update_map)
366  word_res->reject_map[first_alphanum_index_].
367  setrej_1Il_conflict();
368  return true;
369  }
370  else {
371  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
372  return false;
373  }
374  }
375 
376  if (lengths[first_alphanum_index_] == 1 &&
377  word[first_alphanum_offset_] == 'l') {
378  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
379  if (safe_dict_word(word_res) > 0) {
380  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
381  if (update_map)
382  word_res->reject_map[first_alphanum_index_].
383  setrej_1Il_conflict();
384  return true;
385  }
386  else {
387  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
388  return false;
389  }
390  }
391  return false;
392  }
393 
394  /*
395  NEW 1Il code. The old code relied on permuter types too much. In fact,
396  tess will use TOP_CHOICE permute for good things like "palette".
397  In this code the string is examined independently to see if it looks like
398  a well formed word.
399  */
400 
401  /*
402  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
403  dictionary word.
404  */
405  first_alphanum_index_ = first_alphanum_index (word, lengths);
406  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
407  if (lengths[first_alphanum_index_] == 1 &&
408  word[first_alphanum_offset_] == 'l') {
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
410  if (safe_dict_word(word_res) > 0)
411  return false;
412  else
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  }
415  else if (lengths[first_alphanum_index_] == 1 &&
416  word[first_alphanum_offset_] == 'I') {
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
418  if (safe_dict_word(word_res) > 0)
419  return false;
420  else
421  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
422  }
423  /*
424  For strings containing digits:
425  If there are no alphas OR the numeric permuter liked the word,
426  reject any non 1 conflict chs
427  Else reject all conflict chs
428  */
429  if (word_contains_non_1_digit (word, lengths)) {
430  allow_1s = (alpha_count (word, lengths) == 0) ||
431  (word_res->best_choice->permuter () == NUMBER_PERM);
432 
433  int16_t offset;
434  conflict = false;
435  for (i = 0, offset = 0; word[offset] != '\0';
436  offset += word_res->best_choice->unichar_lengths()[i++]) {
437  if ((!allow_1s || (word[offset] != '1')) &&
438  STRING (conflict_set_I_l_1).contains (word[offset])) {
439  if (update_map)
440  word_res->reject_map[i].setrej_1Il_conflict ();
441  conflict = true;
442  }
443  }
444  return conflict;
445  }
446  /*
447  For anything else. See if it conforms to an acceptable word type. If so,
448  treat accordingly.
449  */
450  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
451  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
452  first_alphanum_index_ = first_alphanum_index (word, lengths);
453  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
454  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
455  if (update_map)
456  word_res->reject_map[first_alphanum_index_].
457  setrej_1Il_conflict ();
458  return true;
459  }
460  else
461  return false;
462  }
463  else if (word_type == AC_UPPER_CASE) {
464  return false;
465  }
466  else {
467  if (update_map)
468  reject_I_1_L(word_res);
469  return true;
470  }
471 }
472 
473 
474 int16_t Tesseract::first_alphanum_index(const char *word,
475  const char *word_lengths) {
476  int16_t i;
477  int16_t offset;
478 
479  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
480  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
481  unicharset.get_isdigit(word + offset, word_lengths[i]))
482  return i;
483  }
484  return -1;
485 }
486 
487 int16_t Tesseract::first_alphanum_offset(const char *word,
488  const char *word_lengths) {
489  int16_t i;
490  int16_t offset;
491 
492  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
493  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
494  unicharset.get_isdigit(word + offset, word_lengths[i]))
495  return offset;
496  }
497  return -1;
498 }
499 
500 int16_t Tesseract::alpha_count(const char *word,
501  const char *word_lengths) {
502  int16_t i;
503  int16_t offset;
504  int16_t count = 0;
505 
506  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
507  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
508  count++;
509  }
510  return count;
511 }
512 
513 
515  const char* word_lengths) {
516  int16_t i;
517  int16_t offset;
518 
519  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
520  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
521  (word_lengths[i] != 1 || word[offset] != '1'))
522  return true;
523  }
524  return false;
525 }
526 
527 /*************************************************************************
528  * dont_allow_1Il()
529  * Don't unreject LONE accepted 1Il conflict set chars
530  *************************************************************************/
532  int i = 0;
533  int offset;
534  int word_len = word->reject_map.length();
535  const char *s = word->best_choice->unichar_string().string();
536  const char *lengths = word->best_choice->unichar_lengths().string();
537  bool accepted_1Il = false;
538 
539  for (i = 0, offset = 0; i < word_len;
540  offset += word->best_choice->unichar_lengths()[i++]) {
541  if (word->reject_map[i].accepted()) {
542  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
543  accepted_1Il = true;
544  } else {
545  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
546  word->uch_set->get_isdigit(s + offset, lengths[i]))
547  return; // >=1 non 1Il ch accepted
548  }
549  }
550  }
551  if (!accepted_1Il)
552  return; //Nothing to worry about
553 
554  for (i = 0, offset = 0; i < word_len;
555  offset += word->best_choice->unichar_lengths()[i++]) {
556  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
557  word->reject_map[i].accepted())
558  word->reject_map[i].setrej_postNN_1Il();
559  }
560 }
561 
562 
564  int count = 0;
565  const WERD_CHOICE *best_choice = word_res->best_choice;
566  for (int i = 0; i < word_res->reject_map.length(); ++i) {
567  if ((word_res->reject_map[i].accepted()) &&
568  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
569  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
570  count++;
571  }
572  }
573  return count;
574 }
575 
576 
577 // reject all if most rejected.
579  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
580 
581  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
584 }
585 
586 
588  int16_t char_quality;
589  int16_t accepted_char_quality;
590 
591  if (word->best_choice->unichar_lengths().length() <= 1)
592  return false;
593 
595  contains(word->best_choice->unichar_string()[0]))
596  return false;
597 
598  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
599  for (int i = 1; i < word->best_choice->length(); ++i) {
600  if (word->best_choice->unichar_id(i) != uch_id) return false;
601  }
602 
603  word_char_quality(word, row, &char_quality, &accepted_char_quality);
604 
605  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
606  (char_quality == accepted_char_quality))
607  return true;
608  else
609  return false;
610 }
611 
612 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
613  const WERD_CHOICE &word = *werd_res->best_choice;
614  int dict_word_type = werd_res->tesseract->dict_word(word);
615  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
616 }
617 
618 // Note: After running this function word_res->ratings
619 // might not contain the right BLOB_CHOICE corresponding to each character
620 // in word_res->best_choice.
622  WERD_CHOICE *best_choice = word_res->best_choice;
623  int i;
624  int prev_right = -9999;
625  int next_left;
626  TBOX out_box;
627  float aspect_ratio;
628 
630  return;
631 
632  int num_blobs = word_res->rebuild_word->NumBlobs();
633  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
634  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
635  TBLOB* blob = word_res->rebuild_word->blobs[i];
636  out_box = blob->bounding_box();
637  if (i + 1 == num_blobs)
638  next_left = 9999;
639  else
640  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
641  // Don't touch small or touching blobs - it is too dangerous.
642  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
643  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
644  aspect_ratio = out_box.width() / (float) out_box.height();
645  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
646  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
647  word_res->uch_set->contains_unichar_id(unichar_dash) &&
648  word_res->uch_set->get_enabled(unichar_dash)) {
649  /* Certain HYPHEN */
650  best_choice->set_unichar_id(unichar_dash, i);
651  if (word_res->reject_map[i].rejected())
652  word_res->reject_map[i].setrej_hyphen_accept();
653  }
654  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
655  word_res->reject_map[i].accepted())
656  //Suspected HYPHEN
657  word_res->reject_map[i].setrej_hyphen ();
658  }
659  else if (best_choice->unichar_id(i) == unichar_dash) {
660  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
661  (word_res->reject_map[i].rejected()))
662  word_res->reject_map[i].setrej_hyphen_accept();
663  //Certain HYPHEN
664 
665  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
666  (word_res->reject_map[i].accepted()))
667  //Suspected HYPHEN
668  word_res->reject_map[i].setrej_hyphen();
669  }
670  }
671  prev_right = out_box.right();
672  }
673 }
674 
675 // Note: After running this function word_res->ratings
676 // might not contain the right BLOB_CHOICE corresponding to each character
677 // in word_res->best_choice.
678 void Tesseract::flip_0O(WERD_RES *word_res) {
679  WERD_CHOICE *best_choice = word_res->best_choice;
680  int i;
681  TBOX out_box;
682 
683  if (!tessedit_flip_0O)
684  return;
685 
686  int num_blobs = word_res->rebuild_word->NumBlobs();
687  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
688  TBLOB* blob = word_res->rebuild_word->blobs[i];
689  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
690  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
691  out_box = blob->bounding_box();
692  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
693  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
694  return; //Beware words with sub/superscripts
695  }
696  }
697  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
698  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
699  if (unichar_0 == INVALID_UNICHAR_ID ||
700  !word_res->uch_set->get_enabled(unichar_0) ||
701  unichar_O == INVALID_UNICHAR_ID ||
702  !word_res->uch_set->get_enabled(unichar_O)) {
703  return; // 0 or O are not present/enabled in unicharset
704  }
705  for (i = 1; i < best_choice->length(); ++i) {
706  if (best_choice->unichar_id(i) == unichar_0 ||
707  best_choice->unichar_id(i) == unichar_O) {
708  /* A0A */
709  if ((i+1) < best_choice->length() &&
710  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
711  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
712  best_choice->set_unichar_id(unichar_O, i);
713  }
714  /* A00A */
715  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
716  (i+1) < best_choice->length() &&
717  (best_choice->unichar_id(i+1) == unichar_0 ||
718  best_choice->unichar_id(i+1) == unichar_O) &&
719  (i+2) < best_choice->length() &&
720  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
721  best_choice->set_unichar_id(unichar_O, i);
722  i++;
723  }
724  /* AA0<non digit or end of word> */
725  if ((i > 1) &&
726  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
727  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
728  (((i+1) < best_choice->length() &&
729  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
730  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
731  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
732  (i == best_choice->length() - 1))) {
733  best_choice->set_unichar_id(unichar_O, i);
734  }
735  /* 9O9 */
736  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
737  (i+1) < best_choice->length() &&
738  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
739  best_choice->set_unichar_id(unichar_0, i);
740  }
741  /* 9OOO */
742  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
743  (i+2) < best_choice->length() &&
744  (best_choice->unichar_id(i+1) == unichar_0 ||
745  best_choice->unichar_id(i+1) == unichar_O) &&
746  (best_choice->unichar_id(i+2) == unichar_0 ||
747  best_choice->unichar_id(i+2) == unichar_O)) {
748  best_choice->set_unichar_id(unichar_0, i);
749  best_choice->set_unichar_id(unichar_0, i+1);
750  best_choice->set_unichar_id(unichar_0, i+2);
751  i += 2;
752  }
753  /* 9OO<non upper> */
754  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
755  (i+2) < best_choice->length() &&
756  (best_choice->unichar_id(i+1) == unichar_0 ||
757  best_choice->unichar_id(i+1) == unichar_O) &&
758  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
759  best_choice->set_unichar_id(unichar_0, i);
760  best_choice->set_unichar_id(unichar_0, i+1);
761  i++;
762  }
763  /* 9O<non upper> */
764  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
765  (i+1) < best_choice->length() &&
766  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
767  best_choice->set_unichar_id(unichar_0, i);
768  }
769  /* 9[.,]OOO.. */
770  if ((i > 1) &&
771  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
772  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
773  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
774  best_choice->unichar_id(i-2) == unichar_O)) {
775  if (best_choice->unichar_id(i-2) == unichar_O) {
776  best_choice->set_unichar_id(unichar_0, i-2);
777  }
778  while (i < best_choice->length() &&
779  (best_choice->unichar_id(i) == unichar_O ||
780  best_choice->unichar_id(i) == unichar_0)) {
781  best_choice->set_unichar_id(unichar_0, i);
782  i++;
783  }
784  i--;
785  }
786  }
787  }
788 }
789 
790 bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
791  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
792 }
793 
794 bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
795  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
796 }
797 } // namespace tesseract
798 
799 #endif // def DISABLED_LEGACY_ENGINE
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:500
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:474
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
void resize_no_init(int size)
Definition: genericvector.h:65
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:531
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:233
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int count(LIST var_list)
Definition: oldlist.cpp:98
void print() const
Definition: ratngs.h:580
TBOX bounding_box() const
Definition: werd.cpp:159
uint8_t permuter() const
Definition: ratngs.h:346
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
float y_scale() const
Definition: normalis.h:270
int32_t length() const
Definition: rejctmap.h:223
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:514
const int kBlnXHeight
Definition: normalis.h:24
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
const int kBlnBaselineOffset
Definition: normalis.h:25
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:587
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:794
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
bool dangerous_ambig_found() const
Definition: ratngs.h:363
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:487
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int16_t top() const
Definition: rect.h:58
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
DENORM denorm
Definition: pageres.h:204
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:55
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
float x_scale() const
Definition: normalis.h:267
UNICHARSET unicharset
Definition: ccutil.h:68
int16_t reject_count()
Definition: rejctmap.h:229
#define FALSE
Definition: capi.h:52
bool tess_accepted
Definition: pageres.h:296
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
void rej_word_bad_permuter()
Definition: rejctmap.cpp:381
char * ok_repeated_ch_non_alphanum_wds
void set_done(WERD_RES *word, int16_t pass)
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:185
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
Definition: ocrrow.h:36
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
int length() const
Definition: ratngs.h:303
void flip_0O(WERD_RES *word)
Definition: reject.cpp:678
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279
ALL but initial lc.
Definition: control.h:33
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:578
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
bool done
Definition: pageres.h:298
void rej_word_small_xht()
Definition: rejctmap.cpp:345
tesseract::Tesseract * tesseract
Definition: pageres.h:282
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:214
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:268
const UNICHARSET * uch_set
Definition: pageres.h:206
int length() const
Definition: boxword.h:83
ALL upper case.
Definition: control.h:32
const STRING & unichar_string() const
Definition: ratngs.h:541
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:363
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:297
Definition: blobs.h:268
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
void rej_word_contains_blanks()
Definition: rejctmap.cpp:372
void err_exit()
Definition: globaloc.cpp:75
double rej_whole_of_mostly_reject_word_fract
int16_t bottom() const
Definition: rect.h:65
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:621
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:383
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108
tesseract::BoxWord * box_word
Definition: pageres.h:266
void rej_word_mostly_rej()
Definition: rejctmap.cpp:408
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:198
void initialise(int16_t length)
Definition: rejctmap.cpp:275
#define ASSERT_HOST(x)
Definition: errcode.h:84
ALL lower case.
Definition: control.h:31
WERD * word
Definition: pageres.h:189