tessapi/4.0.0/a00164_source.html

 /**********************************************************************
  * File:        reject.cpp  (Formerly reject.c)
  * Description: Rejection functions used in tessedit
  * Author:    Phil Cheatle
  * Created:   Wed Sep 23 16:50:21 BST 1992
  *
  * (C) Copyright 1992, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
 #include "config_auto.h"
 #endif

 #ifdef DISABLED_LEGACY_ENGINE

 #include "tesseractclass.h"

 namespace tesseract {

 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
   const WERD_CHOICE &word = *werd_res->best_choice;
   int dict_word_type = werd_res->tesseract->dict_word(word);
   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
 }
 }  // namespace tesseract

 #else

 #include "tessvars.h"
 #include <cctype>
 #include <cerrno>
 #include <cstring>
 #include "genericvector.h"
 #include "reject.h"
 #include "control.h"
 #include "docqual.h"
 #include "globaloc.h"  // For err_exit.
 #include "globals.h"
 #include "helpers.h"

 #include "tesseractclass.h"


 // Expands the CLISTIZEH and CLISTIZE macros for STRING.
 // NOTE(review): both macros are defined outside this file; presumably they
 // declare and define a CLIST-based list type for STRING - confirm against
 // the list headers.
 CLISTIZEH (STRING) CLISTIZE (STRING)

 /*************************************************************************
  * set_done()
  *
  * Set the done flag based on the word acceptability criteria
  *************************************************************************/

 namespace tesseract {
 // Decides whether `word` can be flagged as done for recognition pass `pass`.
 // The word starts out done when it was tess_accepted and its text holds no
 // blank. It is then un-done if, on pass 1, a non-dictionary or ambiguous
 // word shows a 1/I/l conflict, or if the word is ambiguous or was produced
 // by neither a dictionary permuter nor the number permuter.
 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
   WERD_CHOICE *choice = word->best_choice;
   word->done = word->tess_accepted &&
       (strchr(choice->unichar_string().string(), ' ') == nullptr);

   const bool ambiguous = choice->dangerous_ambig_found();
   const auto permuter_type = choice->permuter();
   const bool from_dictionary = permuter_type == SYSTEM_DAWG_PERM ||
                                permuter_type == FREQ_DAWG_PERM ||
                                permuter_type == USER_DAWG_PERM;

   // one_ell_conflict() is consulted last, only if all cheaper tests hold.
   if (word->done && pass == 1 && (!from_dictionary || ambiguous) &&
       one_ell_conflict(word, false)) {
     if (tessedit_rejection_debug) {
       tprintf("one_ell_conflict detected\n");
     }
     word->done = false;
   }

   const bool untrusted_permuter =
       !from_dictionary && permuter_type != NUMBER_PERM;
   if (word->done && (untrusted_permuter || ambiguous)) {
     if (tessedit_rejection_debug) {
       tprintf("non-dict or ambig word detected\n");
     }
     word->done = false;
   }

   if (tessedit_rejection_debug) {
     tprintf("set_done(): done=%d\n", word->done);
     choice->print("");
   }
 }


 /*************************************************************************
  * make_reject_map()
  *
  * Sets the done flag to indicate whether the result is acceptable.
  *
  * Sets a reject map for the word.
  *************************************************************************/
 // Builds the per-character reject map of `word` and applies the rejection
 // heuristic selected by tessedit_reject_mode. Modes 0 and 5 are handled;
 // any other value is reported and err_exit() is called.
 //   word - word result whose best_choice is examined and whose done flag
 //          and reject_map are written.
 //   row  - not referenced in this function body.
 //   pass - forwarded to set_done().
 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
   int i;
   int offset;

   flip_0O(word);
   check_debug_pt(word, -1);     // For trap only
   set_done(word, pass);  // Set acceptance
   // One reject-map entry per entry in the best choice's lengths string.
   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
   reject_blanks(word);
   /*
   0: Rays original heuristic - the baseline
   */
   if (tessedit_reject_mode == 0) {
     if (!word->done)
       reject_poor_matches(word);
   } else if (tessedit_reject_mode == 5) {
     /*
     5: Reject I/1/l from words where there is no strong contextual confirmation;
       the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
       and the whole of any words which are very small
     */
     // Very small x-height: reject the whole word and skip the other tests.
     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
       word->reject_map.rej_word_small_xht();
     } else {
       // Return value is ignored; called with update_map=true so that it
       // marks conflicting characters in the reject map.
       one_ell_conflict(word, true);
       /*
         Originally the code here just used the done flag. Now I have duplicated
         and unpacked the conditions for setting the done flag so that each
         mechanism can be turned on or off independently. This works WITHOUT
         affecting the done flag setting.
       */
       if (rej_use_tess_accepted && !word->tess_accepted)
         word->reject_map.rej_word_not_tess_accepted ();

       if (rej_use_tess_blanks &&
         (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
         word->reject_map.rej_word_contains_blanks ();

       WERD_CHOICE* best_choice = word->best_choice;
       if (rej_use_good_perm) {
         // Dictionary permuters pass, optionally only if the string also
         // looks like an acceptable word.
         if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
              best_choice->permuter() == FREQ_DAWG_PERM ||
              best_choice->permuter() == USER_DAWG_PERM) &&
             (!rej_use_sensible_wd ||
              acceptable_word_string(*word->uch_set,
                                     best_choice->unichar_string().string(),
                                     best_choice->unichar_lengths().string()) !=
                                         AC_UNACCEPTABLE)) {
           // PASSED TEST
         } else if (best_choice->permuter() == NUMBER_PERM) {
           if (rej_alphas_in_number_perm) {
             // Walk the string one unichar at a time: i counts unichars,
             // offset is the byte position of unichar i.
             for (i = 0, offset = 0;
                  best_choice->unichar_string()[offset] != '\0';
                  offset += best_choice->unichar_lengths()[i++]) {
               if (word->reject_map[i].accepted() &&
                   word->uch_set->get_isalpha(
                       best_choice->unichar_string().string() + offset,
                       best_choice->unichar_lengths()[i]))
                 word->reject_map[i].setrej_bad_permuter();
               // rej alpha
             }
           }
         } else {
           word->reject_map.rej_word_bad_permuter();
         }
       }
       /* Ambig word rejection was here once !!*/
     }
   } else {
     tprintf("BAD tessedit_reject_mode\n");
     err_exit();
   }

   // Edge-blob rejection is skipped when tessedit_image_border is -1 or less.
   if (tessedit_image_border > -1)
     reject_edge_blobs(word);

   check_debug_pt (word, 10);
   if (tessedit_rejection_debug) {
     tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
     tprintf("Certainty: %f     Rating: %f\n",
       word->best_choice->certainty (), word->best_choice->rating ());
     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
   }

   flip_hyphens(word);
   check_debug_pt(word, 20);
 }
 }  // namespace tesseract


 // Marks every unichar of the best choice that starts with a space byte as a
 // tess failure in the word's reject map (unrecognised blobs).
 void reject_blanks(WERD_RES *word) {
   const char *text = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   int16_t blob = 0;  // index of the current unichar
   int16_t pos = 0;   // byte position of that unichar in text
   while (text[pos] != '\0') {
     if (text[pos] == ' ') {
       word->reject_map[blob].setrej_tess_failure();
     }
     pos += lengths[blob];
     ++blob;
   }
 }

 namespace tesseract {
 // Flags every unichar of the best choice whose first byte appears in
 // conflict_set_I_l_1 as a 1/I/l conflict in the word's reject map.
 void Tesseract::reject_I_1_L(WERD_RES *word) {
   STRING conflict_chars = STRING (conflict_set_I_l_1);
   const char *text = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   int16_t blob = 0;  // index of the current unichar
   for (int16_t pos = 0; text[pos] != '\0'; pos += lengths[blob], ++blob) {
     if (conflict_chars.contains(text[pos])) {
       word->reject_map[blob].setrej_1Il_conflict();
     }
   }
 }
 }  // namespace tesseract


 // Rejects individual blobs of `word`: spaces are marked as tess failures,
 // and any other blob whose certainty lies below the threshold computed by
 // compute_reject_threshold() is marked as a poor match.
 void reject_poor_matches(WERD_RES *word) {
   WERD_CHOICE *choice = word->best_choice;
   const float cutoff = compute_reject_threshold(choice);
   for (int blob = 0; blob < choice->length(); ++blob) {
     if (choice->unichar_id(blob) == UNICHAR_SPACE) {
       word->reject_map[blob].setrej_tess_failure();
       continue;
     }
     if (choice->certainty(blob) < cutoff) {
       word->reject_map[blob].setrej_poor_match();
     }
   }
 }


 /**********************************************************************
  * compute_reject_threshold
  *
  * Set a rejection threshold for this word.
  * Initially this is a trivial function which looks for the largest
  * gap in the certainty value.
  **********************************************************************/

 // Returns the certainty threshold used to reject blobs of `word`.
 // The per-blob certainties are collected and sorted. For words with three
 // or more blobs the threshold is the midpoint of the widest gap between
 // neighbouring sorted certainties. With fewer than three blobs, or when no
 // positive gap exists, the threshold is one below the first sorted value.
 // An empty word has no certainties to inspect, so 0 is returned without
 // indexing the (empty) ratings vector.
 float compute_reject_threshold(WERD_CHOICE* word) {
   const int blob_count = word->length();
   if (blob_count <= 0) {
     // Guard: ratings[0] below would read past the end of an empty vector.
     return 0.0f;
   }

   GenericVector<float> ratings;
   ratings.resize_no_init(blob_count);
   for (int i = 0; i < blob_count; ++i) {
     ratings[i] = word->certainty(i);
   }
   ratings.sort();

   float bestgap = 0.0f;             // biggest gap found so far
   float gapstart = ratings[0] - 1;  // bottom of gap; default if none better
   if (blob_count >= 3) {
     for (int index = 0; index < blob_count - 1; index++) {
       const float gap = ratings[index + 1] - ratings[index];
       if (gap > bestgap) {
         // find biggest
         bestgap = gap;
         gapstart = ratings[index];
       }
     }
   }
   return gapstart + bestgap / 2;
 }


 /*************************************************************************
  * reject_edge_blobs()
  *
  * If the word is perilously close to the edge of the image, reject those blobs
  * in the word which are too close to the edge as they could be clipped.
  *************************************************************************/
 namespace tesseract {
 // Marks blobs that come within tessedit_image_border of an image edge as
 // edge characters. Individual blobs are only examined when the bounding box
 // of the whole word is itself that close to an edge.
 void Tesseract::reject_edge_blobs(WERD_RES *word) {
   // True when `box` comes within the configured border of any image edge.
   auto near_image_edge = [this](const TBOX &box) {
     return box.left() < tessedit_image_border ||
            box.bottom() < tessedit_image_border ||
            box.right() + tessedit_image_border > ImageWidth() - 1 ||
            box.top() + tessedit_image_border > ImageHeight() - 1;
   };

   const TBOX word_box = word->word->bounding_box();
   // Use the box_word as it is already denormed back to image coordinates.
   const int blob_total = word->box_word->length();
   if (!near_image_edge(word_box)) {
     return;
   }

   ASSERT_HOST(word->reject_map.length() == blob_total);
   for (int blob = 0; blob < blob_total; ++blob) {
     if (near_image_edge(word->box_word->BlobBox(blob))) {
       // Close to edge
       word->reject_map[blob].setrej_edge_char();
     }
   }
 }

 /**********************************************************************
  * one_ell_conflict()
  *
  * Identify words where there is a potential I/l/1 error.
  * - A bundle of contextual heuristics!
  **********************************************************************/
 // Decides whether the best choice of `word_res` contains a potential 1/I/l
 // confusion, based on the characters listed in conflict_set_I_l_1.
 //   word_res   - word whose best_choice string and permuter are examined.
 //   update_map - when true, the offending characters are also marked in
 //                word_res->reject_map (setrej_1Il_conflict() directly or
 //                through reject_I_1_L()).
 // Returns true when a conflict is detected, false otherwise.
 // The trial "flips" below write single characters straight into the buffer
 // returned by best_choice->unichar_string() and then call safe_dict_word().
 // NOTE(review): whether dict_word() actually sees those in-place string
 // edits depends on WERD_CHOICE internals not visible here - confirm.
 bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
   const char *word;
   const char *lengths;
   int16_t word_len;                //its length
   int16_t first_alphanum_index_;
   int16_t first_alphanum_offset_;
   int16_t i;
   int16_t offset;
   bool non_conflict_set_char;   //non conf set a/n?
   bool conflict = false;
   bool allow_1s;
   ACCEPTABLE_WERD_TYPE word_type;
   bool dict_perm_type;
   bool dict_word_ok;
   int dict_word_type;

   word = word_res->best_choice->unichar_string().string ();
   lengths = word_res->best_choice->unichar_lengths().string();
   // Number of unichars: the lengths string holds one byte per unichar.
   word_len = strlen(lengths);
   /*
     If there are no occurrences of the conflict set characters then the word
     is OK.
   */
   if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
     return false;

   /*
     There is a conflict if there are NO other (confirmed) alphanumerics apart
     from those in the conflict set.
   */

   // Stops at the first alphanumeric unichar that is not in the conflict set.
   for (i = 0, offset = 0, non_conflict_set_char = false;
        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
     non_conflict_set_char =
         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
         !STRING (conflict_set_I_l_1).contains (word[offset]);
   if (!non_conflict_set_char) {
     if (update_map)
       reject_I_1_L(word_res);
     return true;
   }

   /*
     If the word is accepted by a dawg permuter, and the first alpha character
     is "I" or "l", check to see if the alternative is also a dawg word. If it
     is, then there is a potential error otherwise the word is ok.
   */

   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
     (rej_trust_doc_dawg &&
     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
   dict_word_type = dict_word(*(word_res->best_choice));
   dict_word_ok = (dict_word_type > 0) &&
     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

   if ((rej_1Il_use_dict_word && dict_word_ok) ||
     (rej_1Il_trust_permuter_type && dict_perm_type) ||
   (dict_perm_type && dict_word_ok)) {
     // NOTE(review): both helpers return -1 when no alphanumeric is found and
     // they consult `unicharset`, whereas the loop above used
     // word_res->uch_set - confirm the two always agree so the indices below
     // are never negative.
     first_alphanum_index_ = first_alphanum_index (word, lengths);
     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
     // Leading single-byte 'I': try 'l' instead, then restore the 'I'.
     if (lengths[first_alphanum_index_] == 1 &&
         word[first_alphanum_offset_] == 'I') {
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
       if (safe_dict_word(word_res) > 0) {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
         if (update_map)
           word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict();
         return true;
       }
       else {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
         return false;
       }
     }

     // Leading single-byte 'l': try 'I' instead, then restore the 'l'.
     if (lengths[first_alphanum_index_] == 1 &&
         word[first_alphanum_offset_] == 'l') {
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
       if (safe_dict_word(word_res) > 0) {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
         if (update_map)
           word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict();
         return true;
       }
       else {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
         return false;
       }
     }
     return false;
   }

   /*
     NEW 1Il code. The old code relied on permuter types too much. In fact,
     tess will use TOP_CHOICE permute for good things like "palette".
     In this code the string is examined independently to see if it looks like
     a well formed word.
   */

   /*
     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
     dictionary word.
   */
   first_alphanum_index_ = first_alphanum_index (word, lengths);
   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
   // NOTE(review): in both branches below the early `return false` leaves the
   // flipped character in the string buffer; only the failing path restores
   // the original character. The dictionary branch above restores it on
   // every path - confirm the difference is intended.
   if (lengths[first_alphanum_index_] == 1 &&
       word[first_alphanum_offset_] == 'l') {
     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
     if (safe_dict_word(word_res) > 0)
       return false;
     else
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
   }
   else if (lengths[first_alphanum_index_] == 1 &&
            word[first_alphanum_offset_] == 'I') {
     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
     if (safe_dict_word(word_res) > 0)
       return false;
     else
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
   }
   /*
     For strings containing digits:
       If there are no alphas OR the numeric permuter liked the word,
         reject any non 1 conflict chs
       Else reject all conflict chs
   */
   if (word_contains_non_1_digit (word, lengths)) {
     allow_1s = (alpha_count (word, lengths) == 0) ||
       (word_res->best_choice->permuter () == NUMBER_PERM);

     // This declaration shadows the function-level `offset` inside this block.
     int16_t offset;
     conflict = false;
     for (i = 0, offset = 0; word[offset] != '\0';
          offset += word_res->best_choice->unichar_lengths()[i++]) {
       if ((!allow_1s || (word[offset] != '1')) &&
       STRING (conflict_set_I_l_1).contains (word[offset])) {
         if (update_map)
           word_res->reject_map[i].setrej_1Il_conflict ();
         conflict = true;
       }
     }
     return conflict;
   }
   /*
     For anything else. See if it conforms to an acceptable word type. If so,
     treat accordingly.
   */
   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
     // Only the first alphanumeric character is checked for these word types.
     first_alphanum_index_ = first_alphanum_index (word, lengths);
     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
       if (update_map)
         word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict ();
       return true;
     }
     else
       return false;
   }
   else if (word_type == AC_UPPER_CASE) {
     return false;
   }
   else {
     // Any other word type: every conflict-set character is treated as suspect.
     if (update_map)
       reject_I_1_L(word_res);
     return true;
   }
 }


 int16_t Tesseract::first_alphanum_index(const char *word,
                                       const char *word_lengths) {
   int16_t i;
   int16_t offset;

   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
         unicharset.get_isdigit(word + offset, word_lengths[i]))
       return i;
   }
   return -1;
 }

 int16_t Tesseract::first_alphanum_offset(const char *word,
                                        const char *word_lengths) {
   int16_t i;
   int16_t offset;

   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
         unicharset.get_isdigit(word + offset, word_lengths[i]))
       return offset;
   }
   return -1;
 }

 // Counts the unichars in `word` that the unicharset classifies as
 // alphabetic. `word_lengths` holds the byte length of each unichar.
 int16_t Tesseract::alpha_count(const char *word,
                              const char *word_lengths) {
   int16_t alphas = 0;
   int16_t index = 0;
   int16_t offset = 0;
   while (word[offset] != '\0') {
     const int len = word_lengths[index];
     if (unicharset.get_isalpha(word + offset, len)) {
       ++alphas;
     }
     offset += len;
     ++index;
   }
   return alphas;
 }


 // Returns true if `word` holds at least one digit unichar other than the
 // single-byte character '1'. `word_lengths` holds the byte length of each
 // unichar in `word`.
 bool Tesseract::word_contains_non_1_digit(const char* word,
                                           const char* word_lengths) {
   int16_t index = 0;
   int16_t offset = 0;
   while (word[offset] != '\0') {
     const int len = word_lengths[index];
     if (unicharset.get_isdigit(word + offset, len)) {
       const bool is_plain_one = (len == 1) && (word[offset] == '1');
       if (!is_plain_one) {
         return true;
       }
     }
     offset += len;
     ++index;
   }
   return false;
 }

 /*************************************************************************
  * dont_allow_1Il()
  * Don't unreject LONE accepted 1Il conflict set chars
  *************************************************************************/
 // If the only accepted alphanumerics in `word` are conflict-set characters
 // (conflict_set_I_l_1), rejects every accepted conflict-set character.
 // Does nothing as soon as one accepted alphanumeric outside the conflict
 // set is found, or when no conflict-set character is accepted at all.
 void Tesseract::dont_allow_1Il(WERD_RES *word) {
   const int word_len = word->reject_map.length();
   const char *s = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   STRING conflict_chars = STRING(conflict_set_I_l_1);

   // First pass: look for accepted characters on either side of the set.
   bool accepted_1Il = false;
   int offset = 0;
   for (int i = 0; i < word_len; offset += lengths[i], ++i) {
     if (!word->reject_map[i].accepted()) {
       continue;
     }
     if (conflict_chars.contains(s[offset])) {
       accepted_1Il = true;
       continue;
     }
     if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
         word->uch_set->get_isdigit(s + offset, lengths[i])) {
       return;                // >=1 non 1Il ch accepted
     }
   }
   if (!accepted_1Il) {
     return;                  // Nothing to worry about
   }

   // Second pass: reject the accepted conflict-set characters.
   offset = 0;
   for (int i = 0; i < word_len; offset += lengths[i], ++i) {
     if (conflict_chars.contains(s[offset]) &&
         word->reject_map[i].accepted()) {
       word->reject_map[i].setrej_postNN_1Il();
     }
   }
 }


 // Counts the characters of `word_res` that are accepted in the reject map
 // and are classified as alphabetic or digit by the word's unicharset.
 int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
   const WERD_CHOICE *choice = word_res->best_choice;
   int accepted_alphanums = 0;
   for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
     if (!word_res->reject_map[pos].accepted()) {
       continue;
     }
     if (word_res->uch_set->get_isalpha(choice->unichar_id(pos)) ||
         word_res->uch_set->get_isdigit(choice->unichar_id(pos))) {
       ++accepted_alphanums;
     }
   }
   return accepted_alphanums;
 }


 // reject all if most rejected.
 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
   // Reject the whole word once the rejected share of its characters reaches
   // rej_whole_of_mostly_reject_word_fract.
   const float rejected_fraction =
       static_cast<float>(word->reject_map.reject_count()) /
       word->reject_map.length();
   if (rejected_fraction >= rej_whole_of_mostly_reject_word_fract) {
     word->reject_map.rej_word_mostly_rej();
   }
 }


 // Returns true when `word` has more than one character, starts with a
 // character from ok_repeated_ch_non_alphanum_wds, consists of one unichar id
 // repeated throughout, and word_char_quality() reports both the char
 // quality and the accepted char quality equal to the number of characters.
 bool Tesseract::repeated_nonalphanum_wd(WERD_RES* word, ROW* row) {
   WERD_CHOICE *choice = word->best_choice;
   if (choice->unichar_lengths().length() <= 1) {
     return false;
   }

   const char first_byte = choice->unichar_string()[0];
   if (!STRING(ok_repeated_ch_non_alphanum_wds).contains(first_byte)) {
     return false;
   }

   const UNICHAR_ID repeated_id = choice->unichar_id(0);
   for (int pos = 1; pos < choice->length(); ++pos) {
     if (choice->unichar_id(pos) != repeated_id) {
       return false;
     }
   }

   int16_t char_quality;
   int16_t accepted_char_quality;
   word_char_quality(word, row, &char_quality, &accepted_char_quality);

   return (word->best_choice->unichar_lengths().length() == char_quality) &&
          (char_quality == accepted_char_quality);
 }

 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
   const WERD_CHOICE &word = *werd_res->best_choice;
   int dict_word_type = werd_res->tesseract->dict_word(word);
   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
 }

 // Note: After running this function word_res->ratings
 // might not contain the right BLOB_CHOICE corresponding to each character
 // in word_res->best_choice.
 // Uses blob aspect ratios to swap or flag '.' and '-' in the best choice.
 // Disabled when tessedit_lower_flip_hyphen <= 1. Only blobs that are wider
 // than 8 * denorm.x_scale() and do not touch either neighbour are examined.
 void Tesseract::flip_hyphens(WERD_RES *word_res) {
   WERD_CHOICE *choice = word_res->best_choice;
   if (tessedit_lower_flip_hyphen <= 1) {
     return;
   }

   const int blob_total = word_res->rebuild_word->NumBlobs();
   const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
   int prev_right = -9999;  // right edge of the previous blob
   for (int pos = 0; pos < choice->length() && pos < blob_total; ++pos) {
     const TBOX box = word_res->rebuild_word->blobs[pos]->bounding_box();
     int next_left = 9999;  // left edge of the following blob, if any
     if (pos + 1 != blob_total) {
       next_left =
           word_res->rebuild_word->blobs[pos + 1]->bounding_box().left();
     }

     // Don't touch small or touching blobs - it is too dangerous.
     const bool wide_and_isolated =
         (box.width() > 8 * word_res->denorm.x_scale()) &&
         (box.left() > prev_right) && (box.right() < next_left);
     if (wide_and_isolated) {
       const float aspect_ratio =
           box.width() / static_cast<float>(box.height());
       if (word_res->uch_set->eq(choice->unichar_id(pos), ".")) {
         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
             word_res->uch_set->contains_unichar_id(dash_id) &&
             word_res->uch_set->get_enabled(dash_id)) {
           // Certain HYPHEN: replace the '.' and accept it.
           choice->set_unichar_id(dash_id, pos);
           if (word_res->reject_map[pos].rejected()) {
             word_res->reject_map[pos].setrej_hyphen_accept();
           }
         }
         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
             word_res->reject_map[pos].accepted()) {
           // Suspected HYPHEN
           word_res->reject_map[pos].setrej_hyphen();
         }
       } else if (choice->unichar_id(pos) == dash_id) {
         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
             word_res->reject_map[pos].rejected()) {
           // Certain HYPHEN
           word_res->reject_map[pos].setrej_hyphen_accept();
         }
         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
             word_res->reject_map[pos].accepted()) {
           // Suspected HYPHEN
           word_res->reject_map[pos].setrej_hyphen();
         }
       }
     }
     prev_right = box.right();
   }
 }

 // Note: After running this function word_res->ratings
 // might not contain the right BLOB_CHOICE corresponding to each character
 // in word_res->best_choice.
 // Rewrites '0' and 'O' in the best choice of `word_res` according to the
 // neighbouring characters: surrounded by non-O upper case letters it
 // becomes 'O', next to non-zero digits it becomes '0' (the individual
 // patterns are labelled on each branch below).
 // Does nothing when tessedit_flip_0O is off, when an upper case or digit
 // blob fails the box-height test in the first loop, or when "0" or "O" is
 // missing from or disabled in the unicharset.
 void Tesseract::flip_0O(WERD_RES *word_res) {
   WERD_CHOICE *best_choice = word_res->best_choice;
   int i;
   TBOX out_box;

   if (!tessedit_flip_0O)
     return;

   // Bail out entirely if any upper case/digit blob has its top below
   // baseline + x-height or its bottom above baseline + x-height / 4.
   int num_blobs = word_res->rebuild_word->NumBlobs();
   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
       out_box = blob->bounding_box();
       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
         return;                  //Beware words with sub/superscripts
     }
   }
   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
   if (unichar_0 == INVALID_UNICHAR_ID ||
       !word_res->uch_set->get_enabled(unichar_0) ||
       unichar_O == INVALID_UNICHAR_ID ||
       !word_res->uch_set->get_enabled(unichar_O)) {
     return;  // 0 or O are not present/enabled in unicharset
   }
   // Starts at 1 because every pattern needs a preceding character (i-1).
   // Several branches advance i themselves; the branches after them in the
   // same iteration then test the new position.
   // NOTE(review): that carry-over between branches looks deliberate but is
   // easy to break - confirm before reordering any of the branches.
   for (i = 1; i < best_choice->length(); ++i) {
     if (best_choice->unichar_id(i) == unichar_0 ||
         best_choice->unichar_id(i) == unichar_O) {
       /* A0A */
       if ((i+1) < best_choice->length() &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_O, i);
       }
       /* A00A */
       // Only position i is rewritten here; i then moves on to the second
       // 0/O of the pair.
       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
            best_choice->unichar_id(i+1) == unichar_O) &&
           (i+2) < best_choice->length() &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
         best_choice->set_unichar_id(unichar_O, i);
         i++;
       }
       /* AA0<non digit or end of word> */
       if ((i > 1) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (((i+1) < best_choice->length() &&
             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
            (i == best_choice->length() - 1))) {
         best_choice->set_unichar_id(unichar_O, i);
       }
       /* 9O9 */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_0, i);
       }
       /* 9OOO */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+2) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
            best_choice->unichar_id(i+1) == unichar_O) &&
           (best_choice->unichar_id(i+2) == unichar_0 ||
            best_choice->unichar_id(i+2) == unichar_O)) {
         best_choice->set_unichar_id(unichar_0, i);
         best_choice->set_unichar_id(unichar_0, i+1);
         best_choice->set_unichar_id(unichar_0, i+2);
         i += 2;
       }
       /* 9OO<non upper> */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+2) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
         best_choice->set_unichar_id(unichar_0, i);
         best_choice->set_unichar_id(unichar_0, i+1);
         i++;
       }
       /* 9O<non upper> */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_0, i);
       }
       /* 9[.,]OOO.. */
       if ((i > 1) &&
           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
            best_choice->unichar_id(i-2) == unichar_O)) {
         // An 'O' before the separator is turned into '0' as well.
         if (best_choice->unichar_id(i-2) == unichar_O) {
           best_choice->set_unichar_id(unichar_0, i-2);
         }
         // Turn the whole run of 0/O after the separator into '0'.
         while (i < best_choice->length() &&
                (best_choice->unichar_id(i) == unichar_O ||
                 best_choice->unichar_id(i) == unichar_0)) {
           best_choice->set_unichar_id(unichar_0, i);
           i++;
         }
         // Step back one so the for loop's ++i lands on the first character
         // after the run.
         i--;
       }
     }
   }
 }

 // True when `unichar_id` is an upper case character other than "O".
 bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
   if (!ch_set.get_isupper(unichar_id)) {
     return false;
   }
   return !ch_set.eq(unichar_id, "O");
 }

 // True when `unichar_id` is a digit other than "0".
 bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
   if (!ch_set.get_isdigit(unichar_id)) {
     return false;
   }
   return !ch_set.eq(unichar_id, "0");
 }
 }  // namespace tesseract

 #endif  // def DISABLED_LEGACY_ENGINE
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359

tesseract::Tesseract::alpha_count
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:500

tesseract::Tesseract::first_alphanum_index
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:474

tesseract::Tesseract::tessedit_reject_mode
int tessedit_reject_mode
Definition: tesseractclass.h:1064

globaloc.h

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260

tesseract::Tesseract::min_sane_x_ht_pixels
int min_sane_x_ht_pixels
Definition: tesseractclass.h:1084

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:35

GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65

SYSTEM_DAWG_PERM
Definition: ratngs.h:251

tesseractclass.h

tesseract::Tesseract::dont_allow_1Il
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:531

tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129

compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:233

helpers.h

tesseract::Tesseract::rej_1Il_use_dict_word
bool rej_1Il_use_dict_word
Definition: tesseractclass.h:1072

tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1065

DOC_DAWG_PERM
Definition: ratngs.h:252

GenericVector::sort
void sort()
Definition: genericvector.h:1065

UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:287

STRING::string
const char * string() const
Definition: strngs.cpp:196

tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:247

count
int count(LIST var_list)
Definition: oldlist.cpp:98

WERD_CHOICE::print
void print() const
Definition: ratngs.h:580

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:159

WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:346

TBOX
Definition: rect.h:34

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:432

DENORM::y_scale
float y_scale() const
Definition: normalis.h:270

REJMAP::length
int32_t length() const
Definition: rejctmap.h:223

tesseract::Tesseract::word_contains_non_1_digit
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:514

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:24

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:327

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:330

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:25

reject.h

WERD_RES
Definition: pageres.h:169

UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486

tesseract::Tesseract::rej_use_good_perm
bool rej_use_good_perm
Definition: tesseractclass.h:1076

tesseract::Tesseract::repeated_nonalphanum_wd
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:587

tesseract::Tesseract::non_0_digit
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:794

genericvector.h

UNICHARSET
Definition: unicharset.h:146

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:84

WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:363

tesseract::Tesseract::first_alphanum_offset
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:487

tesseract::Tesseract::tessedit_lower_flip_hyphen
double tessedit_lower_flip_hyphen
Definition: tesseractclass.h:1068

TBOX::width
int16_t width() const
Definition: rect.h:115

globals.h

tesseract::Tesseract::rej_trust_doc_dawg
bool rej_trust_doc_dawg
Definition: tesseractclass.h:1071

tesseract::Tesseract::tessedit_image_border
int tessedit_image_border
Definition: tesseractclass.h:1080

TBOX::left
int16_t left() const
Definition: rect.h:72

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:548

tesseract::Tesseract::rej_alphas_in_number_perm
bool rej_alphas_in_number_perm
Definition: tesseractclass.h:1078

TBOX::top
int16_t top() const
Definition: rect.h:58

tesseract::Tesseract::non_O_upper
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790

WERD_RES::denorm
DENORM denorm
Definition: pageres.h:204

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507

GenericVector< float >

CLISTIZEH
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:55

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612

DENORM::x_scale
float x_scale() const
Definition: normalis.h:267

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:68

REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:229

tesseract::Tesseract::rej_1Il_trust_permuter_type
bool rej_1Il_trust_permuter_type
Definition: tesseractclass.h:1073

WERD_CHOICE
Definition: ratngs.h:273

FALSE
#define FALSE
Definition: capi.h:52

WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:296

tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)

REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:381

tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds
char * ok_repeated_ch_non_alphanum_wds
Definition: tesseractclass.h:1082

tesseract::Tesseract::set_done
void set_done(WERD_RES *word, int16_t pass)

UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873

reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:185

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

tesseract::Tesseract::tessedit_upper_flip_hyphen
double tessedit_upper_flip_hyphen
Definition: tesseractclass.h:1070

ROW
Definition: ocrrow.h:36

tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:478

tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:250

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764

WERD_CHOICE::length
int length() const
Definition: ratngs.h:303

tesseract::Tesseract::flip_0O
void flip_0O(WERD_RES *word)
Definition: reject.cpp:678

UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:33

tesseract::Tesseract::reject_mostly_rejects
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:578

tesseract
Definition: baseapi.cpp:94

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

WERD_RES::done
bool done
Definition: pageres.h:298

REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:345

tesseract::Tesseract::tessedit_flip_0O
bool tessedit_flip_0O
Definition: tesseractclass.h:1066

NUMBER_PERM
Definition: ratngs.h:249

WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:282

reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:214

STRING
Definition: strngs.h:45

STRING::contains
bool contains(const char c) const
Definition: strngs.cpp:187

control.h

tesseract::Tesseract::reject_edge_blobs
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:268

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:206

tesseract::BoxWord::length
int length() const
Definition: boxword.h:83

AC_UPPER_CASE
ALL upper case.
Definition: control.h:32

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:541

tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1083

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868

USER_DAWG_PERM
Definition: ratngs.h:253

TBOX::right
int16_t right() const
Definition: rect.h:79

tesseract::Tesseract::rej_use_tess_accepted
bool rej_use_tess_accepted
Definition: tesseractclass.h:1074

REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:363

ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:28

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500

CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain

tesseract::Tesseract::one_ell_conflict
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:297

TBLOB
Definition: blobs.h:268

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93

REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:372

docqual.h

tesseract::Tesseract::rej_use_tess_blanks
bool rej_use_tess_blanks
Definition: tesseractclass.h:1075

tesseract::Tesseract::rej_use_sensible_wd
bool rej_use_sensible_wd
Definition: tesseractclass.h:1077

err_exit
void err_exit()
Definition: globaloc.cpp:75

tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract
double rej_whole_of_mostly_reject_word_fract
Definition: tesseractclass.h:1079

FREQ_DAWG_PERM
Definition: ratngs.h:254

TBOX::bottom
int16_t bottom() const
Definition: rect.h:65

tesseract::Tesseract::flip_hyphens
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:621

tesseract::Tesseract::count_alphanums
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:383

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30

STRING::length
int32_t length() const
Definition: strngs.cpp:191

tessvars.h

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235

TBOX::height
int16_t height() const
Definition: rect.h:108

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266

UNICHAR_SPACE
Definition: unicharset.h:35

REJMAP::rej_word_mostly_rej
void rej_word_mostly_rej()
Definition: rejctmap.cpp:408

tesseract::Tesseract::reject_I_1_L
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:198

REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:275

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

AC_LOWER_CASE
ALL lower case.
Definition: control.h:31

WERD_RES::word
WERD * word
Definition: pageres.h:189