tessapi/4.0.0/a00077_source.html

 /******************************************************************
  * File:        docqual.cpp  (Formerly docqual.c)
  * Description: Document Quality Metrics
  * Author:    Phil Cheatle
  * Created:   Mon May  9 11:27:28 BST 1994
  *
  * (C) Copyright 1994, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include          <cctype>
 #include          "docqual.h"
 #include          "reject.h"
 #include          "tesscallback.h"
 #include          "tessvars.h"
 #include          "globals.h"
 #include          "tesseractclass.h"

 namespace tesseract{

 // Small helper that pairs a word with the per-blob callbacks passed to
 // BoxWord::ProcessMatchedBlobs, because those callbacks receive only the
 // blob index and no pre-bound arguments.
 struct DocQualCallbacks {
   explicit DocQualCallbacks(WERD_RES* word0)
     : word(word0), match_count(0), accepted_match_count(0) {}

   // Tallies a matched blob; the index itself is not needed.
   void CountMatchingBlobs(int /*index*/) {
     ++match_count;
   }

   // Tallies a matched blob and, separately, whether its reject-map entry
   // is currently accepted.
   void CountAcceptedBlobs(int index) {
     ++match_count;
     if (word->reject_map[index].accepted())
       ++accepted_match_count;
   }

   // Marks the reject-map entry as quality-accepted when the entry says it
   // may be accepted on good quality.
   void AcceptIfGoodQuality(int index) {
     auto& entry = word->reject_map[index];
     if (entry.accept_if_good_quality())
       entry.setrej_quality_accept();
   }

   WERD_RES* word;                // Word whose reject map is inspected/updated.
   int16_t match_count;           // Number of callback invocations counted.
   int16_t accepted_match_count;  // Subset of matches that were accepted.
 };

 /*************************************************************************
  * word_blob_quality()
  * Counts the blobs for which BoxWord::ProcessMatchedBlobs invokes the
  * callback when comparing word->bln_boxes against word->rebuild_word.
  * Returns 0 when either structure is missing or the rebuilt word has no
  * blobs. The row argument is not used.
  * ASSUME blobs in both initial word and box_word are in ascending order of
  * left hand blob edge.
  *************************************************************************/
 int16_t Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
   const bool comparable = word->bln_boxes != nullptr &&
                           word->rebuild_word != nullptr &&
                           !word->rebuild_word->blobs.empty();
   if (!comparable)
     return 0;

   DocQualCallbacks counter(word);
   auto* on_match =
       NewPermanentTessCallback(&counter, &DocQualCallbacks::CountMatchingBlobs);
   word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, on_match);
   return counter.match_count;
 }

 // Sums count_outline_errs() over every blob of word->rebuild_word, pairing
 // blob number k with character k of the best choice string.
 // Returns 0 when there is no rebuilt word.
 // NOTE(review): the string is indexed by blob number, so this presumably
 // assumes one byte per blob - confirm behaviour for multi-byte unichars.
 int16_t Tesseract::word_outline_errs(WERD_RES *word) {
   if (word->rebuild_word == nullptr)
     return 0;

   int16_t total_errs = 0;
   int16_t char_index = 0;
   for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b, ++char_index) {
     TBLOB* current = word->rebuild_word->blobs[b];
     const char ch = word->best_choice->unichar_string()[char_index];
     total_errs += count_outline_errs(ch, current->NumOutlines());
   }
   return total_errs;
 }

 /*************************************************************************
  * word_char_quality()
  * Reports, through the two out-parameters, how many blobs were matched by
  * BoxWord::ProcessMatchedBlobs and how many of those have an accepted
  * reject-map entry. Both counts are 0 when bln_boxes or rebuild_word is
  * missing, or when the rebuilt word has no blobs. The row argument is not
  * used.
  *************************************************************************/
 void Tesseract::word_char_quality(WERD_RES *word,
                                   ROW *row,
                                   int16_t *match_count,
                                   int16_t *accepted_match_count) {
   // Start from the "nothing comparable" answer and overwrite it below.
   *match_count = 0;
   *accepted_match_count = 0;

   const bool comparable = word->bln_boxes != nullptr &&
                           word->rebuild_word != nullptr &&
                           !word->rebuild_word->blobs.empty();
   if (!comparable)
     return;

   DocQualCallbacks counter(word);
   auto* on_match =
       NewPermanentTessCallback(&counter, &DocQualCallbacks::CountAcceptedBlobs);
   word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, on_match);

   *match_count = counter.match_count;
   *accepted_match_count = counter.accepted_match_count;
 }

 /*************************************************************************
  * unrej_good_chs()
  * For every blob reported by BoxWord::ProcessMatchedBlobs, quality-accepts
  * the corresponding reject-map entry if that entry allows acceptance on
  * good quality. Does nothing when bln_boxes or rebuild_word is missing, or
  * when the rebuilt word has no blobs. The row argument is not used.
  *************************************************************************/
 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
   const bool comparable = word->bln_boxes != nullptr &&
                           word->rebuild_word != nullptr &&
                           !word->rebuild_word->blobs.empty();
   if (!comparable)
     return;

   DocQualCallbacks acceptor(word);
   auto* on_match =
       NewPermanentTessCallback(&acceptor, &DocQualCallbacks::AcceptIfGoodQuality);
   word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, on_match);
 }

 // Returns how far outline_count is from the number of outlines expected
 // for character c: characters listed in outlines_odd are never penalised,
 // characters listed in outlines_2 are expected to have 2 outlines, and
 // everything else is expected to have 1.
 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
   if (STRING(outlines_odd).contains(c))
     return 0;  // Unreliable outline count - skip this char.

   const int expected_outline_count = STRING(outlines_2).contains(c) ? 2 : 1;
   return abs(outline_count - expected_outline_count);
 }

 // Runs the quality-driven rejection passes over the page, in order:
 //   1. optional unrejection of good-quality words (only on good docs when
 //      tessedit_good_quality_unrej is set),
 //   2. document/block/row rejection,
 //   3. tilde crunching and deletion when unlv_tilde_crunching is set.
 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                         bool good_quality_doc) {
   const bool unreject_first = tessedit_good_quality_unrej && good_quality_doc;
   if (unreject_first)
     unrej_good_quality_words(page_res_it);

   doc_and_block_rejection(page_res_it, good_quality_doc);

   if (!unlv_tilde_crunching)
     return;
   tilde_crunch(page_res_it);
   tilde_delete(page_res_it);
 }

 /*************************************************************************
  * unrej_good_quality_words()
  * Pass 1 - accept potential rejects:
  *    - With bland_unrej set, every reject-map entry that allows acceptance
  *      on good quality is accepted, for every word.
  *    - Otherwise, on rows whose rejected fraction is at most
  *      quality_rowrej_pc, a word with quality-recoverable rejects is passed
  *      to unrej_good_chs() if tessedit_unrej_any_wd is set or the word is
  *      an acceptable word string.
  *    - Rows failing the fraction test (or with no chars) are skipped whole.
  * Pass 2 - the page, block and row reject statistics are zeroed and then
  * recalculated word by word.
  * NOTE - CAN'T do it in a single pass without a bit of fiddling
  *      - keep it simple but inefficient
  *************************************************************************/
 void Tesseract::unrej_good_quality_words(  //unreject potential
                                          PAGE_RES_IT &page_res_it) {
   // ---- Pass 1: unreject where allowed. ----
   page_res_it.restart_page();
   while (page_res_it.word() != nullptr) {
     check_debug_pt(page_res_it.word(), 100);
     WERD_RES* word = page_res_it.word();

     if (bland_unrej) {
       for (int pos = 0; pos < word->reject_map.length(); ++pos) {
         if (word->reject_map[pos].accept_if_good_quality())
           word->reject_map[pos].setrej_quality_accept();
       }
       page_res_it.forward();
     } else {
       ROW_RES* row_res = page_res_it.row();
       const bool row_good_enough =
           row_res->char_count > 0 &&
           (row_res->rej_count / static_cast<float>(row_res->char_count)) <=
               quality_rowrej_pc;

       if (row_good_enough) {
         const bool worth_unrejecting =
             word->reject_map.quality_recoverable_rejects() &&
             (tessedit_unrej_any_wd ||
              acceptable_word_string(
                  *word->uch_set,
                  word->best_choice->unichar_string().string(),
                  word->best_choice->unichar_lengths().string()) !=
                  AC_UNACCEPTABLE);
         if (worth_unrejecting)
           unrej_good_chs(word, row_res->row);
         page_res_it.forward();
       } else {
         /* Skip to end of dodgy row */
         while (page_res_it.word() != nullptr &&
                page_res_it.row() == row_res)
           page_res_it.forward();
       }
     }
     check_debug_pt(page_res_it.word(), 110);
   }

   // ---- Pass 2: rebuild the reject statistics from scratch. ----
   page_res_it.restart_page();
   page_res_it.page_res->char_count = 0;
   page_res_it.page_res->rej_count = 0;
   BLOCK_RES* stats_block = nullptr;
   ROW_RES* stats_row = nullptr;
   for (; page_res_it.word() != nullptr; page_res_it.forward()) {
     // Zero a block's counters the first time one of its words is seen.
     if (stats_block != page_res_it.block()) {
       stats_block = page_res_it.block();
       stats_block->char_count = 0;
       stats_block->rej_count = 0;
     }
     // Likewise for each row.
     if (stats_row != page_res_it.row()) {
       stats_row = page_res_it.row();
       stats_row->char_count = 0;
       stats_row->rej_count = 0;
       stats_row->whole_word_rej_count = 0;
     }
     page_res_it.rej_stat_word();
   }
 }


 /*************************************************************************
  * doc_and_block_rejection()
  *
  * If the page has too many rejects - reject all of it.
  * If any block has too many rejects - reject all words in the block
  * (optionally preserving perfect / good words).
  * Otherwise walk the rows of the block and reject any row that has too
  * many rejects, provided not too many of them are whole-word rejects.
  *
  * page_res_it      - iterator over the page; it is restarted and advanced.
  * good_quality_doc - when true and tessedit_row_rej_good_docs is off, row
  *                    rejection only hits words that are mostly rejected.
  *
  * The page-level test is guarded by char_count > 0, matching the block and
  * row tests, so a page with no characters is never divided by zero.
  *************************************************************************/

 void Tesseract::doc_and_block_rejection(  //reject big chunks
                                         PAGE_RES_IT &page_res_it,
                                         bool good_quality_doc) {
   int16_t block_no = 0;
   int16_t row_no = 0;
   BLOCK_RES *current_block;
   ROW_RES *current_row;

   bool rej_word;
   bool prev_word_rejected;
   int16_t char_quality = 0;
   int16_t accepted_char_quality;

   // Guard against an empty page before forming the percentage.
   if (page_res_it.page_res->char_count > 0 &&
       page_res_it.page_res->rej_count * 100.0 /
           page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
     if (tessedit_debug_doc_rejection) {
       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }
   } else {
     if (tessedit_debug_doc_rejection) {
       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }

     /* Walk blocks testing for block rejection */

     page_res_it.restart_page();
     WERD_RES* word;
     while ((word = page_res_it.word()) != nullptr) {
       current_block = page_res_it.block();
       block_no = current_block->block->pdblk.index();
       if (current_block->char_count > 0 &&
           (current_block->rej_count * 100.0 / current_block->char_count) >
            tessedit_reject_block_percent) {
         if (tessedit_debug_block_rejection) {
           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
                   block_no, current_block->char_count,
                   current_block->rej_count);
         }
         prev_word_rejected = false;
         // Visit every word of the rejected block.
         while ((word = page_res_it.word()) != nullptr &&
                (page_res_it.block() == current_block)) {
           if (tessedit_preserve_blk_rej_perfect_wds) {
             // Perfect words of sufficient length survive block rejection.
             rej_word = word->reject_map.reject_count() > 0 ||
                 word->reject_map.length () < tessedit_preserve_min_wd_len;
             // Optionally also spare long enough acceptable words whose
             // every character passes the blob match test.
             if (rej_word && tessedit_dont_blkrej_good_wds &&
                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                 acceptable_word_string(
                     *word->uch_set,
                     word->best_choice->unichar_string().string(),
                     word->best_choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE) {
               word_char_quality(word, page_res_it.row()->row,
                                 &char_quality,
                                 &accepted_char_quality);
               rej_word = char_quality !=  word->reject_map.length();
             }
           } else {
             rej_word = true;
           }
           if (rej_word) {
             /*
               Reject spacing if both current and prev words are rejected.
               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
               generated more space errors.
             */
             if (tessedit_use_reject_spaces &&
                 prev_word_rejected &&
                 page_res_it.prev_row() == page_res_it.row() &&
                 word->word->space() == 1)
               word->reject_spaces = true;
             word->reject_map.rej_word_block_rej();
           }
           prev_word_rejected = rej_word;
           page_res_it.forward();
         }
       } else {
         if (tessedit_debug_block_rejection) {
           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
                   block_no, page_res_it.block()->char_count,
                   page_res_it.block()->rej_count);
         }

         /* Walk rows in block testing for row rejection */
         row_no = 0;
         while (page_res_it.word() != nullptr &&
                page_res_it.block() == current_block) {
           current_row = page_res_it.row();
           row_no++;
           /* Reject whole row if:
             fraction of chars on row which are rejected exceed a limit AND
             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
             limit
           */
           if (current_row->char_count > 0 &&
               (current_row->rej_count * 100.0 / current_row->char_count) >
               tessedit_reject_row_percent &&
               (current_row->whole_word_rej_count * 100.0 /
                   current_row->rej_count) <
               tessedit_whole_wd_rej_row_percent) {
             if (tessedit_debug_block_rejection) {
               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
                       row_no, current_row->char_count,
                       current_row->rej_count);
             }
             prev_word_rejected = false;
             // Visit every word of the rejected row.
             while ((word = page_res_it.word()) != nullptr &&
                    page_res_it.row () == current_row) {
               /* Preserve words on good docs unless they are mostly rejected*/
               if (!tessedit_row_rej_good_docs && good_quality_doc) {
                 rej_word = word->reject_map.reject_count() /
                     static_cast<float>(word->reject_map.length()) >
                     tessedit_good_doc_still_rowrej_wd;
               } else if (tessedit_preserve_row_rej_perfect_wds) {
                 /* Preserve perfect words anyway */
                 rej_word = word->reject_map.reject_count() > 0 ||
                     word->reject_map.length () < tessedit_preserve_min_wd_len;
                 if (rej_word && tessedit_dont_rowrej_good_wds &&
                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                     acceptable_word_string(*word->uch_set,
                         word->best_choice->unichar_string().string(),
                         word->best_choice->unichar_lengths().string()) !=
                             AC_UNACCEPTABLE) {
                   word_char_quality(word, page_res_it.row()->row,
                                     &char_quality,
                                     &accepted_char_quality);
                   rej_word = char_quality != word->reject_map.length();
                 }
               } else {
                 rej_word = true;
               }
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
                     prev_word_rejected &&
                     page_res_it.prev_row() == page_res_it.row() &&
                     word->word->space () == 1)
                   word->reject_spaces = true;
                 word->reject_map.rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
               page_res_it.forward();
             }
           } else {
             if (tessedit_debug_block_rejection) {
               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
                       row_no, current_row->char_count, current_row->rej_count);
             }
             // Row kept: just step past its words.
             while (page_res_it.word() != nullptr &&
                    page_res_it.row() == current_row)
               page_res_it.forward();
           }
         }
       }
     }
   }
 }

 }  // namespace tesseract

 /*************************************************************************
  * reject_whole_page()
  * Don't believe any of it - apply the document-level rejection to the
  * reject map of every word, then flag the page itself as rejected.
  *************************************************************************/

 void reject_whole_page(PAGE_RES_IT &page_res_it) {
   for (page_res_it.restart_page(); page_res_it.word() != nullptr;
        page_res_it.forward()) {
     WERD_RES* current = page_res_it.word();
     current->reject_map.rej_word_doc_rej();
   }
   // Record that the whole page is rejected.
   page_res_it.page_res->rejected = true;
 }

 namespace tesseract {
 // Walks the page marking words to crunch (unlv_crunch_mode = CR_KEEP_SPACE).
 // Words in non-text poly blocks are skipped. A word with any accepted
 // character, or one that is neither terrible nor potentially crunchable,
 // resets the state. A "terrible" word is crunched immediately and also
 // crunches the run of "potential" words remembered just before it; a
 // "potential" word that follows a terrible word is crunched as well.
 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   PAGE_RES_IT pending_it;            // First word of the pending potentials.
   bool have_pending = false;         // pending_it marks a run of potentials.
   bool after_terrible_word = false;  // The preceding word was terrible.

   for (page_res_it.restart_page(); page_res_it.word() != nullptr;
        page_res_it.forward()) {
     POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
     if (pb != nullptr && !pb->IsText())
       continue;  // Only text blocks are considered.

     WERD_RES* word = page_res_it.word();

     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);

     if (crunch_early_merge_tess_fails)
       word->merge_tess_fails();

     if (word->reject_map.accept_count() != 0) {
       // Something was accepted: forget earlier potential crunches.
       after_terrible_word = false;
       have_pending = false;
       continue;
     }

     const BOOL8 ok_dict_word = safe_dict_word(word);
     const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
     const bool may_crunch = garbage_level != G_NEVER_CRUNCH;

     if (may_crunch && terrible_word_crunch(word, garbage_level)) {
       if (crunch_debug > 0) {
         tprintf ("T CRUNCHING: \"%s\"\n",
           word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = CR_KEEP_SPACE;
       if (have_pending) {
         // Crunch every remembered potential word up to this one.
         for (; pending_it.word() != word; pending_it.forward()) {
           if (crunch_debug > 0) {
             tprintf ("P1 CRUNCHING: \"%s\"\n",
               pending_it.word()->best_choice->unichar_string().string());
           }
           pending_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
         }
         have_pending = false;
       }
       after_terrible_word = true;
     } else if (may_crunch &&
                potential_word_crunch(word, garbage_level, ok_dict_word)) {
       if (after_terrible_word) {
         if (crunch_debug > 0) {
           tprintf ("P2 CRUNCHING: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = CR_KEEP_SPACE;
       } else if (!have_pending) {
         // Remember where this run of potential crunches starts.
         pending_it = page_res_it;
         have_pending = true;
         if (crunch_debug > 1) {
           tprintf ("P3 CRUNCHING: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
       }
     } else {
       // Not crunchable: forget earlier potential crunches.
       after_terrible_word = false;
       have_pending = false;
       if (crunch_debug > 2) {
         tprintf ("NO CRUNCH: \"%s\"\n",
           word->best_choice->unichar_string().string());
       }
     }
   }
 }


 // Decides whether a word is bad enough to crunch outright. Returns true
 // when one of these holds (the number is the mode shown in debug output):
 //   1 - the best choice string is empty or consists only of spaces
 //   2 - rating per char exceeds crunch_terrible_rating
 //   3 - crunch_terrible_garbage is set and the word is G_TERRIBLE
 //   4 - certainty below crunch_poor_garbage_cert and garbage level not G_OK
 //   5 - rating per char above crunch_poor_garbage_rate and level not G_OK
 // Rating per char divides by the reject map length capped at
 // crunch_rating_max.
 bool Tesseract::terrible_word_crunch(WERD_RES* word,
                                      GARBAGE_LEVEL garbage_level) {
   int crunch_mode = 0;

   const bool blank_result =
       (word->best_choice->unichar_string().length() == 0) ||
       (strspn(word->best_choice->unichar_string().string(), " ") ==
        word->best_choice->unichar_string().unsigned_size());

   if (blank_result) {
     crunch_mode = 1;
   } else {
     int adjusted_len = word->reject_map.length();
     if (adjusted_len > crunch_rating_max)
       adjusted_len = crunch_rating_max;
     const float rating_per_ch = word->best_choice->rating() / adjusted_len;
     const bool looks_like_garbage = garbage_level != G_OK;

     if (rating_per_ch > crunch_terrible_rating) {
       crunch_mode = 2;
     } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
       crunch_mode = 3;
     } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
                looks_like_garbage) {
       crunch_mode = 4;
     } else if ((rating_per_ch > crunch_poor_garbage_rate) &&
                looks_like_garbage) {
       crunch_mode = 5;
     }
   }

   if (crunch_mode <= 0)
     return false;

   if (crunch_debug > 2) {
     tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
       crunch_mode, word->best_choice->unichar_string().string());
   }
   return true;
 }

 // Decides whether a word is a candidate for crunching by counting up to
 // three "poor" indicators:
 //   - rating per char above crunch_pot_poor_rate,
 //   - the word is crunchable and its certainty is below
 //     crunch_pot_poor_cert,
 //   - garbage_level is not G_OK.
 // Returns true when the count reaches crunch_pot_indicators.
 // A word is "crunchable" unless crunch_leave_accept_strings is set, the
 // word has at least 3 reject-map entries, and it is either an acceptable
 // word string or an ok dictionary word.
 // NOTE(review): the length cap is the literal 10 here, whereas
 // terrible_word_crunch() caps with crunch_rating_max - confirm whether the
 // two are meant to stay in step.
 bool Tesseract::potential_word_crunch(WERD_RES* word,
                                       GARBAGE_LEVEL garbage_level,
                                       bool ok_dict_word) {
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();

   const bool word_crunchable =
       !crunch_leave_accept_strings ||
       word->reject_map.length() < 3 ||
       (acceptable_word_string(*word->uch_set,
                               str, lengths) == AC_UNACCEPTABLE &&
        !ok_dict_word);

   const int full_len = word->reject_map.length();
   const int adjusted_len = full_len > 10 ? 10 : full_len;
   const float rating_per_ch = word->best_choice->rating() / adjusted_len;

   int poor_indicator_count = 0;

   // Indicator 1: poor rating per character.
   if (rating_per_ch > crunch_pot_poor_rate) {
     if (crunch_debug > 2) {
       tprintf("Potential poor rating on \"%s\"\n", str);
     }
     ++poor_indicator_count;
   }

   // Indicator 2: poor certainty on a crunchable word.
   if (word_crunchable &&
       word->best_choice->certainty() < crunch_pot_poor_cert) {
     if (crunch_debug > 2) {
       tprintf("Potential poor cert on \"%s\"\n", str);
     }
     ++poor_indicator_count;
   }

   // Indicator 3: the garbage classifier did not rate the word OK.
   if (garbage_level != G_OK) {
     if (crunch_debug > 2) {
       tprintf("Potential garbage on \"%s\"\n", str);
     }
     ++poor_indicator_count;
   }

   return poor_indicator_count >= crunch_pot_indicators;
 }

 // Walks the page assigning the crunch mode returned by word_deletable() to
 // runs of deletable words that touch a line boundary:
 //   - a deletable word flagged W_BOL, and each deletable word following it
 //     without a break, takes its own delete mode;
 //   - a deletable word flagged W_EOL takes its delete mode and so do the
 //     deletable words remembered immediately before it;
 //   - a deletable word elsewhere only marks the start of a possible run.
 // A non-deletable word clears both kinds of run. Tess fails are merged here
 // when that was not already done early (crunch_early_merge_tess_fails).
 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
   PAGE_RES_IT run_start_it;        // First word of the pending mid-line run.
   bool deleting_from_bol = false;  // Inside a run that began at W_BOL.
   bool run_marked = false;         // run_start_it is valid.

   for (page_res_it.restart_page(); page_res_it.word() != nullptr;
        page_res_it.forward()) {
     WERD_RES* word = page_res_it.word();

     int16_t debug_delete_mode;
     const CRUNCH_MODE delete_mode = word_deletable (word, debug_delete_mode);

     if (delete_mode == CR_NONE) {
       // Forget earlier potential crunches.
       deleting_from_bol = false;
       run_marked = false;
     } else if (word->word->flag (W_BOL) || deleting_from_bol) {
       if (crunch_debug > 0) {
         tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
           debug_delete_mode,
           word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = delete_mode;
       deleting_from_bol = true;
     } else if (word->word->flag(W_EOL)) {
       if (run_marked) {
         // Apply deletion to every remembered word leading up to this one.
         for (; run_start_it.word() != word; run_start_it.forward()) {
           int16_t x_debug_delete_mode;
           const CRUNCH_MODE x_delete_mode =
               word_deletable (run_start_it.word (), x_debug_delete_mode);
           if (crunch_debug > 0) {
             tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
               x_debug_delete_mode,
               run_start_it.word()->best_choice->unichar_string().string());
           }
           run_start_it.word ()->unlv_crunch_mode = x_delete_mode;
         }
       }
       if (crunch_debug > 0) {
         tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
           debug_delete_mode,
           word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = delete_mode;
       deleting_from_bol = false;
       run_marked = false;
     } else if (!run_marked) {
       // Mid-line deletable word: remember where the run begins.
       run_start_it = page_res_it;
       run_marked = true;
     }

     /*
       The following step has been left till now as the tess fails are used to
       determine if the word is deletable.
     */
     if (!crunch_early_merge_tess_fails)
       word->merge_tess_fails();
   }
 }


 // Rewrites two characters in the best choice of word_res: "~" becomes "-"
 // and "^" becomes " ". Whenever a character is rewritten and its reject-map
 // entry is currently accepted, that entry is marked as a UNLV reject.
 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
   const UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
   const UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
   const UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
   const UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");

   // Replaces the character at pos with `to` if it currently equals `from`,
   // and rejects it if it had been accepted.
   auto substitute = [word_res](int pos, UNICHAR_ID from, UNICHAR_ID to) {
     if (word_res->best_choice->unichar_id(pos) != from)
       return;
     word_res->best_choice->set_unichar_id(to, pos);
     if (word_res->reject_map[pos].accepted())
       word_res->reject_map[pos].setrej_unlv_rej();
   };

   for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
     // The second check deliberately re-reads the (possibly just replaced)
     // character, exactly as two consecutive tests would.
     substitute(pos, unichar_tilde, unichar_dash);
     substitute(pos, unichar_pow, unichar_space);
   }
 }

 // Classifies how garbage-like the best choice of a word is.
 //
 // The string is scanned one unichar at a time (byte widths come from
 // unichar_lengths) with a small state machine that tracks runs of upper
 // case, lower case and digits. The scan counts: isolated digits and alphas
 // (single-character runs), repeated alphas, the longest upper/lower runs,
 // tess rejects (single-byte ' ') and other non-alphanumeric chars.
 //
 // Returns:
 //   G_NEVER_CRUNCH - crunch_leave_ok_strings applies: at least 4 chars,
 //                    mostly non-isolated alphas, no long repetition, and
 //                    either an acceptable word string (with
 //                    crunch_accept_ok) or a long enough lower/upper run.
 //   G_OK           - a multi-char word without spaces that came from a
 //                    dawg/number permuter, is an acceptable word string or
 //                    an ok dictionary word; or the counts look harmless.
 //   G_TERRIBLE     - tess rejects dominate the word.
 //   G_DODGY        - too many dodgy characters for the word's length.
 //
 // The scan uses its own cursors so that str/lengths still address the
 // whole word for the acceptable_word_string() and strpbrk() tests after
 // the loop; advancing str/lengths themselves would leave those tests
 // looking at an empty string.
 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
   enum STATES
   {
     JUNK,
     FIRST_UPPER,
     FIRST_LOWER,
     FIRST_NUM,
     SUBSEQUENT_UPPER,
     SUBSEQUENT_LOWER,
     SUBSEQUENT_NUM
   };
   // Anchored at the start of the word for the whole function.
   const char *const str = word->best_choice->unichar_string().string();
   const char *const lengths = word->best_choice->unichar_lengths().string();
   STATES state = JUNK;
   int len = 0;
   int isolated_digits = 0;
   int isolated_alphas = 0;
   int bad_char_count = 0;
   int tess_rejs = 0;
   int dodgy_chars = 0;
   int ok_chars;
   UNICHAR_ID last_char = -1;
   int alpha_repetition_count = 0;
   int longest_alpha_repetition_count = 0;
   int longest_lower_run_len = 0;
   int lower_string_count = 0;
   int longest_upper_run_len = 0;
   int upper_string_count = 0;
   int total_alpha_count = 0;
   int total_digit_count = 0;

   // Scan cursors: ch walks the bytes, ch_len the per-unichar byte widths.
   const char *ch = str;
   const char *ch_len = lengths;
   for (; *ch != '\0'; ch += *(ch_len++)) {
     len++;
     if (word->uch_set->get_isupper (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_UPPER:
         case FIRST_UPPER:
           state = SUBSEQUENT_UPPER;
           upper_string_count++;
           if (longest_upper_run_len < upper_string_count)
             longest_upper_run_len = upper_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           state = FIRST_UPPER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           upper_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_islower (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_LOWER:
         case FIRST_LOWER:
           state = SUBSEQUENT_LOWER;
           lower_string_count++;
           if (longest_lower_run_len < lower_string_count)
             longest_lower_run_len = lower_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           state = FIRST_LOWER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           lower_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_isdigit (ch, *ch_len)) {
       total_digit_count++;
       switch (state) {
         case FIRST_NUM:
           state = SUBSEQUENT_NUM;
           // fall through
         case SUBSEQUENT_NUM:
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           state = FIRST_NUM;
           break;
       }
     }
     else {
       // Neither alpha nor digit: a single-byte space is a tess reject,
       // anything else is a bad char.
       if (*ch_len == 1 && *ch == ' ')
         tess_rejs++;
       else
         bad_char_count++;
       switch (state) {
         case FIRST_NUM:
           isolated_digits++;
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           break;
       }
       state = JUNK;
     }
   }

   // Account for a single-character run that ends the word.
   switch (state) {
     case FIRST_NUM:
       isolated_digits++;
       break;
     case FIRST_UPPER:
     case FIRST_LOWER:
       isolated_alphas++;
       // fall through
     default:
       break;
   }

   if (crunch_include_numerals) {
     total_alpha_count += total_digit_count - isolated_digits;
   }

   if (crunch_leave_ok_strings && len >= 4 &&
       2 * (total_alpha_count - isolated_alphas) > len &&
       longest_alpha_repetition_count < crunch_long_repetitions) {
     if ((crunch_accept_ok &&
          acceptable_word_string(*word->uch_set, str, lengths) !=
              AC_UNACCEPTABLE) ||
         longest_lower_run_len > crunch_leave_lc_strings ||
         longest_upper_run_len > crunch_leave_uc_strings)
       return G_NEVER_CRUNCH;
   }
   if (word->reject_map.length() > 1 &&
       strpbrk(str, " ") == nullptr &&
       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        word->best_choice->permuter() == NUMBER_PERM ||
        acceptable_word_string(*word->uch_set, str, lengths) !=
            AC_UNACCEPTABLE || ok_dict_word))
     return G_OK;

   ok_chars = len - bad_char_count - isolated_digits -
     isolated_alphas - tess_rejs;

   if (crunch_debug > 3) {
     tprintf("garbage_word: \"%s\"\n",
             word->best_choice->unichar_string().string());
     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
             len,
             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
   }
   if (bad_char_count == 0 &&
       tess_rejs == 0 &&
       (len > isolated_digits + isolated_alphas || len <= 2))
     return G_OK;

   if (tess_rejs > ok_chars ||
       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
     return G_TERRIBLE;

   if (len > 4) {
     // Longer words: isolated digits/alphas count as dodgy too.
     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
         isolated_alphas;
     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
       return G_DODGY;
     else
       return G_OK;
   } else {
     dodgy_chars = 2 * tess_rejs + bad_char_count;
     if ((len == 4 && dodgy_chars > 2) ||
         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
       return G_DODGY;
     else
       return G_OK;
   }
 }


 /*************************************************************************
  * word_deletable()
  * Decides whether a crunched word may be deleted. The checks run in this
  * order; the first that fires sets delete_mode and the return value:
  *    mode  0, CR_NONE        - word is not crunched
  *    mode  1, CR_DELETE      - reject map length is 0
  *    mode  4, CR_DELETE      - height < crunch_del_min_ht * xht
  *    mode  5, CR_DELETE      - noise_outlines() reports only noise
  *    mode  2, CR_LOOSE_SPACE - 1.5 * (number of ' ' chars) > word length
  *    mode  7, CR_LOOSE_SPACE - certainty < crunch_del_cert
  *    mode  8, CR_LOOSE_SPACE - rating / char > crunch_del_rating
  *    mode  9, CR_LOOSE_SPACE - TOP of word is more than
  *                              crunch_del_low_word * xht BELOW baseline
  *    mode 10, CR_LOOSE_SPACE - BOTTOM of word is more than
  *                              crunch_del_high_word * xht ABOVE baseline
  *    mode 11, CR_LOOSE_SPACE - height > crunch_del_max_ht * xht
  *    mode  3, CR_LOOSE_SPACE - width < crunch_del_min_width * xht
  *    mode  0, CR_NONE        - none of the above
  * Modes 4 and 5 are only tested when rebuild_word exists.
  * NOTE(review): when rebuild_word is nullptr the box used by modes 9, 10,
  * 11 and 3 is a default-constructed TBOX - confirm that is intended.
  *************************************************************************/

 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
   // Records the reason code and hands back the outcome in one step.
   auto verdict = [&delete_mode](int16_t reason, CRUNCH_MODE outcome) {
     delete_mode = reason;
     return outcome;
   };

   const int word_len = word->reject_map.length ();

   if (word->unlv_crunch_mode == CR_NONE)
     return verdict(0, CR_NONE);

   if (word_len == 0)
     return verdict(1, CR_DELETE);

   TBOX word_box;  // BB of word
   if (word->rebuild_word != nullptr) {
     // Cube leaves rebuild_word nullptr.
     word_box = word->rebuild_word->bounding_box();
     if (word_box.height () < crunch_del_min_ht * kBlnXHeight)
       return verdict(4, CR_DELETE);

     if (noise_outlines(word->rebuild_word))
       return verdict(5, CR_DELETE);
   }

   if ((failure_count (word) * 1.5) > word_len)
     return verdict(2, CR_LOOSE_SPACE);

   if (word->best_choice->certainty () < crunch_del_cert)
     return verdict(7, CR_LOOSE_SPACE);

   const float rating_per_ch = word->best_choice->rating () / word_len;
   if (rating_per_ch > crunch_del_rating)
     return verdict(8, CR_LOOSE_SPACE);

   if (word_box.top () <
       kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight)
     return verdict(9, CR_LOOSE_SPACE);

   if (word_box.bottom () >
       kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight)
     return verdict(10, CR_LOOSE_SPACE);

   if (word_box.height () > crunch_del_max_ht * kBlnXHeight)
     return verdict(11, CR_LOOSE_SPACE);

   if (word_box.width () < crunch_del_min_width * kBlnXHeight)
     return verdict(3, CR_LOOSE_SPACE);

   return verdict(0, CR_NONE);
 }

 // Returns the number of ' ' bytes in the best choice string of the word.
 int16_t Tesseract::failure_count(WERD_RES *word) {
   int space_total = 0;
   for (const char *cursor = word->best_choice->unichar_string().string();
        *cursor != '\0'; ++cursor) {
     space_total += (*cursor == ' ') ? 1 : 0;
   }
   return space_total;
 }


 // Returns true when every outline in the word is "small", i.e. the longer
 // side of its bounding box is below kBlnXHeight * crunch_small_outlines_size.
 // A word with no outlines at all also returns true.
 bool Tesseract::noise_outlines(TWERD* word) {
   const float small_limit = kBlnXHeight * crunch_small_outlines_size;
   int16_t total_outlines = 0;
   int16_t small_outlines = 0;

   for (int b = 0; b < word->NumBlobs(); ++b) {
     TBLOB* blob = word->blobs[b];
     for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
       ++total_outlines;
       const TBOX outline_box = ol->bounding_box();  // BB of outline
       const int16_t longest_side =
           outline_box.height() > outline_box.width() ? outline_box.height()
                                                      : outline_box.width();
       if (longest_side < small_limit)
         ++small_outlines;
     }
   }
   return small_outlines >= total_outlines;
 }

 }  // namespace tesseract
TESSLINE
Definition: blobs.h:187

tesseract::Tesseract::outlines_2
char * outlines_2
Definition: tesseractclass.h:929

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:757

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359

tesseract::Tesseract::crunch_leave_uc_strings
int crunch_leave_uc_strings
Definition: tesseractclass.h:999

tesseract::Tesseract::failure_count
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:966

tesseract::DocQualCallbacks::AcceptIfGoodQuality
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:45

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493

REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:435

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260

TESSLINE::next
TESSLINE * next
Definition: blobs.h:265

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:35

tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190

SYSTEM_DAWG_PERM
Definition: ratngs.h:251

tesseractclass.h

TWERD
Definition: blobs.h:402

PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:80

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:754

WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198

G_TERRIBLE
Definition: docqual.h:34

ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:146

STRING::unsigned_size
uint32_t unsigned_size() const
Definition: strngs.h:71

tesseract::Tesseract::tessedit_debug_block_rejection
bool tessedit_debug_block_rejection
Definition: tesseractclass.h:875

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:287

REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:444

STRING::string
const char * string() const
Definition: strngs.cpp:196

WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:346

tesseract::Tesseract::crunch_leave_accept_strings
bool crunch_leave_accept_strings
Definition: tesseractclass.h:994

BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:118

TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:871

tesseract::Tesseract::terrible_word_crunch
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:504

TBOX
Definition: rect.h:34

REJMAP::quality_recoverable_rejects
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:302

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:432

REJMAP::length
int32_t length() const
Definition: rejctmap.h:223

W_EOL
Definition: werd.h:35

tesseract::Tesseract::crunch_include_numerals
bool crunch_include_numerals
Definition: tesseractclass.h:995

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:24

WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1073

tesseract::Tesseract::tessedit_reject_doc_percent
double tessedit_reject_doc_percent
Definition: tesseractclass.h:936

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:327

tesseract::Tesseract::tessedit_reject_block_percent
double tessedit_reject_block_percent
Definition: tesseractclass.h:938

REJMAP::rej_word_doc_rej
void rej_word_doc_rej()
Definition: rejctmap.cpp:426

tesseract::Tesseract::crunch_del_min_ht
double crunch_del_min_ht
Definition: tesseractclass.h:982

NewPermanentTessCallback
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116

tesseract::DocQualCallbacks
Definition: docqual.cpp:31

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:330

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:25

reject.h

CR_LOOSE_SPACE
Definition: pageres.h:163

WERD_RES
Definition: pageres.h:169

tesseract::Tesseract::crunch_early_convert_bad_unlv_chs
bool crunch_early_convert_bad_unlv_chs
Definition: tesseractclass.h:971

tesseract::Tesseract::noise_outlines
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:978

tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:418

tesseract::Tesseract::crunch_pot_indicators
int crunch_pot_indicators
Definition: tesseractclass.h:990

tesseract::Tesseract::tessedit_preserve_min_wd_len
int tessedit_preserve_min_wd_len
Definition: tesseractclass.h:953

WERD::space
uint8_t space()
Definition: werd.h:102

tesseract::Tesseract::tessedit_reject_row_percent
double tessedit_reject_row_percent
Definition: tesseractclass.h:940

tesseract::Tesseract::crunch_del_low_word
double crunch_del_low_word
Definition: tesseractclass.h:987

tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:660

tesseract::Tesseract::crunch_rating_max
int crunch_rating_max
Definition: tesseractclass.h:989

tesseract::DocQualCallbacks::accepted_match_count
int16_t accepted_match_count
Definition: docqual.cpp:52

tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds
bool tessedit_preserve_blk_rej_perfect_wds
Definition: tesseractclass.h:945

tesseract::DocQualCallbacks::DocQualCallbacks
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:32

tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd
double tessedit_good_doc_still_rowrej_wd
Definition: tesseractclass.h:957

tesscallback.h

tesseract::Tesseract::crunch_del_max_ht
double crunch_del_max_ht
Definition: tesseractclass.h:983

tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61

tesseract::DocQualCallbacks::CountAcceptedBlobs
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:39

TBOX::width
int16_t width() const
Definition: rect.h:115

tesseract::DocQualCallbacks::CountMatchingBlobs
void CountMatchingBlobs(int index)
Definition: docqual.cpp:35

globals.h

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:698

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:117

tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:161

PDBLK::index
int index() const
Definition: pdblock.h:68

tesseract::Tesseract::crunch_del_cert
double crunch_del_cert
Definition: tesseractclass.h:981

PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1674

WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:548

tesseract::Tesseract::unlv_tilde_crunching
bool unlv_tilde_crunching
Definition: tesseractclass.h:967

TBOX::top
int16_t top() const
Definition: rect.h:58

PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:745

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507

CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:159

tesseract::Tesseract::crunch_debug
int crunch_debug
Definition: tesseractclass.h:1001

WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612

tesseract::Tesseract::tessedit_whole_wd_rej_row_percent
double tessedit_whole_wd_rej_row_percent
Definition: tesseractclass.h:943

POLY_BLOCK
Definition: polyblk.h:27

REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:229

REJMAP::accept_count
int16_t accept_count()
Definition: rejctmap.cpp:281

CR_DELETE
Definition: pageres.h:164

tesseract::Tesseract::outlines_odd
char * outlines_odd
Definition: tesseractclass.h:928

CR_NONE
Definition: pageres.h:161

tesseract::Tesseract::potential_word_crunch
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:542

tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds
bool tessedit_preserve_row_rej_perfect_wds
Definition: tesseractclass.h:947

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:751

tesseract::Tesseract::count_outline_errs
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:127

PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56

ROW_RES
Definition: pageres.h:141

G_OK
Definition: docqual.h:32

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

BOOL8
unsigned char BOOL8
Definition: host.h:34

tesseract::Tesseract::quality_rowrej_pc
double quality_rowrej_pc
Definition: tesseractclass.h:965

ROW
Definition: ocrrow.h:36

GenericVector::empty
bool empty() const
Definition: genericvector.h:90

tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

W_BOL
Definition: werd.h:34

tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73

PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:79

POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:49

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764

TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:464

tesseract::Tesseract::crunch_leave_ok_strings
bool crunch_leave_ok_strings
Definition: tesseractclass.h:991

PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:677

tesseract::Tesseract::crunch_accept_ok
bool crunch_accept_ok
Definition: tesseractclass.h:992

tesseract::Tesseract::crunch_del_high_word
double crunch_del_high_word
Definition: tesseractclass.h:986

tesseract
Definition: baseapi.cpp:94

tesseract::Tesseract::crunch_long_repetitions
int crunch_long_repetitions
Definition: tesseractclass.h:1000

GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:29

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

tesseract::Tesseract::crunch_terrible_rating
double crunch_terrible_rating
Definition: tesseractclass.h:972

WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:336

tesseract::Tesseract::crunch_early_merge_tess_fails
bool crunch_early_merge_tess_fails
Definition: tesseractclass.h:970

NUMBER_PERM
Definition: ratngs.h:249

PAGE_RES::rejected
bool rejected
Definition: pageres.h:82

PAGE_RES_IT
Definition: pageres.h:675

tesseract::Tesseract::tessedit_unrej_any_wd
bool tessedit_unrej_any_wd
Definition: tesseractclass.h:869

STRING
Definition: strngs.h:45

tesseract::Tesseract::crunch_del_min_width
double crunch_del_min_width
Definition: tesseractclass.h:984

BLOCK_RES
Definition: pageres.h:115

tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:233

tesseract::Tesseract::crunch_poor_garbage_cert
double crunch_poor_garbage_cert
Definition: tesseractclass.h:975

tesseract::Tesseract::tessedit_good_quality_unrej
bool tessedit_good_quality_unrej
Definition: tesseractclass.h:933

BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:119

tesseract::Tesseract::tessedit_dont_blkrej_good_wds
bool tessedit_dont_blkrej_good_wds
Definition: tesseractclass.h:949

CR_KEEP_SPACE
Definition: pageres.h:162

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:206

tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:590

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:541

tesseract::Tesseract::crunch_leave_lc_strings
int crunch_leave_lc_strings
Definition: tesseractclass.h:997

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868

tesseract::DocQualCallbacks::match_count
int16_t match_count
Definition: docqual.cpp:51

USER_DAWG_PERM
Definition: ratngs.h:253

tesseract::Tesseract::tessedit_dont_rowrej_good_wds
bool tessedit_dont_rowrej_good_wds
Definition: tesseractclass.h:951

tesseract::Tesseract::tessedit_use_reject_spaces
bool tessedit_use_reject_spaces
Definition: tesseractclass.h:934

tesseract::Tesseract::crunch_poor_garbage_rate
double crunch_poor_garbage_rate
Definition: tesseractclass.h:976

tesseract::Tesseract::tessedit_debug_doc_rejection
bool tessedit_debug_doc_rejection
Definition: tesseractclass.h:960

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:139

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:731

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500

tesseract::Tesseract::bland_unrej
bool bland_unrej
Definition: tesseractclass.h:963

tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:680

ROW_RES::char_count
int32_t char_count
Definition: pageres.h:144

reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:407

TBLOB
Definition: blobs.h:268

tesseract::Tesseract::tessedit_row_rej_good_docs
bool tessedit_row_rej_good_docs
Definition: tesseractclass.h:955

tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:895

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93

tesseract::Tesseract::crunch_small_outlines_size
double crunch_small_outlines_size
Definition: tesseractclass.h:988

docqual.h

G_NEVER_CRUNCH
Definition: docqual.h:31

FREQ_DAWG_PERM
Definition: ratngs.h:254

ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:145

TBOX::bottom
int16_t bottom() const
Definition: rect.h:65

TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:384

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30

G_DODGY
Definition: docqual.h:33

tesseract::Tesseract::crunch_del_rating
double crunch_del_rating
Definition: tesseractclass.h:980

BLOCK::pdblk
PDBLK pdblk
Definition: ocrblock.h:192

STRING::length
int32_t length() const
Definition: strngs.cpp:191

tessvars.h

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235

TBOX::height
int16_t height() const
Definition: rect.h:108

tesseract::Tesseract::crunch_terrible_garbage
bool crunch_terrible_garbage
Definition: tesseractclass.h:973

tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:116

tesseract::DocQualCallbacks::word
WERD_RES * word
Definition: docqual.cpp:50

tesseract::Tesseract::crunch_pot_poor_rate
double crunch_pot_poor_rate
Definition: tesseractclass.h:977

ROW_RES::row
ROW * row
Definition: pageres.h:143

tesseract::Tesseract::crunch_pot_poor_cert
double crunch_pot_poor_cert
Definition: tesseractclass.h:978

WERD_RES::word
WERD * word
Definition: pageres.h:189