tessapi/4.0.0/a00089_source.html

 /******************************************************************
  * File:        fixspace.cpp  (Formerly fixspace.c)
  * Description: Implements a pass over the page res, exploring the alternative
  *              spacing possibilities, trying to use context to improve the
  *              word spacing
  * Author:      Phil Cheatle
  * Created:     Thu Oct 21 11:38:43 BST 1993
  *
  * (C) Copyright 1993, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "fixspace.h"
 #include <cstdint>             // for INT16_MAX, int16_t, int32_t
 #include "blobs.h"             // for TWERD, TBLOB, TESSLINE
 #include "boxword.h"           // for BoxWord
 #include "errcode.h"           // for ASSERT_HOST
 #include "host.h"              // for FALSE, TRUE
 #include "normalis.h"          // for kBlnXHeight, kBlnBaselineOffset
 #include "ocrclass.h"          // for ETEXT_DESC
 #include "pageres.h"           // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
 #include "params.h"            // for IntParam, StringParam, BoolParam, Doub...
 #include "ratngs.h"            // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
 #include "rect.h"              // for TBOX
 #include "stepblob.h"          // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
 #include "strngs.h"            // for STRING
 #include "tesseractclass.h"    // for Tesseract, TesseractStats, WordData
 #include "tessvars.h"          // for debug_fp
 #include "tprintf.h"           // for tprintf
 #include "unichar.h"           // for UNICHAR_ID
 #include "unicharset.h"        // for UNICHARSET
 #include "werd.h"              // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP

 class BLOCK;
 class ROW;

 // Sentinel score: eval_word_spacing() returns this when every word it counted
 // was also counted as done, and the permutation searches in this file stop
 // as soon as their best score equals it.
 #define PERFECT_WERDS   999
 // NOTE(review): not referenced anywhere in the visible part of this file.
 #define MAXSPACING      128      /*max expected spacing in pix */

 namespace tesseract {

 /**********************************************************************
  *  c_blob_comparator()
  *
  *  Blob comparator used to sort a blob list so that blobs are in increasing
  *  order of left edge.
  **********************************************************************/

 static int c_blob_comparator(              // sort blobs
                       const void *blob1p,  // ptr to ptr to blob1
                       const void *blob2p   // ptr to ptr to blob2
                      ) {
   const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p);
   const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p);

   return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
 }

 /**
  * Walks every row of every block in page_res and re-examines word spacing.
  *
  * A word that is not a combination and whose successor carries neither
  * W_FUZZY_NON nor W_FUZZY_SP is passed on its own to fix_sp_fp_word().
  * Otherwise the run of words linked by those flags is moved into a temporary
  * list, given to fix_fuzzy_space_list(), and the resulting list is spliced
  * back into the row. A run is left untouched if any word in it has an empty
  * cblob_list().
  *
  * @param monitor    Optional progress descriptor (may be nullptr). When set,
  *                   progress is updated to 90 + 5 * word_index / word_count
  *                   and the function returns early if the deadline has
  *                   passed or the cancel callback returns true.
  * @param word_count Divisor for the progress calculation.
  *                   NOTE(review): 0 here would divide by zero as soon as a
  *                   monitor is supplied - confirm callers never pass 0.
  * @param page_res   Page results; its word lists are modified in place.
  */
 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
                                  int32_t word_count,
                                  PAGE_RES *page_res) {
   BLOCK_RES_IT block_res_it;
   ROW_RES_IT row_res_it;
   WERD_RES_IT word_res_it_from;
   WERD_RES_IT word_res_it_to;
   WERD_RES *word_res;
   WERD_RES_LIST fuzzy_space_words;
   int16_t new_length;
   bool prevent_null_wd_fixsp;   // DON'T process blobless wds
   int32_t word_index;              // current word

   block_res_it.set_to_list(&page_res->block_res_list);
   word_index = 0;
   for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
        block_res_it.forward()) {
     row_res_it.set_to_list(&block_res_it.data()->row_res_list);
     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
          row_res_it.forward()) {
       word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
       while (!word_res_it_from.at_last()) {
         word_res = word_res_it_from.data();
         // Handle words one at a time until reaching a combination or a word
         // whose successor is flagged as following a fuzzy space/non-space.
         while (!word_res_it_from.at_last() &&
                !(word_res->combination ||
                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                          block_res_it.data()->block);
           word_res = word_res_it_from.forward();
           word_index++;
           if (monitor != nullptr) {
             monitor->ocr_alive = TRUE;
             monitor->progress = 90 + 5 * word_index / word_count;
             if (monitor->deadline_exceeded() ||
                 (monitor->cancel != nullptr &&
                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
             return;
           }
         }

         // Not at the end of the row, so word_res_it_from is the first word
         // of a fuzzy run; word_res_it_to is advanced to the run's last word.
         if (!word_res_it_from.at_last()) {
           word_res_it_to = word_res_it_from;
           prevent_null_wd_fixsp =
             word_res->word->cblob_list()->empty();
           if (check_debug_pt(word_res, 60))
             debug_fix_space_level.set_value(10);
           word_res_it_to.forward();
           word_index++;
           if (monitor != nullptr) {
             monitor->ocr_alive = TRUE;
             monitor->progress = 90 + 5 * word_index / word_count;
             if (monitor->deadline_exceeded() ||
                 (monitor->cancel != nullptr &&
                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
             return;
           }
           while (!word_res_it_to.at_last () &&
                  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                   word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
             if (check_debug_pt(word_res, 60))
               debug_fix_space_level.set_value(10);
             if (word_res->word->cblob_list()->empty())
               prevent_null_wd_fixsp = true;
             word_res = word_res_it_to.forward();
           }
           if (check_debug_pt(word_res, 60))
             debug_fix_space_level.set_value(10);
           if (word_res->word->cblob_list()->empty())
             prevent_null_wd_fixsp = true;
           if (prevent_null_wd_fixsp) {
             // A blobless word is in the run: skip the whole run unchanged.
             word_res_it_from = word_res_it_to;
           } else {
             // Detach the run, search its spacing permutations, then splice
             // the result back in and step past the words just inserted.
             fuzzy_space_words.assign_to_sublist(&word_res_it_from,
                                                 &word_res_it_to);
             fix_fuzzy_space_list(fuzzy_space_words,
                                  row_res_it.data()->row,
                                  block_res_it.data()->block);
             new_length = fuzzy_space_words.length();
             word_res_it_from.add_list_before(&fuzzy_space_words);
             for (;
                  !word_res_it_from.at_last() && new_length > 0;
                  new_length--) {
               word_res_it_from.forward();
             }
           }
           // Undo any debug level raised by check_debug_pt() above.
           if (test_pt)
             debug_fix_space_level.set_value(0);
         }
         fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                        block_res_it.data()->block);
         // Last word in row
       }
     }
   }
 }

 /**
  * Searches for the best-scoring way of joining the words in best_perm.
  *
  * The incoming list is scored with eval_word_spacing(). Unless that score is
  * already PERFECT_WERDS, a working copy is built with initialise_search()
  * and then repeatedly re-recognised, re-scored and advanced with
  * transform_to_next_perm(). Whenever the working copy beats the best score
  * so far, best_perm is replaced by a deep copy of it. The loop stops once
  * the best score equals PERFECT_WERDS or the working copy becomes empty.
  *
  * @param best_perm In: words to evaluate. Out: best permutation found.
  * @param row       Row passed through to match_current_words().
  * @param block     Block passed through to match_current_words().
  */
 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
                                      ROW *row,
                                      BLOCK* block) {
   WERD_RES_LIST trial_perm;
   bool improved = false;

   int16_t best_score = eval_word_spacing(best_perm);  // score of the input
   dump_words(best_perm, best_score, 1, improved);

   if (best_score != PERFECT_WERDS)
     initialise_search(best_perm, trial_perm);

   for (;;) {
     if (best_score == PERFECT_WERDS || trial_perm.empty())
       break;

     match_current_words(trial_perm, row, block);
     const int16_t trial_score = eval_word_spacing(trial_perm);
     dump_words(trial_perm, trial_score, 2, improved);

     if (trial_score > best_score) {
       // Adopt the working copy as the new best permutation.
       best_perm.clear();
       best_perm.deep_copy(&trial_perm, &WERD_RES::deep_copy);
       best_score = trial_score;
       improved = true;
     }
     if (trial_score < PERFECT_WERDS)
       transform_to_next_perm(trial_perm);
   }
   dump_words(best_perm, best_score, 3, improved);
 }

 }  // namespace tesseract

 /**
  * Fills new_list with deep copies of the words in src_list, leaving out any
  * word flagged as a combination. Each copy has its combination and
  * part_of_combo flags cleared, and copies are appended in source order.
  *
  * @param src_list Words to copy from.
  * @param new_list List that receives the copies.
  */
 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
   WERD_RES_IT reader(&src_list);
   WERD_RES_IT writer(&new_list);

   for (reader.mark_cycle_pt(); !reader.cycled_list(); reader.forward()) {
     WERD_RES *original = reader.data();
     if (original->combination)
       continue;

     WERD_RES *clone = WERD_RES::deep_copy(original);
     clone->combination = false;
     clone->part_of_combo = false;
     writer.add_after_then_move(clone);
   }
 }


 namespace tesseract {
 /**
  * Runs pass-2 recognition on each word in the list that is not marked
  * part_of_combo and has no box_word yet.
  *
  * @param words Words to recognise; updated in place.
  * @param row   Row used to build each WordData.
  * @param block Block used to build each WordData.
  */
 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                     BLOCK* block) {
   // The words are not being iterated through a PAGE_RES, so
   // prev_word_best_choice_ has to be maintained by hand around each call
   // to classify_word_and_language().
   prev_word_best_choice_ = nullptr;

   WERD_RES_IT word_it(&words);
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     WERD_RES *current = word_it.data();
     const bool needs_classifying =
         !current->part_of_combo && current->box_word == nullptr;
     if (needs_classifying) {
       WordData word_data(block, row, current);
       SetupWordPassN(2, &word_data);
       classify_word_and_language(2, nullptr, &word_data);
     }
     prev_word_best_choice_ = current->best_choice;
   }
 }

 /**
  * Scores a list of words for how plausible its spacing is.
  *
  * Entries flagged part_of_combo are skipped by the "find next word" loop.
  * For each remaining word, the score contribution is its reject_map length
  * if fixspace_thinks_word_done() accepts it. That contribution is held back
  * in prev_word_score and only added to the total when the next word is
  * examined (or at the end). This lets it be withheld when the junction looks
  * like a split number: a word ending in "1" followed by a digit, or a word
  * ending in a digit followed by "1" (or, for words not done, any character
  * in conflict_set_I_l_1). One extra point is added per '1' adjacent to
  * another character within a word, and likewise per punctuation character
  * when tessedit_prefer_joined_punct is set.
  *
  * NOTE(review): the do/while reads word_res_it.data() before any emptiness
  * check, so this looks like it expects a non-empty list - confirm.
  *
  * @param word_res_list Words to score.
  * @return PERFECT_WERDS if every counted word was counted as done,
  *         otherwise the accumulated score.
  */
 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
   WERD_RES_IT word_res_it(&word_res_list);
   int16_t total_score = 0;
   int16_t word_count = 0;
   int16_t done_word_count = 0;
   int16_t word_len;
   int16_t i;
   int16_t offset;
   WERD_RES *word;                 // current word
   int16_t prev_word_score = 0;
   bool prev_word_done = false;
   bool prev_char_1 = false;      // prev ch a "1/I/l"?
   bool prev_char_digit = false;  // prev ch 2..9 or 0
   bool current_char_1 = false;
   bool current_word_ok_so_far;
   STRING punct_chars = "!\"`',.:;";
   bool prev_char_punct = false;
   bool current_char_punct = false;
   bool word_done = false;

   do {
     word = word_res_it.data();
     word_done = fixspace_thinks_word_done(word);
     word_count++;
     if (word->tess_failed) {
       // A failed word cannot conflict with its predecessor: bank the
       // previous word's score and reset all carried state.
       total_score += prev_word_score;
       if (prev_word_done)
         done_word_count++;
       prev_word_score = 0;
       prev_char_1 = false;
       prev_char_digit = false;
       prev_word_done = false;
     } else {
       /*
         Can we add the prev word score and potentially count this word?
         Yes IF it didn't end in a 1 when the first char of this word is a digit
           AND it didn't end in a digit when the first char of this word is a 1
       */
       word_len = word->reject_map.length();
       current_word_ok_so_far = false;
       if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
             (prev_char_digit && (
                 (word_done &&
                  word->best_choice->unichar_lengths().string()[0] == 1 &&
                  word->best_choice->unichar_string()[0] == '1') ||
                 (!word_done && STRING(conflict_set_I_l_1).contains(
                       word->best_choice->unichar_string()[0])))))) {
         total_score += prev_word_score;
         if (prev_word_done)
           done_word_count++;
         current_word_ok_so_far = word_done;
       }

       if (current_word_ok_so_far) {
         prev_word_done = true;
         prev_word_score = word_len;
       } else {
         prev_word_done = false;
         prev_word_score = 0;
       }

       /* Add 1 to total score for every joined 1 regardless of context and
          rejtn */
       // NOTE(review): this loop indexes unichar_string() by character index
       // i, whereas the punctuation loop below steps by unichar_lengths()
       // byte offsets; the two differ when a unichar spans several bytes.
       for (i = 0, prev_char_1 = false; i < word_len; i++) {
         current_char_1 = word->best_choice->unichar_string()[i] == '1';
         if (prev_char_1 || (current_char_1 && (i > 0)))
           total_score++;
         prev_char_1 = current_char_1;
       }

       /* Add 1 to total score for every joined punctuation regardless of context
         and rejtn */
       if (tessedit_prefer_joined_punct) {
         for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
              offset += word->best_choice->unichar_lengths()[i++]) {
           current_char_punct =
             punct_chars.contains(word->best_choice->unichar_string()[offset]);
           if (prev_char_punct || (current_char_punct && i > 0))
             total_score++;
           prev_char_punct = current_char_punct;
         }
       }
       // Record what the last character of this word is, for the junction
       // test against the next word. The empty-bodied loop leaves offset at
       // the start of the last unichar.
       prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
       for (i = 0, offset = 0; i < word_len - 1;
            offset += word->best_choice->unichar_lengths()[i++]);
       prev_char_1 =
           ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
            || (!word_done && STRING(conflict_set_I_l_1).contains(
                    word->best_choice->unichar_string()[offset])));
     }
     /* Find next word */
     do {
       word_res_it.forward();
     } while (word_res_it.data()->part_of_combo);
   } while (!word_res_it.at_first());
   // Bank the score held back for the final word.
   total_score += prev_word_score;
   if (prev_word_done)
     done_word_count++;
   if (done_word_count == word_count)
     return PERFECT_WERDS;
   else
     return total_score;
 }

 /**
  * Tests whether the unichar at char_position in the word's best choice is a
  * digit or, for a NUMBER_PERM choice only, one of the characters in
  * numeric_punctuation.
  *
  * @param word          Word whose best_choice is inspected.
  * @param char_position Index of the unichar to test; its byte offset in
  *                      unichar_string() is found by summing the preceding
  *                      entries of unichar_lengths().
  * @return true if the character counts as numeric.
  */
 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
   const STRING &lengths = word->best_choice->unichar_lengths();
   const char *text = word->best_choice->unichar_string().string();

   // Convert the unichar index into a byte offset.
   int index = 0;
   int byte_offset = 0;
   while (index < char_position) {
     byte_offset += lengths[index];
     ++index;
   }

   if (word->uch_set->get_isdigit(text + byte_offset, lengths[index]))
     return true;

   return word->best_choice->permuter() == NUMBER_PERM &&
          STRING(numeric_punctuation).contains(text[byte_offset]);
 }

 }  // namespace tesseract


 /**
  * Advances a word list to its next spacing permutation by closing the
  * narrowest gap(s).
  *
  * The first pass measures the horizontal gap between each pair of
  * consecutive words that are not part_of_combo and records the minimum. If
  * no gap could be measured, the list is cleared to signal that the search
  * is finished. Otherwise the second pass joins every adjacent pair whose gap
  * is <= that minimum. The left-hand word is either an existing combination
  * or gets a new combination WERD_RES inserted before it, and the right-hand
  * word is merged into that combination. The combination's results are
  * cleared.
  *
  * @param words Word list, modified in place (or emptied on termination).
  */
 void transform_to_next_perm(WERD_RES_LIST &words) {
   WERD_RES_IT word_it(&words);
   WERD_RES_IT prev_word_it(&words);
   WERD_RES *word;
   WERD_RES *prev_word;
   WERD_RES *combo;
   WERD *copy_word;
   int16_t prev_right = -INT16_MAX;
   TBOX box;
   int16_t gap;
   int16_t min_gap = INT16_MAX;

   // Pass 1: find the smallest gap between consecutive visible words.
   // prev_right == -INT16_MAX means "no previous word seen yet".
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     word = word_it.data();
     if (!word->part_of_combo) {
       box = word->word->bounding_box();
       if (prev_right > -INT16_MAX) {
         gap = box.left() - prev_right;
         if (gap < min_gap)
           min_gap = gap;
       }
       prev_right = box.right();
     }
   }
   if (min_gap < INT16_MAX) {
     prev_right = -INT16_MAX;        // back to start
     word_it.set_to_list(&words);
     // Note: we can't use cycle_pt due to inserted combos at start of list.
     for (; (prev_right == -INT16_MAX) || !word_it.at_first();
          word_it.forward()) {
       word = word_it.data();
       if (!word->part_of_combo) {
         box = word->word->bounding_box();
         if (prev_right > -INT16_MAX) {
           gap = box.left() - prev_right;
           if (gap <= min_gap) {
             // Join this word onto the word at prev_word_it.
             prev_word = prev_word_it.data();
             if (prev_word->combination) {
               combo = prev_word;
             } else {
               /* Make a new combination and insert before
                * the first word being joined. */
               copy_word = new WERD;
               *copy_word = *(prev_word->word);
               // deep copy
               combo = new WERD_RES(copy_word);
               combo->combination = TRUE;
               combo->x_height = prev_word->x_height;
               prev_word->part_of_combo = true;
               prev_word_it.add_before_then_move(combo);
             }
             // The combination takes its end-of-line flag from the word
             // being joined on.
             combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
             if (word->combination) {
               combo->word->join_on(word->word);
               // Move blobs to combo
               // old combo no longer needed
               delete word_it.extract();
             } else {
               // Copy current wd to combo
               combo->copy_on(word);
               word->part_of_combo = true;
             }
             // The merged word has to be recognised again.
             combo->done = FALSE;
             combo->ClearResults();
           } else {
             prev_word_it = word_it;  // catch up
           }
         }
         prev_right = box.right();
       }
     }
   } else {
     words.clear();  // signal termination
   }
 }

 namespace tesseract {
 /**
  * Debug trace of a word permutation; does nothing unless
  * debug_fix_space_level is above 0.
  *
  * With mode 1 the text of the permutation is first saved into
  * stats_.dump_words_str. At debug levels above 1 every call prints the
  * permutation with a heading chosen by mode (1 EXTRACTED, 2 TESTED,
  * 3 RETURNED). At level 1 the permutation is printed only when improved is
  * true, alongside the text saved by the earlier mode-1 call. Entries flagged
  * part_of_combo are never printed.
  *
  * @param perm     Words to print.
  * @param score    Score shown in the heading at debug levels above 1.
  * @param mode     1, 2 or 3, as described above.
  * @param improved Whether a better permutation than the original was found.
  */
 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
                            int16_t mode, bool improved) {
   WERD_RES_IT word_res_it(&perm);

   if (debug_fix_space_level <= 0)
     return;

   // Mode 1 snapshots the starting text for a later "FIX SPACING" report.
   if (mode == 1) {
     stats_.dump_words_str = "";
     for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
          word_res_it.forward()) {
       WERD_RES *entry = word_res_it.data();
       if (entry->part_of_combo)
         continue;
       stats_.dump_words_str += entry->best_choice->unichar_string();
       stats_.dump_words_str += ' ';
     }
   }

   const bool trace_everything = debug_fix_space_level > 1;
   if (!trace_everything && !improved)
     return;

   // Heading.
   if (trace_everything) {
     if (mode == 1) {
       tprintf("EXTRACTED (%d): \"", score);
     } else if (mode == 2) {
       tprintf("TESTED (%d): \"", score);
     } else if (mode == 3) {
       tprintf("RETURNED (%d): \"", score);
     }
   } else {
     tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
   }

   // Each visible word followed by its permuter code.
   for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
        word_res_it.forward()) {
     WERD_RES *entry = word_res_it.data();
     if (entry->part_of_combo)
       continue;
     tprintf("%s/%1d ",
             entry->best_choice->unichar_string().string(),
             static_cast<int>(entry->best_choice->permuter()));
   }
   tprintf("\"\n");
 }

 /**
  * Decides whether a word counts as "done" for spacing purposes.
  *
  * A word whose done flag is set always qualifies. Otherwise it qualifies
  * only if all of the following hold:
  *   - fixsp_done_mode is above 0;
  *   - the word is tess_accepted, or the mode is 2 and nothing in its
  *     reject_map is rejected, or the mode is 3;
  *   - its best choice text contains no space;
  *   - its permuter is SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, USER_DAWG_PERM or
  *     NUMBER_PERM.
  *
  * @param word Word to test.
  * @return true if the word is considered done.
  */
 bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
   if (word->done)
     return true;

   /*
     Use all the standard pass 2 conditions for mode 5 in set_done() in
     reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
     CARE WHETHER WE HAVE of/at on/an etc.
   */
   if (fixsp_done_mode <= 0)
     return false;

   const bool acceptable =
       word->tess_accepted ||
       (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
       fixsp_done_mode == 3;
   if (!acceptable)
     return false;

   if (strchr(word->best_choice->unichar_string().string(), ' ') != nullptr)
     return false;

   const auto permuter = word->best_choice->permuter();
   return permuter == SYSTEM_DAWG_PERM ||
          permuter == FREQ_DAWG_PERM ||
          permuter == USER_DAWG_PERM ||
          permuter == NUMBER_PERM;
 }


 /**
  * Tries to improve a single word by breaking it at noise blobs.
  *
  * Only words flagged W_DONT_CHOP that are not W_REP_CHAR, not combinations
  * and not part of a combination are considered, and only if
  * worst_noise_blob() finds a candidate blob. The word is then extracted
  * into a one-element list, processed by fix_noisy_space_list(), and the
  * resulting list is spliced back in at the iterator's position.
  *
  * @param word_res_it Iterator positioned at the word; moved forward past
  *                    all but one of the words that replace it.
  * @param row         Row passed through to fix_noisy_space_list().
  * @param block       Block passed through to fix_noisy_space_list().
  */
 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
                                BLOCK* block) {
   WERD_RES *target = word_res_it.data();

   const bool eligible = !target->word->flag(W_REP_CHAR) &&
                         !target->combination &&
                         !target->part_of_combo &&
                         target->word->flag(W_DONT_CHOP);
   if (!eligible)
     return;

   float ignored_score;  // only the blob index matters here
   const int16_t noise_blob = worst_noise_blob(target, &ignored_score);
   if (noise_blob < 0)
     return;

   if (debug_fix_space_level > 1) {
     tprintf("FP fixspace working on \"%s\"\n",
             target->best_choice->unichar_string().string());
   }
   target->word->rej_cblob_list()->sort(c_blob_comparator);

   // Move the word into a list of its own for the search.
   WERD_RES_LIST sub_word_list;
   WERD_RES_IT sub_word_list_it(&sub_word_list);
   sub_word_list_it.add_after_stay_put(word_res_it.extract());
   fix_noisy_space_list(sub_word_list, row, block);

   // Splice the result back and step over all but one of the new words.
   int16_t remaining = sub_word_list.length();
   word_res_it.add_list_before(&sub_word_list);
   while (!word_res_it.at_last() && remaining > 1) {
     word_res_it.forward();
     remaining--;
   }
 }

 /**
  * Searches for the best-scoring way of splitting a word at noise blobs.
  *
  * The incoming list is scored with fp_eval_word_spacing(). A working copy
  * of its first word is made and broken at its noisiest blob with
  * break_noisiest_blob_word(). The copy is then repeatedly re-recognised,
  * re-scored and broken again. Whenever it beats the best score so far,
  * best_perm is replaced by a deep copy of it. The loop stops once the best
  * score equals PERFECT_WERDS or the working copy becomes empty.
  *
  * @param best_perm In: list whose first word is examined. Out: best split.
  * @param row       Row passed through to match_current_words().
  * @param block     Block passed through to match_current_words().
  */
 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
                                      BLOCK* block) {
   WERD_RES_IT best_perm_it(&best_perm);
   WERD_RES_LIST trial_perm;
   WERD_RES_IT trial_perm_it(&trial_perm);
   bool improved = false;

   int16_t best_score = fp_eval_word_spacing(best_perm);  // score of input
   dump_words(best_perm, best_score, 1, improved);

   // deep_copy only duplicates the underlying WERD when the combination flag
   // is set, so raise the flag just for the duration of the copy.
   WERD_RES *seed_word = best_perm_it.data();
   seed_word->combination = true;
   trial_perm_it.add_to_end(WERD_RES::deep_copy(seed_word));
   seed_word->combination = false;

   break_noisiest_blob_word(trial_perm);

   while (!(best_score == PERFECT_WERDS || trial_perm.empty())) {
     match_current_words(trial_perm, row, block);
     const int16_t trial_score = fp_eval_word_spacing(trial_perm);
     dump_words(trial_perm, trial_score, 2, improved);

     if (trial_score > best_score) {
       // Adopt the working copy as the new best split.
       best_perm.clear();
       best_perm.deep_copy(&trial_perm, &WERD_RES::deep_copy);
       best_score = trial_score;
       improved = true;
     }
     if (trial_score < PERFECT_WERDS) {
       break_noisiest_blob_word(trial_perm);
     }
   }
   dump_words(best_perm, best_score, 3, improved);
 }


 /**
  * Splits one word of the list in two at its noisiest blob.
  *
  * worst_noise_blob() is called on every word; the word whose candidate blob
  * has the lowest noise score is chosen. If no word has a candidate, the
  * list is cleared to signal that the search is finished. Otherwise the
  * blobs before the noise blob are moved into a new WERD, the noise blob
  * itself is deleted, and the remaining blobs stay in the original word.
  * Rejected blobs whose left edge lies before the noise blob's left edge
  * move to the new word too. The new word is inserted before the original,
  * and the original's recognition results are cleared.
  *
  * @param words Word list, modified in place (or emptied on termination).
  */
 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
   WERD_RES_IT word_it(&words);
   WERD_RES_IT worst_word_it;
   float worst_noise_score = 9999;
   int worst_blob_index = -1;     // Noisiest blob of noisiest wd
   int blob_index;                // of wds noisiest blob
   float noise_score;             // of wds noisiest blob
   WERD_RES *word_res;
   C_BLOB_IT blob_it;
   C_BLOB_IT rej_cblob_it;
   C_BLOB_LIST new_blob_list;
   C_BLOB_IT new_blob_it;
   C_BLOB_IT new_rej_cblob_it;
   WERD *new_word;
   int16_t start_of_noise_blob;
   int16_t i;

   // noise_score is only read when blob_index > -1, i.e. when
   // worst_noise_blob() reported a candidate.
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     blob_index = worst_noise_blob(word_it.data(), &noise_score);
     if (blob_index > -1 && worst_noise_score > noise_score) {
       worst_noise_score = noise_score;
       worst_blob_index = blob_index;
       worst_word_it = word_it;
     }
   }
   if (worst_blob_index < 0) {
     words.clear();          // signal termination
     return;
   }

   /* Now split the worst_word_it */

   word_res = worst_word_it.data();

   /* Move blobs before noise blob to a new bloblist */

   new_blob_it.set_to_list(&new_blob_list);
   blob_it.set_to_list(word_res->word->cblob_list());
   for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
     new_blob_it.add_after_then_move(blob_it.extract());
   }
   start_of_noise_blob = blob_it.data()->bounding_box().left();
   delete blob_it.extract();     // throw out noise blob

   // The new word holds the left-hand part, so it cannot end the line and
   // the remaining right-hand part cannot begin it.
   new_word = new WERD(&new_blob_list, word_res->word);
   new_word->set_flag(W_EOL, FALSE);
   word_res->word->set_flag(W_BOL, FALSE);
   word_res->word->set_blanks(1);  // After break

   // Hand over the rejected blobs that lie to the left of the noise blob.
   new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
   rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
   for (;
        (!rej_cblob_it.empty() &&
         (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
        rej_cblob_it.forward()) {
     new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
   }

   // NOTE(review): combination is presumably set so that a later
   // WERD_RES::deep_copy also copies new_word (see the "kludge" comment in
   // fix_noisy_space_list) - confirm.
   WERD_RES* new_word_res = new WERD_RES(new_word);
   new_word_res->combination = true;
   worst_word_it.add_before_then_move(new_word_res);

   word_res->ClearResults();
 }

 /**
  * Finds the blob of a word that looks most like noise and is far enough
  * from both ends of the word to be a sensible place to split.
  *
  * Each blob gets a noise score: kBlnXHeight * 0.8 if its reject_map entry is
  * accepted, otherwise blob_noise_score(). Scanning in from each end of the
  * word, blobs are skipped until fixsp_non_noise_limit blobs scoring at
  * least kBlnXHeight * 0.8 have been passed. The blobs between the two stop
  * points are the candidates, and the lowest-scoring candidate below
  * kBlnXHeight * fixsp_small_outlines_size is returned.
  *
  * Only blobs present in both box_word and rebuild_word are considered. The
  * original code scored that many blobs but then scanned box_word->length()
  * entries, which read uninitialised scores whenever rebuild_word was the
  * shorter of the two.
  *
  * @param word_res          Word to examine.
  * @param worst_noise_score Out: score of the returned blob. Left untouched
  *                          when the function returns -1 before the
  *                          candidate range has been established.
  * @return Index of the noisiest candidate blob, or -1 if the word has no
  *         rebuild_word or box_word, has fewer than 5 blobs, has too few
  *         non-noise blobs, or has no candidate below the small-outline limit.
  */
 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
                                   float *worst_noise_score) {
   float noise_score[512];
   int i;
   int min_noise_blob;            // 1st contender
   int max_noise_blob;            // last contender
   int non_noise_count;
   int worst_blob;                // Worst blob
   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
   float non_noise_limit = kBlnXHeight * 0.8;

   if (word_res->rebuild_word == nullptr || word_res->box_word == nullptr)
     return -1;  // Can't handle cube words.

   // Normalised.
   int blob_count = word_res->box_word->length();
   ASSERT_HOST(blob_count <= 512);
   // Never look beyond the blobs that will actually receive a score below.
   const int rebuilt_count = word_res->rebuild_word->NumBlobs();
   if (rebuilt_count < blob_count)
     blob_count = rebuilt_count;
   if (blob_count < 5)
     return -1;                   // too short to split

   /* Get the noise scores for all blobs */

   #ifndef SECURE_NAMES
   if (debug_fix_space_level > 5)
     tprintf("FP fixspace Noise metrics for \"%s\": ",
             word_res->best_choice->unichar_string().string());
   #endif

   for (i = 0; i < blob_count; i++) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     if (word_res->reject_map[i].accepted())
       noise_score[i] = non_noise_limit;
     else
       noise_score[i] = blob_noise_score(blob);

     if (debug_fix_space_level > 5)
       tprintf("%1.1f ", noise_score[i]);
   }
   if (debug_fix_space_level > 5)
     tprintf("\n");

   /* Now find the worst one which is far enough away from the end of the word */

   // Scan from the left until enough non-noise blobs have been passed; the
   // loop leaves i just beyond the last one counted.
   non_noise_count = 0;
   for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;

   min_noise_blob = i;

   // Same again from the right; i ends just before the last one counted.
   non_noise_count = 0;
   for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
        i--) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;

   max_noise_blob = i;

   if (min_noise_blob > max_noise_blob)
     return -1;

   // Pick the lowest score in the candidate range that is below small_limit.
   *worst_noise_score = small_limit;
   worst_blob = -1;
   for (i = min_noise_blob; i <= max_noise_blob; i++) {
     if (noise_score[i] < *worst_noise_score) {
       worst_blob = i;
       *worst_noise_score = noise_score[i];
     }
   }
   return worst_blob;
 }

 /**
  * Computes a noise score for a blob; worst_noise_blob() treats the blob
  * with the lowest score as the noisiest.
  *
  * The score starts as the largest width or height of any outline's
  * bounding box. It is doubled if the blob has more than 5 outlines, and
  * then halved if the blob's bounding box has bottom() above
  * kBlnBaselineOffset * 4 or top() below kBlnBaselineOffset / 2. All
  * arithmetic is done in int16_t before conversion to float.
  *
  * @param blob Blob to score.
  * @return The noise score.
  */
 float Tesseract::blob_noise_score(TBLOB *blob) {
   int16_t num_outlines = 0;
   int16_t biggest_extent = 0;

   for (TESSLINE *outline = blob->outlines; outline != nullptr;
        outline = outline->next) {
     ++num_outlines;
     const TBOX outline_box = outline->bounding_box();
     const int16_t extent = outline_box.height() > outline_box.width()
                                ? outline_box.height()
                                : outline_box.width();
     if (extent > biggest_extent)
       biggest_extent = extent;
   }

   if (num_outlines > 5) {
     // penalise LOTS of outlines
     biggest_extent *= 2;
   }

   const TBOX blob_box = blob->bounding_box();
   const bool sits_high_or_low =
       blob_box.bottom() > kBlnBaselineOffset * 4 ||
       blob_box.top() < kBlnBaselineOffset / 2;
   if (sits_high_or_low) {
     // Be laxer about blobs lying entirely high or low
     biggest_extent /= 2;
   }

   return biggest_extent;
 }
 }  // namespace tesseract

 /**
  * Prints a debug summary of a word: bounding box, best choice text, blob
  * counts, reject map, and the tess_accepted and done flags. The
  * per-character reject map detail is compiled in but switched off by the
  * local show_map_detail constant.
  *
  * @param word Word to describe. Its best_choice, rebuild_word and box_word
  *             are dereferenced without checks.
  */
 void fixspace_dbg(WERD_RES *word) {
   const bool show_map_detail = false;
   const STRING &text = word->best_choice->unichar_string();

   TBOX box = word->word->bounding_box();
   box.print();
   tprintf(" \"%s\" ", text.string());
   tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
           word->word->cblob_list()->length(),
           word->rebuild_word->NumBlobs(),
           word->box_word->length());
   word->reject_map.print(debug_fp);
   tprintf("\n");

   if (show_map_detail) {
     tprintf("\"%s\"\n", text.string());
     for (int16_t pos = 0; text[pos] != '\0'; pos++) {
       tprintf("**** \"%c\" ****\n", text[pos]);
       word->reject_map[pos].full_print(debug_fp);
     }
   }

   const char *accepted_label = word->tess_accepted ? "TRUE" : "FALSE";
   const char *done_label = word->done ? "TRUE" : "FALSE";
   tprintf("Tess Accepted: %s\n", accepted_label);
   tprintf("Done flag: %s\n\n", done_label);
 }


 namespace tesseract {
 /**
  * Scores a list of words for the noise-blob splitting search.
  *
  * Words without a rebuild_word are ignored. A word contributes only if it
  * is done, tess_accepted, has a SYSTEM/FREQ/USER dawg permuter, or
  * safe_dict_word() returns a positive value for it. For each blob of such a
  * word, the score drops by one if the matching unichar is a space or the
  * blob's noise score is below kBlnXHeight * fixsp_small_outlines_size.
  * Otherwise it rises by one if the blob's reject_map entry is accepted.
  *
  * @param word_res_list Words to score.
  * @return The accumulated score, never less than 0.
  */
 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
   const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
   int16_t score = 0;

   WERD_RES_IT word_it(&word_res_list);
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     WERD_RES *word = word_it.data();
     if (word->rebuild_word == nullptr)
       continue;  // Can't handle cube words.

     const bool trusted =
         word->done ||
         word->tess_accepted ||
         word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
         word->best_choice->permuter() == FREQ_DAWG_PERM ||
         word->best_choice->permuter() == USER_DAWG_PERM ||
         safe_dict_word(word) > 0;
     if (!trusted)
       continue;

     const int num_blobs = word->rebuild_word->NumBlobs();
     const UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
     for (int16_t i = 0; i < word->best_choice->length() && i < num_blobs;
          ++i) {
       TBLOB *blob = word->rebuild_word->blobs[i];
       const bool suspicious =
           word->best_choice->unichar_id(i) == space ||
           blob_noise_score(blob) < small_limit;
       if (suspicious) {
         score -= 1;  // penalise possibly erroneous non-space
       } else if (word->reject_map[i].accepted()) {
         score++;
       }
     }
   }
   return score < 0 ? 0 : score;
 }

 }  // namespace tesseract
TESSLINE
Definition: blobs.h:187

tesseract::Tesseract::tessedit_prefer_joined_punct
bool tessedit_prefer_joined_punct
Definition: tesseractclass.h:1005

WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:288

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260

TESSLINE::next
TESSLINE * next
Definition: blobs.h:265

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:35

SYSTEM_DAWG_PERM
Definition: ratngs.h:251

tesseract::Tesseract::digit_or_numeric_punct
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:373

TRUE
#define TRUE
Definition: capi.h:51

tesseractclass.h

tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481

pageres.h

debug_fp
FILE * debug_fp
Definition: tessvars.cpp:24

errcode.h

TBOX::print
void print() const
Definition: rect.h:278

ratngs.h

blobs.h

host.h

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:287

ETEXT_DESC::cancel_this
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:132

tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:137

W_FUZZY_NON
Definition: werd.h:42

STRING::string
const char * string() const
Definition: strngs.cpp:196

WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649

REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:335

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:159

WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:346

TBOX
Definition: rect.h:34

tesseract::Tesseract::fix_sp_fp_word
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:565

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:432

REJMAP::length
int32_t length() const
Definition: rejctmap.h:223

W_EOL
Definition: werd.h:35

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:24

unicharset.h

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:25

WERD_RES
Definition: pageres.h:169

WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127

tesseract::Tesseract::eval_word_spacing
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:269

strngs.h

PAGE_RES
Definition: pageres.h:77

WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93

tesseract::Tesseract::dump_words
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:479

ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:127

PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81

tesseract::Tesseract::blob_noise_score
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:790

fixspace_dbg
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:825

tprintf.h

tesseract::Tesseract::numeric_punctuation
char * numeric_punctuation
Definition: tesseractclass.h:1009

TBOX::width
int16_t width() const
Definition: rect.h:115

tesseract::Tesseract::fixsp_small_outlines_size
double fixsp_small_outlines_size
Definition: tesseractclass.h:1004

TBOX::left
int16_t left() const
Definition: rect.h:72

WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:548

TBOX::top
int16_t top() const
Definition: rect.h:58

tesseract::TesseractStats::dump_words_str
STRING dump_words_str
Definition: tesseractclass.h:138

normalis.h

W_FUZZY_SP
Definition: werd.h:41

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507

W_DONT_CHOP
Definition: werd.h:39

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612

tesseract::Tesseract::debug_fix_space_level
int debug_fix_space_level
Definition: tesseractclass.h:1007

PERFECT_WERDS
#define PERFECT_WERDS
Definition: fixspace.cpp:46

REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:229

FALSE
#define FALSE
Definition: capi.h:52

ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:164

tesseract::Tesseract::fixspace_thinks_word_done
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:533

WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:296

tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:919

WERD::set_blanks
void set_blanks(uint8_t new_blanks)
Definition: werd.h:105

tesseract::Tesseract::fixsp_done_mode
int fixsp_done_mode
Definition: tesseractclass.h:1006

WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660

unichar.h

WERD
Definition: werd.h:59

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

ROW
Definition: ocrrow.h:36

tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:478

tesseract::Tesseract::fix_fuzzy_space_list
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:175

werd.h

W_BOL
Definition: werd.h:34

BLOCK
Definition: ocrblock.h:30

WERD_CHOICE::length
int length() const
Definition: ratngs.h:303

WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:98

stepblob.h

tesseract
Definition: baseapi.cpp:94

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

WERD_RES::done
bool done
Definition: pageres.h:298

NUMBER_PERM
Definition: ratngs.h:249

WERD_RES::combination
bool combination
Definition: pageres.h:334

WERD_RES::x_height
float x_height
Definition: pageres.h:311

WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1153

STRING
Definition: strngs.h:45

initialise_search
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:207

C_BLOB
Definition: stepblob.h:37

STRING::contains
bool contains(const char c) const
Definition: strngs.cpp:187

tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182

tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:78

tesseract::Tesseract::worst_noise_blob
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710

params.h

C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:255

W_REP_CHAR
Definition: werd.h:40

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:206

tesseract::BoxWord::length
int length() const
Definition: boxword.h:83

tesseract::Tesseract::match_current_words
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:226

ETEXT_DESC::cancel
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:129

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:541

tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1083

tesseract::Tesseract::fix_noisy_space_list
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:599

WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:335

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868

USER_DAWG_PERM
Definition: ratngs.h:253

tesseract::WordData
Definition: tesseractclass.h:147

TBOX::right
int16_t right() const
Definition: rect.h:79

fixspace.h

TBLOB
Definition: blobs.h:268

WERD::join_on
void join_on(WERD *other)
Definition: werd.cpp:210

ETEXT_DESC
Definition: ocrclass.h:119

FREQ_DAWG_PERM
Definition: ratngs.h:254

ocrclass.h

TBOX::bottom
int16_t bottom() const
Definition: rect.h:65

transform_to_next_perm
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:402

TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:384

ETEXT_DESC::progress
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:122

tessvars.h

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235

TBOX::height
int16_t height() const
Definition: rect.h:108

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266

tesseract::Tesseract::fixsp_non_noise_limit
int fixsp_non_noise_limit
Definition: tesseractclass.h:1003

tesseract::Tesseract::break_noisiest_blob_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:645

REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:323

tesseract::Tesseract::fp_eval_word_spacing
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:860

boxword.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338

rect.h

WERD_RES::word
WERD * word
Definition: pageres.h:189