tessapi/4.0.0/a00341_source.html

 /**********************************************************************
  * File:        pageres.cpp  (Formerly page_res.c)
  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
  *              and an iterator class to iterate over the words.
  * Main purposes:
  *              Easy way to iterate over the words without a 3-nested loop.
  *              Holds data used during word recognition.
  *              Holds information about alternative spacing paths.
  * Author:      Phil Cheatle
  * Created:     Tue Sep 22 08:42:49 BST 1992
  *
  * (C) Copyright 1992, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "pageres.h"
 #include <cassert>         // for assert
 #include <cstdint>         // for INT32_MAX
 #include <cstring>         // for strlen
 #include "blamer.h"        // for BlamerBundle
 #include "blobs.h"         // for TWERD, TBLOB
 #include "boxword.h"       // for BoxWord
 #include "errcode.h"       // for ASSERT_HOST
 #include "host.h"          // for TRUE, FALSE
 #include "ocrblock.h"      // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
 #include "ocrrow.h"        // for ROW, ROW_IT
 #include "pdblock.h"       // for PDBLK
 #include "polyblk.h"       // for POLY_BLOCK
 #include "publictypes.h"   // for OcrEngineMode, OEM_LSTM_ONLY
 #include "seam.h"          // for SEAM, start_seam_list
 #include "stepblob.h"      // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
 #include "tesscallback.h"  // for NewPermanentTessCallback, TessResultCallback2
 #include "tprintf.h"       // for tprintf

 struct Pix;

 // List-support macro invocations for the result classes used in this file.
 // NOTE(review): these presumably expand to the out-of-line list helpers that
 // pair with the ELISTIZEH/CLISTIZEH declarations in pageres.h - confirm
 // against elst.h / clst.h.
 ELISTIZE (BLOCK_RES)
 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)

 // Multiplier applied to the difference of two adjust factors when computing
 // the certainty threshold that determines the ambiguity of a word.
 static const double kStopperAmbiguityThresholdGain = 8.0;
 // Constant that is subtracted from the scaled difference in that same
 // threshold computation.
 static const double kStopperAmbiguityThresholdOffset = 1.5;
 // Max number of broken pieces to associate.
 const int kWordrecMaxNumJoinChunks = 4;
 // Max ratio of word box height to line size to allow it to be processed as
 // a line with other words.
 const double kMaxWordSizeRatio = 1.25;
 // Max ratio of line box height to line size to allow a new word to be added.
 const double kMaxLineSizeRatio = 1.25;
 // Max ratio of word gap to line size to allow a new word to be added.
 const double kMaxWordGapRatio = 2.0;

 // Returns the certainty-difference threshold used to decide which words to
 // keep, given the adjustment factors f1 and f2 of the two words:
 //   (f2 - f1) * kStopperAmbiguityThresholdGain -
 //       kStopperAmbiguityThresholdOffset
 // TODO(rays) This is horrible. Replace with an enhance params training model.
 static double StopperAmbigThreshold(double f1, double f2) {
   const double factor_delta = f2 - f1;
   const double scaled_delta = factor_delta * kStopperAmbiguityThresholdGain;
   return scaled_delta - kStopperAmbiguityThresholdOffset;
 }

 /*************************************************************************
  * PAGE_RES::PAGE_RES
  *
  * Constructor for page results.
  * Appends one BLOCK_RES to block_res_list for every BLOCK found in
  * the_block_list (in list order) and stores prev_word_best_choice_ptr.
  *************************************************************************/
 PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST* the_block_list,
                    WERD_CHOICE** prev_word_best_choice_ptr) {
   Init();
   BLOCK_IT source_it(the_block_list);
   BLOCK_RES_IT results_it(&block_res_list);
   source_it.mark_cycle_pt();
   while (!source_it.cycled_list()) {
     BLOCK* source_block = source_it.data();
     results_it.add_to_end(new BLOCK_RES(merge_similar_words, source_block));
     source_it.forward();
   }
   prev_word_best_choice = prev_word_best_choice_ptr;
 }

 /*************************************************************************
  * BLOCK_RES::BLOCK_RES
  *
  * Constructor for BLOCK results.
  * Resets the counters and font fields, remembers the_block, and appends one
  * ROW_RES to row_res_list for every ROW in the block's row list.
  *************************************************************************/

 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
   ROW_IT source_it(the_block->row_list());
   ROW_RES_IT results_it(&row_res_list);

   block = the_block;

   // Counters start from zero.
   row_count = 0;
   char_count = 0;
   rej_count = 0;

   // No font has been assigned yet.
   font_assigned = false;
   font_class = -1;
   x_height = -1.0;
   italic = false;
   bold = false;

   source_it.mark_cycle_pt();
   while (!source_it.cycled_list()) {
     ROW* source_row = source_it.data();
     results_it.add_to_end(new ROW_RES(merge_similar_words, source_row));
     source_it.forward();
   }
 }

 /*************************************************************************
  * ROW_RES::ROW_RES
  *
  * Constructor for ROW results
  *
  * Creates one WERD_RES per WERD in the_row and appends it to word_res_list.
  * A run of words that are to be joined additionally gets a "combination"
  * WERD_RES, which is added to the list just ahead of the first word of the
  * run. When merge_similar_words is true the join decision is computed here
  * from the word boxes and recorded in the following word's W_FUZZY_NON
  * flag; otherwise the existing W_FUZZY_NON flags are read and obeyed.
  *************************************************************************/

 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
   WERD_IT word_it(the_row->word_list());
   WERD_RES_IT word_res_it(&word_res_list);
   WERD_RES *combo = nullptr;        // current combination of fuzzies
   WERD *copy_word;

   char_count = 0;
   rej_count = 0;
   whole_word_rej_count = 0;

   row = the_row;
   // True when the decision made on the previous iteration was to join the
   // current word onto the running combination.
   bool add_next_word = false;
   // Bounding box accumulated over the words of the current run.
   TBOX union_box;
   // Reference line size for all the ratio tests below.
   float line_height = the_row->x_height() + the_row->ascenders() -
       the_row->descenders();
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     WERD_RES* word_res = new WERD_RES(word_it.data());
     word_res->x_height = the_row->x_height();
     if (add_next_word) {
       ASSERT_HOST(combo != nullptr);
       // We are adding this word to the combination.
       word_res->part_of_combo = TRUE;
       combo->copy_on(word_res);
     } else if (merge_similar_words) {
       // This word starts a potential new run: it may take followers only if
       // it is not a repeated-char word and is not too tall for the line.
       union_box = word_res->word->bounding_box();
       add_next_word = !word_res->word->flag(W_REP_CHAR) &&
           union_box.height() <= line_height * kMaxWordSizeRatio;
       word_res->odd_size = !add_next_word;
     }
     // NOTE(review): on the last word this presumably wraps round to the
     // first word of the row's list - confirm against the list iterator.
     WERD* next_word = word_it.data_relative(1);
     if (merge_similar_words) {
       if (add_next_word && !next_word->flag(W_REP_CHAR)) {
         // Next word will be added on if all of the following are true:
         // Not a rep char.
         // Box height small enough.
         // Union box height small enough.
         // Horizontal gap small enough.
         TBOX next_box = next_word->bounding_box();
         int prev_right = union_box.right();
         union_box += next_box;
         if (next_box.height() > line_height * kMaxWordSizeRatio ||
             union_box.height() > line_height * kMaxLineSizeRatio ||
             next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
           add_next_word = false;
         }
       }
       // Record the decision on the next word itself.
       next_word->set_flag(W_FUZZY_NON, add_next_word);
     } else {
       // Not merging here: take the decision from the existing flag.
       add_next_word = next_word->flag(W_FUZZY_NON);
     }
     if (add_next_word) {
       if (combo == nullptr) {
         // First word of a run: create the combination from a copy of this
         // word and put it in the list ahead of the word's own WERD_RES.
         copy_word = new WERD;
         *copy_word = *(word_it.data());  // deep copy
         combo = new WERD_RES(copy_word);
         combo->x_height = the_row->x_height();
         combo->combination = TRUE;
         word_res_it.add_to_end(combo);
       }
       word_res->part_of_combo = TRUE;
     } else {
       // The run (if any) ends with this word.
       combo = nullptr;
     }
     word_res_it.add_to_end(word_res);
   }
 }


 // Copy assignment. Clears *this and then copies the recognition state from
 // source:
 //  - word is deep-copied when source is a combination, otherwise the WERD
 //    pointer is shared with source.
 //  - bln_boxes, chopped_word, rebuild_word, box_word, the best_choices list,
 //    raw_choice, ep_choice and blamer_bundle are cloned when present.
 //  - The seam array and the ratings matrix are NOT copied.
 // Self-assignment is a no-op: without the guard, Clear() would wipe the very
 // data that is about to be copied.
 // Returns *this.
 WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
   if (this == &source) {
     return *this;
   }
   this->ELIST_LINK::operator=(source);
   Clear();
   if (source.combination) {
     word = new WERD;
     *word = *(source.word);      // deep copy
   } else {
     word = source.word;          // pt to same word
   }
   if (source.bln_boxes != nullptr)
     bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
   if (source.chopped_word != nullptr)
     chopped_word = new TWERD(*source.chopped_word);
   if (source.rebuild_word != nullptr)
     rebuild_word = new TWERD(*source.rebuild_word);
   // TODO(rays) Do we ever need to copy the seam_array?
   blob_row = source.blob_row;
   denorm = source.denorm;
   if (source.box_word != nullptr)
     box_word = new tesseract::BoxWord(*source.box_word);
   best_state = source.best_state;
   correct_text = source.correct_text;
   blob_widths = source.blob_widths;
   blob_gaps = source.blob_gaps;
   // None of the uses of operator= require the ratings matrix to be copied,
   // so don't as it would be really slow.

   // Copy the cooked choices, preserving their order.
   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
   WERD_CHOICE_IT wc_dest_it(&best_choices);
   for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
     const WERD_CHOICE *choice = wc_it.data();
     wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
   }
   // best_choice always refers to the head of our own list, never source's.
   if (!wc_dest_it.empty()) {
     wc_dest_it.move_to_first();
     best_choice = wc_dest_it.data();
   } else {
     best_choice = nullptr;
   }

   if (source.raw_choice != nullptr) {
     raw_choice = new WERD_CHOICE(*source.raw_choice);
   } else {
     raw_choice = nullptr;
   }
   if (source.ep_choice != nullptr) {
     ep_choice = new WERD_CHOICE(*source.ep_choice);
   } else {
     ep_choice = nullptr;
   }
   reject_map = source.reject_map;
   combination = source.combination;
   part_of_combo = source.part_of_combo;
   CopySimpleFields(source);
   if (source.blamer_bundle != nullptr) {
     blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
   }
   return *this;
 }

 // Copies, member by member, the plain fields of source that can simply be
 // assigned when making one WERD_RES from another. Nothing is cloned here:
 // pointer-valued fields are copied as pointers.
 void WERD_RES::CopySimpleFields(const WERD_RES& source) {
   // Recognition status.
   done = source.done;
   tess_failed = source.tess_failed;
   tess_accepted = source.tess_accepted;
   tess_would_adapt = source.tess_would_adapt;
   unlv_crunch_mode = source.unlv_crunch_mode;
   reject_spaces = source.reject_spaces;

   // Style and font information.
   bold = source.bold;
   italic = source.italic;
   small_caps = source.small_caps;
   odd_size = source.odd_size;
   fontinfo = source.fontinfo;
   fontinfo2 = source.fontinfo2;
   fontinfo_id_count = source.fontinfo_id_count;
   fontinfo_id2_count = source.fontinfo_id2_count;

   // Size metrics.
   x_height = source.x_height;
   caps_height = source.caps_height;
   baseline_shift = source.baseline_shift;
   guessed_x_ht = source.guessed_x_ht;
   guessed_caps_ht = source.guessed_caps_ht;

   // Context pointers.
   uch_set = source.uch_set;
   tesseract = source.tesseract;
 }

 // Prepares a blank (default constructed) WERD_RES for another recognition
 // attempt on the word of an already recognized source: the word pointer and
 // the simple fields are taken from source, and if source has a blamer bundle
 // a new bundle is created that receives a copy of its truth.
 // Call SetupFor*Recognition afterwards to complete the setup.
 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
   word = source.word;
   CopySimpleFields(source);
   if (source.blamer_bundle == nullptr) {
     return;
   }
   blamer_bundle = new BlamerBundle();
   blamer_bundle->CopyTruth(*source.blamer_bundle);
 }

 // Sets up the members used in recognition: bln_boxes, chopped_word,
 // seam_array, denorm.  Returns false if
 // the word is empty and sets up fake results.  If use_body_size is
 // true and row->body_size is set, then body_size will be used for
 // blob normalization instead of xheight + ascrise. This flag is for
 // those languages that are using CJK pitch model and thus it has to
 // be true if and only if tesseract->textord_use_cjk_fp_model is
 // true.
 // If allow_detailed_fx is true, the feature extractor will receive fine
 // precision outline information, allowing smoother features and better
 // features on low resolution images.
 // The norm_mode_hint sets the default mode for normalization in absence
 // of any of the above flags.
 // norm_box is used to override the word bounding box to determine the
 // normalization scale and offset.
 // Returns false if the word is empty and sets up fake results.
 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
                                    tesseract::Tesseract* tess, Pix* pix,
                                    int norm_mode,
                                    const TBOX* norm_box,
                                    bool numeric_mode,
                                    bool use_body_size,
                                    bool allow_detailed_fx,
                                    ROW *row, const BLOCK* block) {
   tesseract::OcrEngineMode norm_mode_hint =
       static_cast<tesseract::OcrEngineMode>(norm_mode);
   tesseract = tess;
   POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
   if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
        word->cblob_list()->empty()) ||
       (pb != nullptr && !pb->IsText())) {
     // Empty words occur when all the blobs have been moved to the rej_blobs
     // list, which seems to occur frequently in junk.
     SetupFake(unicharset_in);
     word->set_flag(W_REP_CHAR, false);
     return false;
   }
   ClearResults();
   SetupWordScript(unicharset_in);
   chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
   float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
                      ? row->body_size() : x_height;
   chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
                             word_xheight, baseline_shift, numeric_mode,
                             norm_mode_hint, norm_box, &denorm);
   blob_row = row;
   SetupBasicsFromChoppedWord(unicharset_in);
   SetupBlamerBundle();
   int num_blobs = chopped_word->NumBlobs();
   ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
   tess_failed = false;
   return true;
 }

 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
 // accumulators from a made chopped word.  We presume the fields are already
 // empty.
 // chopped_word must already be set; it is the input to every step below.
 // unicharset_in is not used by this function.
 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
   // Boxes of the chopped word's blobs.
   bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
   // Fill seam_array from the chopped word.
   start_seam_list(chopped_word, &seam_array);
   // Recompute blob_widths and blob_gaps from the chopped word.
   SetupBlobWidthsAndGaps();
   ClearWordChoices();
 }

 // Sets up the members used in recognition for an empty recognition result:
 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
   ClearResults();
   SetupWordScript(unicharset_in);
   chopped_word = new TWERD;
   rebuild_word = new TWERD;
   bln_boxes = new tesseract::BoxWord;
   box_word = new tesseract::BoxWord;
   int blob_count = word->cblob_list()->length();
   if (blob_count > 0) {
     BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
     // For non-text blocks, just pass any blobs through to the box_word
     // and call the word failed with a fake classification.
     C_BLOB_IT b_it(word->cblob_list());
     int blob_id = 0;
     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
       TBOX box = b_it.data()->bounding_box();
       box_word->InsertBox(box_word->length(), box);
       fake_choices[blob_id++] = new BLOB_CHOICE;
     }
     FakeClassifyWord(blob_count, fake_choices);
     delete [] fake_choices;
   } else {
     WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in);
     word->make_bad();
     LogNewRawChoice(word);
     // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
     LogNewCookedChoice(1, false, word);
   }
   tess_failed = true;
   done = true;
 }

 // Remembers uch as this result's unicharset and stamps the word with the
 // unicharset's default script id, plus the flags that say whether the script
 // has an x-height and whether the default script is Latin.
 void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
   uch_set = &uch;
   const int default_script = uch.default_sid();
   const bool default_is_latin = default_script == uch.latin_sid();
   word->set_script_id(default_script);
   word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
   word->set_flag(W_SCRIPT_IS_LATIN, default_is_latin);
 }

 // If there is a blamer_bundle, sets up its normalized truth word from the
 // (already initialized) denorm. Does nothing when there is no bundle.
 void WERD_RES::SetupBlamerBundle() {
   if (blamer_bundle == nullptr) {
     return;
   }
   blamer_bundle->SetupNormTruthWord(denorm);
 }

 // Computes the blob_widths and blob_gaps from the chopped_word.
 void WERD_RES::SetupBlobWidthsAndGaps() {
   blob_widths.truncate(0);
   blob_gaps.truncate(0);
   int num_blobs = chopped_word->NumBlobs();
   for (int b = 0; b < num_blobs; ++b) {
     TBLOB *blob = chopped_word->blobs[b];
     TBOX box = blob->bounding_box();
     blob_widths.push_back(box.width());
     if (b + 1 < num_blobs) {
       blob_gaps.push_back(
           chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
     }
   }
 }

 // Updates internal data to account for a new SEAM (chop) at the given
 // blob_number. Fixes the ratings matrix and states in the choices, as well
 // as the blob widths and gaps.
 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
   // Insert the seam into the SEAMS array.
   seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
   seam_array.insert(seam, blob_number);
   if (ratings != nullptr) {
     // Expand the ratings matrix.
     ratings = ratings->ConsumeAndMakeBigger(blob_number);
     // Fix all the segmentation states.
     if (raw_choice != nullptr)
       raw_choice->UpdateStateForSplit(blob_number);
     WERD_CHOICE_IT wc_it(&best_choices);
     for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
       WERD_CHOICE* choice = wc_it.data();
       choice->UpdateStateForSplit(blob_number);
     }
     SetupBlobWidthsAndGaps();
   }
 }

 // Returns true if all the word choices except the first have adjust_factors
 // worse than the given threshold.
 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
   // The choices are not changed by this iteration.
   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
   for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
     WERD_CHOICE* choice = wc_it.data();
     if (choice->adjust_factor() <= threshold)
       return false;
   }
   return true;
 }

 // Returns true if the current word is ambiguous: either best_choices does
 // not hold exactly one answer, or the best choice reports that a dangerous
 // ambig was found.
 bool WERD_RES::IsAmbiguous() {
   if (!best_choices.singleton()) {
     return true;
   }
   return best_choice->dangerous_ambig_found();
 }

 // Returns true if the ratings matrix size matches the sum of each of the
 // segmentation states.
 bool WERD_RES::StatesAllValid() {
   int ratings_dim = ratings->dimension();
   if (raw_choice->TotalOfStates() != ratings_dim) {
     tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
             raw_choice->TotalOfStates(), ratings_dim);
     return false;
   }
   WERD_CHOICE_IT it(&best_choices);
   int index = 0;
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
     WERD_CHOICE* choice = it.data();
     if (choice->TotalOfStates() != ratings_dim) {
       tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
               index, choice->TotalOfStates(), ratings_dim);
       return false;
     }
   }
   return true;
 }

 // Prints a list of words found if debug is true or the word result matches
 // the word_to_debug.
 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
   if (debug ||
       (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
        best_choice->unichar_string() == STRING(word_to_debug))) {
     if (raw_choice != nullptr)
       raw_choice->print("\nBest Raw Choice");

     WERD_CHOICE_IT it(&best_choices);
     int index = 0;
     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
       WERD_CHOICE* choice = it.data();
       STRING label;
       label.add_str_int("\nCooked Choice #", index);
       choice->print(label.string());
     }
   }
 }

 // Prints the accepted/adaptable/done flags followed by the best choice
 // (labelled with msg), or by a placeholder when there is no best choice.
 void WERD_RES::DebugTopChoice(const char* msg) const {
   tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
           tess_accepted, tess_would_adapt, done);
   if (best_choice != nullptr) {
     best_choice->print(msg);
     return;
   }
   tprintf("<Null choice>\n");
 }

 // Removes from best_choices all choices which are not within a reasonable
 // range of the best choice.
 // Does nothing if there is no best choice or if it is the only choice.
 // With debug_level >= 2 the best choice and every discarded choice are
 // printed.
 // TODO(rays) incorporate the information used here into the params training
 // re-ranker, in place of this heuristic that is based on the previous
 // adjustment factor.
 void WERD_RES::FilterWordChoices(int debug_level) {
   if (best_choice == nullptr || best_choices.singleton())
     return;

   if (debug_level >= 2)
     best_choice->print("\nFiltering against best choice");
   WERD_CHOICE_IT it(&best_choices);
   // index counts the alternatives only: 0 is the choice after the first one.
   int index = 0;
   // Skip the first choice and visit the rest until back at the start.
   for (it.forward(); !it.at_first(); it.forward(), ++index) {
     WERD_CHOICE* choice = it.data();
     // Certainty difference below which a differing blob condemns the choice.
     float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
                                             choice->adjust_factor());
     // i, j index the blob choice in choice, best_choice.
     // chunk is an index into the chopped_word blobs (AKA chunks).
     // Since the two words may use different segmentations of the chunks, we
     // iterate over the chunks to find out whether a comparable blob
     // classification is much worse than the best result.
     int i = 0, j = 0, chunk = 0;
     // Each iteration of the while deals with 1 chunk. On entry choice_chunk
     // and best_chunk are the indices of the first chunk in the NEXT blob,
     // i.e. we don't have to increment i, j while chunk < choice_chunk and
     // best_chunk respectively.
     int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
     while (i < choice->length() && j < best_choice->length()) {
       if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
           choice->certainty(i) - best_choice->certainty(j) < threshold) {
         if (debug_level >= 2) {
           choice->print("WorstCertaintyDiffWorseThan");
           tprintf(
               "i %d j %d Choice->Blob[i].Certainty %.4g"
               " WorstOtherChoiceCertainty %g Threshold %g\n",
               i, j, choice->certainty(i), best_choice->certainty(j), threshold);
           tprintf("Discarding bad choice #%d\n", index);
         }
         // Remove the choice from the list and free it. The enclosing for
         // loop then moves the iterator on to the next choice.
         delete it.extract();
         break;
       }
       ++chunk;
       // If needed, advance choice_chunk to keep up with chunk.
       while (choice_chunk < chunk && ++i < choice->length())
         choice_chunk += choice->state(i);
       // If needed, advance best_chunk to keep up with chunk.
       while (best_chunk < chunk && ++j < best_choice->length())
         best_chunk += best_choice->state(j);
     }
   }
 }

 // Fills thresholds[i], for each blob i of best_choice, with an adaption
 // threshold clamped to [min_rating, max_rating].
 // For every chopped chunk covered by blob i, the raw_choice blob covering the
 // same chunk is compared with the best choice: where the unichar ids differ,
 // the raw blob's certainty is accumulated. If any chunk differed, the
 // threshold is
 //   (average accumulated certainty / -certainty_scale) * (1 - rating_margin)
 // otherwise it is max_rating.
 // thresholds must have room for best_choice->length() values.
 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
                                          float min_rating,
                                          float max_rating,
                                          float rating_margin,
                                          float* thresholds) {
   int chunk = 0;      // Next chunk to examine.
   int end_chunk = 0;  // One past the last chunk of best choice blob i.
   int end_raw_chunk = raw_choice->state(0);
   int raw_blob = 0;
   for (int i = 0; i < best_choice->length(); i++, thresholds++) {
     // Advance the chunk window to cover best choice blob i. Previously the
     // window was set once from state(0) and never moved, so only blob 0 was
     // ever compared and all later blobs silently received max_rating.
     end_chunk += best_choice->state(i);
     float avg_rating = 0.0f;
     int num_error_chunks = 0;

     // For each chunk in best choice blob i, count non-matching raw results.
     while (chunk < end_chunk) {
       // Move on to the raw blob that covers this chunk. The length check
       // keeps raw_blob in range should the two state totals ever disagree.
       if (chunk >= end_raw_chunk && raw_blob + 1 < raw_choice->length()) {
         ++raw_blob;
         end_raw_chunk += raw_choice->state(raw_blob);
       }
       if (best_choice->unichar_id(i) !=
           raw_choice->unichar_id(raw_blob)) {
         avg_rating += raw_choice->certainty(raw_blob);
         ++num_error_chunks;
       }
       ++chunk;
     }

     if (num_error_chunks > 0) {
       avg_rating /= num_error_chunks;
       *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
     } else {
       *thresholds = max_rating;
     }

     // Clamp to the permitted range.
     if (*thresholds > max_rating)
       *thresholds = max_rating;
     if (*thresholds < min_rating)
       *thresholds = min_rating;
   }
 }

 // Keeps a copy of word_choice as the raw choice if there is no raw choice
 // yet or if word_choice has a lower rating than the current one. The copy's
 // permuter is set to TOP_CHOICE_PERM. word_choice itself is not consumed.
 // Returns true if the word_choice was the new best.
 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) {
   const bool have_raw = raw_choice != nullptr;
   if (have_raw && !(word_choice->rating() < raw_choice->rating())) {
     return false;
   }
   delete raw_choice;
   raw_choice = new WERD_CHOICE(*word_choice);
   raw_choice->set_permuter(TOP_CHOICE_PERM);
   return true;
 }

 // Consumes word_choice by adding it to best_choices, (taking ownership) if
 // the certainty for word_choice is within some distance of the best choice
 // in best_choices, or by deleting the word_choice and returning false.
 // The best_choices list is kept in sorted order by rating. Duplicates are
 // removed, and the list is kept no longer than max_num_choices in length.
 // If debug is true, the outcome is printed.
 // Returns true if the word_choice is still a valid pointer.
 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
                                   WERD_CHOICE* word_choice) {
   if (best_choice != nullptr) {
     // Throw out obviously bad choices to save some work.
     // TODO(rays) Get rid of this! This piece of code produces different
     // results according to the order in which words are found, which is an
     // undesirable behavior. It would be better to keep all the choices and
     // prune them later when more information is available.
     float max_certainty_delta =
         StopperAmbigThreshold(best_choice->adjust_factor(),
                               word_choice->adjust_factor());
     // Never allow the delta to be above -kStopperAmbiguityThresholdOffset.
     if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
       max_certainty_delta = -kStopperAmbiguityThresholdOffset;
     if (word_choice->certainty() - best_choice->certainty() <
         max_certainty_delta) {
       if (debug) {
         STRING bad_string;
         word_choice->string_and_lengths(&bad_string, nullptr);
         tprintf("Discarding choice \"%s\" with an overly low certainty"
                 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
                 bad_string.string(), word_choice->certainty(),
                 best_choice->certainty(),
                 max_certainty_delta + best_choice->certainty());
       }
       delete word_choice;
       return false;
     }
   }

   // Insert in the list in order of increasing rating, but knock out worse
   // string duplicates.
   WERD_CHOICE_IT it(&best_choices);
   const STRING& new_str = word_choice->unichar_string();
   bool inserted = false;
   // Number of choices kept so far, including word_choice once inserted.
   int num_choices = 0;
   if (!it.empty()) {
     do {
       WERD_CHOICE* choice = it.data();
       if (choice->rating() > word_choice->rating() && !inserted) {
         // Time to insert.
         it.add_before_stay_put(word_choice);
         inserted = true;
         if (num_choices == 0)
           best_choice = word_choice;  // This is the new best.
         ++num_choices;
       }
       if (choice->unichar_string() == new_str) {
         if (inserted) {
           // New is better.
           delete it.extract();
         } else {
           // Old is better.
           if (debug) {
             tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
                     new_str.string(), word_choice->rating(), choice->rating());
           }
           delete word_choice;
           return false;
         }
       } else {
         // A different string: keep it only while the list is short enough.
         ++num_choices;
         if (num_choices > max_num_choices)
           delete it.extract();
       }
       it.forward();
     } while (!it.at_first());
   }
   // Worse than everything already in the list: append if there is room.
   if (!inserted && num_choices < max_num_choices) {
     it.add_to_end(word_choice);
     inserted = true;
     if (num_choices == 0)
       best_choice = word_choice;  // This is the new best.
   }
   if (debug) {
     if (inserted)
       tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
     else
       tprintf("Poor");
     word_choice->print(" Word Choice");
   }
   if (!inserted) {
     delete word_choice;
     return false;
   }
   return true;
 }


 // Simple helper moves the ownership of the pointer data from src to dest,
 // first deleting anything in dest, and nulling out src afterwards.
 // Safe against aliasing: if dest and src are the same slot the object is
 // kept (not deleted and lost), and if the two slots already point at the
 // same object it is not deleted, so *dest never ends up dangling.
 template<class T> static void MovePointerData(T** dest, T**src) {
   T* incoming = *src;
   *src = nullptr;
   // Only the object being replaced may be destroyed, never the incoming one.
   if (*dest != incoming) {
     delete *dest;
   }
   *dest = incoming;
 }

 // Prints a brief list of all the best choices.
 void WERD_RES::PrintBestChoices() const {
   STRING alternates_str;
   WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     if (!it.at_first()) alternates_str += "\", \"";
     alternates_str += it.data()->unichar_string();
   }
   tprintf("Alternates for \"%s\": {\"%s\"}\n",
           best_choice->unichar_string().string(), alternates_str.string());
 }

 // Returns the horizontal extent of the blobs from start_blob to last_blob
 // inclusive: the sum of their widths plus the gaps that lie between them.
 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
   int total = 0;
   // Widths of every blob in the range.
   for (int blob = start_blob; blob <= last_blob; ++blob) {
     total += blob_widths[blob];
   }
   // Gaps that follow every blob in the range except the last.
   for (int blob = start_blob; blob < last_blob; ++blob) {
     total += blob_gaps[blob];
   }
   return total;
 }
 // Returns the gap between the blob at blob_index and the one after it, or 0
 // if blob_index is not a valid index into blob_gaps.
 int WERD_RES::GetBlobsGap(int blob_index) {
   const bool in_range = blob_index >= 0 && blob_index < blob_gaps.size();
   if (!in_range) {
     return 0;
   }
   return blob_gaps[blob_index];
 }

 // Looks up, in the ratings MATRIX cell that belongs to position index of
 // the best choice word, the BLOB_CHOICE whose unichar_id matches the best
 // choice at that position.
 // Borrowed pointer, so do not delete. Returns nullptr if index is out of
 // range; may also return nullptr if no BLOB_CHOICE matches.
 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const {
   const bool valid_index = index >= 0 && index < best_choice->length();
   if (!valid_index) {
     return nullptr;
   }
   BLOB_CHOICE_LIST* cell_choices = GetBlobChoices(index);
   return FindMatchingChoice(best_choice->unichar_id(index), cell_choices);
 }

 // Returns the BLOB_CHOICE_LIST that the best choice word selects from the
 // ratings MATRIX for the given index.
 // Borrowed pointer, so do not delete.
 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
   BLOB_CHOICE_LIST* cell_choices = best_choice->blob_choices(index, ratings);
   return cell_choices;
 }

 // Moves the results fields from word to this. This takes ownership of all
 // the data, so src can be destructed.
 // Note that the parameter named word hides the member of the same name
 // throughout this function; this->word is not touched here.
 void WERD_RES::ConsumeWordResults(WERD_RES* word) {
   denorm = word->denorm;
   blob_row = word->blob_row;
   // Each MovePointerData deletes our old object, takes word's and nulls
   // word's pointer.
   MovePointerData(&chopped_word, &word->chopped_word);
   MovePointerData(&rebuild_word, &word->rebuild_word);
   MovePointerData(&box_word, &word->box_word);
   // Free our own seams, then take over word's seam pointers.
   // NOTE(review): clear() on word's array presumably drops the pointers
   // without deleting the SEAMs (unlike delete_data_pointers) - confirm.
   seam_array.delete_data_pointers();
   seam_array = word->seam_array;
   word->seam_array.clear();
   best_state.move(&word->best_state);
   correct_text.move(&word->correct_text);
   blob_widths.move(&word->blob_widths);
   blob_gaps.move(&word->blob_gaps);
   // Release the contents of our old matrix before it is replaced.
   if (ratings != nullptr) ratings->delete_matrix_pointers();
   MovePointerData(&ratings, &word->ratings);
   // best_choice is a choice held in word->best_choices, which is moved
   // into our own best_choices list just below.
   best_choice = word->best_choice;
   MovePointerData(&raw_choice, &word->raw_choice);
   best_choices.clear();
   WERD_CHOICE_IT wc_it(&best_choices);
   wc_it.add_list_after(&word->best_choices);
   reject_map = word->reject_map;
   if (word->blamer_bundle != nullptr) {
     // NOTE: assert compiles to nothing when NDEBUG is defined, in which
     // case a null blamer_bundle is dereferenced on the next line.
     assert(blamer_bundle != nullptr);
     blamer_bundle->CopyResults(*(word->blamer_bundle));
   }
   CopySimpleFields(*word);
 }

 // Makes choice the best choice, rebuilds the best state and the box word
 // from it, installs a reject map of matching length, marks the word as
 // done, accepted and adaptable, and sets the script positions.
 // choice must be from the current best_choices list.
 void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) {
   best_choice = choice;
   RebuildBestState();
   SetupBoxWord();
   // Make up a fake reject map of the right length to keep the
   // rejection pass happy.
   reject_map.initialise(best_state.length());
   tess_would_adapt = true;
   tess_accepted = true;
   done = true;
   SetScriptPositions();
 }

 // Builds the rebuild_word and sets the best_state from the chopped_word and
 // the best_choice->state.
 // Any previous rebuild_word is deleted. best_choice must not be null.
 void WERD_RES::RebuildBestState() {
   ASSERT_HOST(best_choice != nullptr);
   delete rebuild_word;
   rebuild_word = new TWERD;
   // Make sure there is a seam array to join and break pieces with.
   if (seam_array.empty())
     start_seam_list(chopped_word, &seam_array);
   best_state.truncate(0);
   // Index into chopped_word->blobs of the first piece of the current blob.
   int start = 0;
   for (int i = 0; i < best_choice->length(); ++i) {
     // Number of chopped pieces that make up blob i of the best choice.
     int length = best_choice->state(i);
     best_state.push_back(length);
     // NOTE(review): JoinPieces presumably merges pieces start..start+length-1
     // so that blobs[start] holds the whole character while it is copied, and
     // BreakPieces undoes that afterwards - confirm against seam.h.
     if (length > 1) {
       SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
                        start + length - 1);
     }
     TBLOB* blob = chopped_word->blobs[start];
     rebuild_word->blobs.push_back(new TBLOB(*blob));
     if (length > 1) {
       SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
                         start + length - 1);
     }
     start += length;
   }
 }

 // Makes rebuild_word a copy of chopped_word and sets up the output box_word
 // from it. A fake best_state (one piece per box) and an empty correct_text
 // entry per box are appended as well.
 void WERD_RES::CloneChoppedToRebuild() {
   delete rebuild_word;
   rebuild_word = new TWERD(*chopped_word);
   SetupBoxWord();
   const int num_boxes = box_word->length();
   best_state.reserve(num_boxes);
   correct_text.reserve(num_boxes);
   for (int remaining = num_boxes; remaining > 0; --remaining) {
     best_state.push_back(1);
     correct_text.push_back(STRING(""));
   }
 }

 // Sets/replaces the box_word with one made from the rebuild_word.
 void WERD_RES::SetupBoxWord() {
   delete box_word;
   rebuild_word->ComputeBoundingBoxes();
   box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
   box_word->ClipToOriginalWord(denorm.block(), word);
 }

 // Sets up the script positions in the output best_choice using the best_choice
 // to get the unichars, and the unicharset to get the target positions.
 // This simply forwards to WERD_CHOICE::SetScriptPositions, passing the
 // small_caps flag and the chopped_word. best_choice is dereferenced without
 // a check, so it must not be nullptr here.
 void WERD_RES::SetScriptPositions() {
   best_choice->SetScriptPositions(small_caps, chopped_word);
 }
 // Forces every blob of every word choice (the raw choice and all of the
 // best choices) to the given position. (When a sub/superscript is
 // recognized as a separate word, it falls victim to the rule that a whole
 // word cannot be sub or superscript, so this function overrides that
 // problem.)
 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
   raw_choice->SetAllScriptPositions(position);
   WERD_CHOICE_IT choice_it(&best_choices);
   choice_it.mark_cycle_pt();
   while (!choice_it.cycled_list()) {
     choice_it.data()->SetAllScriptPositions(position);
     choice_it.forward();
   }
 }

 // Classifies the word with some already-calculated BLOB_CHOICEs.
 // The choices are an array of blob_count pointers to BLOB_CHOICE,
 // providing a single classifier result for each blob.
 // The BLOB_CHOICEs are consumed and the word takes ownership.
 // The number of blobs in the box_word must match blob_count.
 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
   // Setup the WERD_RES.
   ASSERT_HOST(box_word != nullptr);
   ASSERT_HOST(blob_count == box_word->length());
   ClearWordChoices();
   ClearRatings();
   ratings = new MATRIX(blob_count, 1);
   for (int c = 0; c < blob_count; ++c) {
     BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
     BLOB_CHOICE_IT choice_it(choice_list);
     choice_it.add_after_then_move(choices[c]);
     ratings->put(c, c, choice_list);
   }
   FakeWordFromRatings(TOP_CHOICE_PERM);
   reject_map.initialise(blob_count);
   best_state.init_to_size(blob_count, 1);
   done = true;
 }

 // Builds a WERD_CHOICE from the first entry of each list on the leading
 // diagonal of the ratings matrix and logs it as both the raw choice and a
 // cooked choice. A diagonal cell that is missing or empty contributes
 // UNICHAR_SPACE with rating INT32_MAX and certainty -INT32_MAX.
 void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
   const int num_blobs = ratings->dimension();
   WERD_CHOICE* fake_choice = new WERD_CHOICE(uch_set, num_blobs);
   fake_choice->set_permuter(permuter);
   for (int diag = 0; diag < num_blobs; ++diag) {
     // Defaults used when the cell has nothing to offer.
     UNICHAR_ID top_id = UNICHAR_SPACE;
     float top_rating = INT32_MAX;
     float top_certainty = -INT32_MAX;
     BLOB_CHOICE_LIST* cell = ratings->get(diag, diag);
     const bool has_result = cell != nullptr && !cell->empty();
     if (has_result) {
       BLOB_CHOICE_IT cell_it(cell);
       BLOB_CHOICE* top = cell_it.data();
       top_id = top->unichar_id();
       top_rating = top->rating();
       top_certainty = top->certainty();
     }
     fake_choice->append_unichar_id_space_allocated(top_id, 1, top_rating,
                                                    top_certainty);
   }
   LogNewRawChoice(fake_choice);
   // Ownership of fake_choice taken by word here.
   LogNewCookedChoice(1, false, fake_choice);
 }

 // Replaces correct_text with one string per character of best_choice, each
 // being the unichar string for that character's id (for adaption/training).
 // best_choice must not be nullptr.
 void WERD_RES::BestChoiceToCorrectText() {
   correct_text.clear();
   ASSERT_HOST(best_choice != nullptr);
   int pos = 0;
   while (pos < best_choice->length()) {
     const char* unichar_str =
         uch_set->id_to_unichar(best_choice->unichar_id(pos));
     correct_text.push_back(STRING(unichar_str));
     ++pos;
   }
 }

 // Walks the best_choice and merges each pair of adjacent blobs for which
 // the permanent callback class_cb returns something other than
 // INVALID_UNICHAR_ID, AND the permanent callback box_cb is nullptr or
 // returns true for the pair's boxes. The merged blob takes the class
 // returned by class_cb. Both callbacks are deleted before returning.
 // Returns true if anything was merged.
 bool WERD_RES::ConditionalBlobMerge(
     TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
     TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb) {
   ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
   bool any_merged = false;
   for (int pos = 0; pos + 1 < best_choice->length(); ++pos) {
     const UNICHAR_ID merged_id = class_cb->Run(
         best_choice->unichar_id(pos), best_choice->unichar_id(pos + 1));
     if (merged_id == INVALID_UNICHAR_ID) continue;
     if (box_cb != nullptr &&
         !box_cb->Run(box_word->BlobBox(pos), box_word->BlobBox(pos + 1))) {
       continue;
     }
     // Raw choice should not be fixed.
     best_choice->set_unichar_id(merged_id, pos);
     any_merged = true;
     MergeAdjacentBlobs(pos);
     // Grow the ratings band if the merged cell falls outside it.
     const MATRIX_COORD& cell = best_choice->MatrixCoord(pos);
     if (!cell.Valid(*ratings)) {
       ratings->IncreaseBandSize(cell.row + 1 - cell.col);
     }
     BLOB_CHOICE_LIST* cell_choices = GetBlobChoices(pos);
     if (FindMatchingChoice(merged_id, cell_choices) != nullptr) continue;
     // No existing result for the merged class: insert a fake one at the
     // front of the list.
     BLOB_CHOICE* fake_result = new BLOB_CHOICE;
     fake_result->set_unichar_id(merged_id);
     BLOB_CHOICE_IT cell_it(cell_choices);
     cell_it.add_before_then_move(fake_result);
   }
   delete class_cb;
   delete box_cb;
   return any_merged;
 }

 // Merges the blob at index with the one at index + 1 and keeps the
 // dependent data in step: the reject map (only when its length currently
 // matches best_choice), the best_choice, the rebuild_word, the box_word and
 // the best_state.
 void WERD_RES::MergeAdjacentBlobs(int index) {
   const int next = index + 1;
   const int merge_end = index + 2;
   if (reject_map.length() == best_choice->length()) {
     reject_map.remove_pos(index);
   }
   best_choice->remove_unichar_id(next);
   rebuild_word->MergeBlobs(index, merge_end);
   box_word->MergeBoxes(index, merge_end);
   if (next < best_state.length()) {
     // Fold the piece count of the removed entry into the surviving one.
     best_state[index] += best_state[next];
     best_state.remove(next);
   }
 }

 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
 // training data.

 // Utility function for fix_quotes.
 // Returns non-zero if the character at the start of signed_str, whose UTF-8
 // encoding is length bytes long, is a single quote character: ASCII ' or `,
 // or one of the curved quotes U+2018 / U+2019 (e2 80 98 / e2 80 99).
 static int is_simple_quote(const char* signed_str, int length) {
   // Compare as unsigned bytes so values >= 0x80 match on any platform.
   const unsigned char* bytes =
       reinterpret_cast<const unsigned char*>(signed_str);
   if (length == 1) {
     // Standard 1 byte quotes.
     return bytes[0] == '\'' || bytes[0] == '`';
   }
   if (length == 3) {
     // UTF-8 3 bytes curved quotes; they differ only in the last byte.
     return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
            (bytes[2] == 0x98 || bytes[2] == 0x99);
   }
   return false;
 }

 // Callback helper for fix_quotes. Returns the id of the double quote
 // character if both arguments are simple quotes (see is_simple_quote),
 // otherwise INVALID_UNICHAR_ID.
 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
   const char* first = uch_set->id_to_unichar(id1);
   const char* second = uch_set->id_to_unichar(id2);
   if (!is_simple_quote(first, strlen(first)) ||
       !is_simple_quote(second, strlen(second))) {
     return INVALID_UNICHAR_ID;
   }
   return uch_set->unichar_to_id("\"");
 }

 // Changes pairs of adjacent single quotes into one double quote, provided
 // the double quote exists in the unicharset and is enabled.
 void WERD_RES::fix_quotes() {
   const bool double_quote_allowed =
       uch_set->contains_unichar("\"") &&
       uch_set->get_enabled(uch_set->unichar_to_id("\""));
   if (!double_quote_allowed) {
     return;  // Don't create it if it is disallowed.
   }
   // No box test is needed for quotes, so box_cb is nullptr.
   ConditionalBlobMerge(NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
                        nullptr);
 }

 // Callback helper for fix_hyphens. Returns the UNICHAR_ID of - if both
 // arguments are single-byte characters that are each either - or ~,
 // otherwise INVALID_UNICHAR_ID.
 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
   const char* first = uch_set->id_to_unichar(id1);
   const char* second = uch_set->id_to_unichar(id2);
   if (strlen(first) != 1 || strlen(second) != 1) {
     return INVALID_UNICHAR_ID;
   }
   const bool first_is_dash = first[0] == '-' || first[0] == '~';
   const bool second_is_dash = second[0] == '-' || second[0] == '~';
   if (!first_is_dash || !second_is_dash) {
     return INVALID_UNICHAR_ID;
   }
   return uch_set->unichar_to_id("-");
 }

 // Callback helper for fix_hyphens. Returns true if box2 starts at or before
 // the right edge of box1, i.e. the boxes touch or overlap horizontally
 // (assuming both on the same textline, are in order and a chopped em dash.)
 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
   return box2.left() <= box1.right();
 }

 // Changes pairs of hyphens into a single hyphen when their bounding boxes
 // touch; typically a long dash which has been segmented. Does nothing if
 // the hyphen is missing from the unicharset or is disabled.
 void WERD_RES::fix_hyphens() {
   const bool hyphen_allowed =
       uch_set->contains_unichar("-") &&
       uch_set->get_enabled(uch_set->unichar_to_id("-"));
   if (!hyphen_allowed) {
     return;  // Don't create it if it is disallowed.
   }
   ConditionalBlobMerge(
       NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
       NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap));
 }

 // Callback helper for merge_tess_fails. Returns the space id if both
 // arguments are the space character, otherwise INVALID_UNICHAR_ID.
 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
   if (id1 != id2) {
     return INVALID_UNICHAR_ID;
   }
   if (id1 != uch_set->unichar_to_id(" ")) {
     return INVALID_UNICHAR_ID;
   }
   return id1;
 }

 // Changes pairs of tess failures (adjacent spaces in the best_choice) to a
 // single one. If anything was merged, checks that the reject map and the
 // box_word still have the same length as the best_choice.
 void WERD_RES::merge_tess_fails() {
   const bool merged_any = ConditionalBlobMerge(
       NewPermanentTessCallback(this, &WERD_RES::BothSpaces), nullptr);
   if (!merged_any) {
     return;
   }
   int len = best_choice->length();
   ASSERT_HOST(reject_map.length() == len);
   ASSERT_HOST(box_word->length() == len);
 }

 // Returns true if the collection of count pieces, starting at start, are all
 // natural connected components, ie there are no real chops involved.
 // Only the count - 1 seams between the pieces are examined; indices that
 // fall outside seam_array and nullptr seams are ignored.
 bool WERD_RES::PiecesAllNatural(int start, int count) const {
   const int seam_limit = start + count - 1;
   for (int s = start; s < seam_limit; ++s) {
     if (s < 0 || s >= seam_array.size()) continue;
     SEAM* seam = seam_array[s];
     // A seam with any split is a real chop.
     if (seam != nullptr && seam->HasAnySplits()) {
       return false;
     }
   }
   return true;
 }


 // Destructor. All cleanup is delegated to Clear(), which releases the
 // word (for combinations), the blamer bundle and all recognition results.
 WERD_RES::~WERD_RES () {
   Clear();
 }

 // Resets every field that is not an owned pointer to its default value.
 void WERD_RES::InitNonPointers() {
   // Recognition status.
   done = false;
   tess_failed = false;
   tess_accepted = false;
   tess_would_adapt = false;
   reject_spaces = false;
   unlv_crunch_mode = CR_NONE;

   // Combination bookkeeping.
   combination = false;
   part_of_combo = false;

   // Font and style attributes.
   small_caps = false;
   odd_size = false;
   italic = FALSE;
   bold = FALSE;
   fontinfo_id_count = 0;
   fontinfo_id2_count = 0;

   // The fontinfos and tesseract count as non-pointers as they point to
   // data owned elsewhere.
   fontinfo = nullptr;
   fontinfo2 = nullptr;
   tesseract = nullptr;

   // Size metrics; the heights start out flagged as guessed.
   x_height = 0.0;
   caps_height = 0.0;
   guessed_x_ht = true;
   guessed_caps_ht = true;
   baseline_shift = 0.0f;
   space_certainty = 0.0f;
 }

 // Sets every pointer member to nullptr. Nothing is freed here; use Clear()
 // or ClearResults() to release data first.
 void WERD_RES::InitPointers() {
   // Source word, row and character set.
   word = nullptr;
   blob_row = nullptr;
   uch_set = nullptr;

   // Blob-level representations of the word.
   bln_boxes = nullptr;
   chopped_word = nullptr;
   rebuild_word = nullptr;
   box_word = nullptr;

   // Classifier output and word choices.
   ratings = nullptr;
   best_choice = nullptr;
   raw_choice = nullptr;
   ep_choice = nullptr;

   blamer_bundle = nullptr;
 }

 // Releases everything held by this WERD_RES: the word (deleted only when
 // this is a combination), the blamer bundle, and all recognition results.
 void WERD_RES::Clear() {
   // Only a combination deletes its word here.
   if (combination) delete word;
   word = nullptr;

   // The bundle pointer is nulled before ClearResults(), which checks it.
   delete blamer_bundle;
   blamer_bundle = nullptr;

   ClearResults();
 }

 // Discards all recognition results while leaving the word pointer and the
 // blamer bundle object itself in place (the bundle's results are cleared
 // if a bundle exists). Owned objects are deleted and their pointers reset
 // to nullptr; the font info pointers and blob_row are only reset, never
 // deleted.
 void WERD_RES::ClearResults() {
   done = false;
   // Font info points to data owned elsewhere, so just forget it.
   fontinfo = nullptr;
   fontinfo2 = nullptr;
   fontinfo_id_count = 0;
   fontinfo_id2_count = 0;
   delete bln_boxes;
   bln_boxes = nullptr;
   blob_row = nullptr;
   delete chopped_word;
   chopped_word = nullptr;
   delete rebuild_word;
   rebuild_word = nullptr;
   delete box_word;
   box_word = nullptr;
   best_state.clear();
   correct_text.clear();
   // Free the seams themselves before emptying the vector of pointers.
   seam_array.delete_data_pointers();
   seam_array.clear();
   blob_widths.clear();
   blob_gaps.clear();
   ClearRatings();
   ClearWordChoices();
   if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
 }
 // Discards all word choices: empties the best_choices list, forgets
 // best_choice (which is not deleted directly), and deletes the raw and
 // ep choices.
 void WERD_RES::ClearWordChoices() {
   // best_choice is only reset, never deleted here.
   best_choice = nullptr;
   best_choices.clear();

   delete raw_choice;
   raw_choice = nullptr;

   delete ep_choice;
   ep_choice = nullptr;
 }
 // Deletes the ratings matrix together with the data its cells point to.
 // Does nothing if there is no matrix.
 void WERD_RES::ClearRatings() {
   if (ratings == nullptr) {
     return;
   }
   ratings->delete_matrix_pointers();
   delete ratings;
   ratings = nullptr;
 }


 // Two iterators are equal when they point at the same word, row and block.
 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
   if (word_res != other.word_res) return false;
   if (row_res != other.row_res) return false;
   return block_res == other.block_res;
 }

 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
   ASSERT_HOST(page_res == other.page_res);
   if (other.block_res == nullptr) {
     // other points to the end of the page.
     if (block_res == nullptr)
       return 0;
     return -1;
   }
   if (block_res == nullptr) {
     return 1; // we point to the end of the page.
   }
   if (block_res == other.block_res) {
     if (other.row_res == nullptr || row_res == nullptr) {
       // this should only happen if we hit an image block.
       return 0;
     }
     if (row_res == other.row_res) {
       // we point to the same block and row.
       ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
       if (word_res == other.word_res) {
         // we point to the same word!
         return 0;
       }

       WERD_RES_IT word_res_it(&row_res->word_res_list);
       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
            word_res_it.forward()) {
         if (word_res_it.data() == word_res) {
           return -1;
         } else if (word_res_it.data() == other.word_res) {
           return 1;
         }
       }
       ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
     }

     // we both point to the same block, but different rows.
     ROW_RES_IT row_res_it(&block_res->row_res_list);
     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
          row_res_it.forward()) {
       if (row_res_it.data() == row_res) {
         return -1;
       } else if (row_res_it.data() == other.row_res) {
         return 1;
       }
     }
     ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
   }

   // We point to different blocks.
   BLOCK_RES_IT block_res_it(&page_res->block_res_list);
   for (block_res_it.mark_cycle_pt();
        !block_res_it.cycled_list(); block_res_it.forward()) {
     if (block_res_it.data() == block_res) {
       return -1;
     } else if (block_res_it.data() == other.block_res) {
       return 1;
     }
   }
   // Shouldn't happen...
   ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
   return 0;
 }

 // Wraps new_word in a new WERD_RES marked as a combination (so the WERD_RES
 // owns the word) and inserts it immediately before the current word in the
 // current row. The simple fields are copied from clone_res. The new
 // WERD_RES is returned for further setup with best_choice etc.
 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
                                              WERD* new_word) {
   // Make a WERD_RES for the new_word.
   WERD_RES* inserted = new WERD_RES(new_word);
   inserted->CopySimpleFields(clone_res);
   inserted->combination = true;
   // Locate the current word in the ROW_RES; it must be there.
   WERD_RES_IT wr_it(&row()->word_res_list);
   wr_it.mark_cycle_pt();
   while (!wr_it.cycled_list() && wr_it.data() != word_res) {
     wr_it.forward();
   }
   ASSERT_HOST(!wr_it.cycled_list());
   wr_it.add_before_then_move(inserted);
   if (wr_it.at_first()) {
     // This is the new first word, so reset the member iterator so it
     // detects the cycled_list state correctly.
     ResetWordIterator();
   }
   return inserted;
 }

 // Helper computes the boundaries between blobs in the word. The blob bounds
 // are likely very poor, if they come from LSTM, where it only outputs the
 // character at one pixel within it, so we find the midpoints between them.
 static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
                             GenericVector<int>* blob_ends) {
   C_BLOB_IT blob_it(word.word->cblob_list());
   for (int i = 0; i < word.best_state.size(); ++i) {
     int length = word.best_state[i];
     // Get the bounding box of the fake blobs
     TBOX blob_box = blob_it.data()->bounding_box();
     blob_it.forward();
     for (int b = 1; b < length; ++b) {
       blob_box += blob_it.data()->bounding_box();
       blob_it.forward();
     }
     // This blob_box is crap, so for now we are only looking for the
     // boundaries between them.
     int blob_end = INT32_MAX;
     if (!blob_it.at_first() || next_word_blobs != nullptr) {
       if (blob_it.at_first())
         blob_it.set_to_list(next_word_blobs);
       blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
     }
     blob_ends->push_back(blob_end);
   }
 }

 // Replaces the current WERD/WERD_RES with the given words. The given words
 // contain fake blobs that indicate the position of the characters. These are
 // replaced with real blobs from the current word as much as possible.
 // If words is empty, the current word is simply deleted.
 // On return, words has been emptied (this function takes ownership of its
 // elements), the current word has been deleted, and the word iterator has
 // been reset via ResetWordIterator().
 void PAGE_RES_IT::ReplaceCurrentWord(
     tesseract::PointerVector<WERD_RES>* words) {
   if (words->empty()) {
     DeleteCurrentWord();
     return;
   }
   WERD_RES* input_word = word();
   // Set the BOL/EOL flags on the words from the input word.
   if (input_word->word->flag(W_BOL)) {
     (*words)[0]->word->set_flag(W_BOL, true);
   } else {
     (*words)[0]->word->set_blanks(input_word->word->space());
   }
   words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));

   // Move the blobs from the input word to the new set of words.
   // If the input word_res is a combination, then the replacements will also be
   // combinations, and will own their own words. If the input word_res is not a
   // combination, then the final replacements will not be either, (although it
   // is allowed for the input words to be combinations) and their words
   // will get put on the row list. This maintains the ownership rules.
   WERD_IT w_it(row()->row->word_list());
   if (!input_word->combination) {
     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
       WERD* word = w_it.data();
       if (word == input_word->word)
         break;
     }
     // w_it is now set to the input_word's word.
     ASSERT_HOST(!w_it.cycled_list());
   }
   // Insert into the appropriate place in the ROW_RES.
   WERD_RES_IT wr_it(&row()->word_res_list);
   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
     WERD_RES* word = wr_it.data();
     if (word == input_word)
       break;
   }
   ASSERT_HOST(!wr_it.cycled_list());
   // Since we only have an estimate of the bounds between blobs, use the blob
   // x-middle as the determiner of where to put the blobs
   C_BLOB_IT src_b_it(input_word->word->cblob_list());
   src_b_it.sort(&C_BLOB::SortByXMiddle);
   C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
   rej_b_it.sort(&C_BLOB::SortByXMiddle);
   for (int w = 0; w < words->size(); ++w) {
     WERD_RES* word_w = (*words)[w];
     // Compute blob boundaries.
     GenericVector<int> blob_ends;
     C_BLOB_LIST* next_word_blobs =
         w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
     ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
     // Delete the fake blobs on the current word.
     word_w->word->cblob_list()->clear();
     C_BLOB_IT dest_it(word_w->word->cblob_list());
     // Build the box word as we move the blobs.
     // (This box_word is a local; it is handed to word_w further down.)
     tesseract::BoxWord* box_word = new tesseract::BoxWord;
     for (int i = 0; i < blob_ends.size(); ++i) {
       int end_x = blob_ends[i];
       TBOX blob_box;
       // Add the blobs up to end_x.
       while (!src_b_it.empty() &&
              src_b_it.data()->bounding_box().x_middle() < end_x) {
         blob_box += src_b_it.data()->bounding_box();
         dest_it.add_after_then_move(src_b_it.extract());
         src_b_it.forward();
       }
       // Rejected blobs of the input word are moved across in the same way.
       while (!rej_b_it.empty() &&
              rej_b_it.data()->bounding_box().x_middle() < end_x) {
         blob_box += rej_b_it.data()->bounding_box();
         dest_it.add_after_then_move(rej_b_it.extract());
         rej_b_it.forward();
       }
       // Clip to the previously computed bounds. Although imperfectly accurate,
       // it is good enough, and much more complicated to determine where else
       // to clip.
       if (i > 0 && blob_box.left() < blob_ends[i - 1])
         blob_box.set_left(blob_ends[i - 1]);
       if (blob_box.right() > end_x)
         blob_box.set_right(end_x);
       box_word->InsertBox(i, blob_box);
     }
     // Fix empty boxes. If a very joined blob sits over multiple characters,
     // then we will have some empty boxes from using the middle, so look for
     // overlaps.
     for (int i = 0; i < box_word->length(); ++i) {
       TBOX box = box_word->BlobBox(i);
       if (box.null_box()) {
         // Nothing has its middle in the bounds of this blob, so use anything
         // that overlaps.
         for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
              dest_it.forward()) {
           TBOX blob_box = dest_it.data()->bounding_box();
           if (blob_box.left() < blob_ends[i] &&
               (i == 0 || blob_box.right() >= blob_ends[i - 1])) {
             if (i > 0 && blob_box.left() < blob_ends[i - 1])
               blob_box.set_left(blob_ends[i - 1]);
             if (blob_box.right() > blob_ends[i])
               blob_box.set_right(blob_ends[i]);
             box_word->ChangeBox(i, blob_box);
             // The first overlapping blob found is used.
             break;
           }
         }
       }
     }
     delete word_w->box_word;
     word_w->box_word = box_word;
     if (!input_word->combination) {
       // Insert word_w->word into the ROW. It doesn't own its word, so the
       // ROW needs to own it.
       w_it.add_before_stay_put(word_w->word);
       word_w->combination = false;
     }
     (*words)[w] = nullptr;  // We are taking ownership.
     wr_it.add_before_stay_put(word_w);
   }
   // We have taken ownership of the words.
   words->clear();
   // Delete the current word, which has been replaced. We could just call
   // DeleteCurrentWord, but that would iterate both lists again, and we know
   // we are already in the right place.
   if (!input_word->combination)
     delete w_it.extract();
   delete wr_it.extract();
   ResetWordIterator();
 }

 // Deletes the current WERD_RES and, unless it is a combination, its
 // underlying WERD from the row's word list, then resets the word iterator.
 void PAGE_RES_IT::DeleteCurrentWord() {
   // Check that this word is as we expect. part_of_combos are NEVER iterated
   // by the normal iterator, so we should never be trying to delete them.
   ASSERT_HOST(!word_res->part_of_combo);
   if (!word_res->combination) {
     // Combinations own their own word, so we won't find the word on the
     // row's word_list, but it is legitimate to try to delete them.
     // For everything else the word must be on the ROW's list; remove it.
     WERD_IT w_it(row()->row->word_list());
     w_it.mark_cycle_pt();
     while (!w_it.cycled_list() && w_it.data() != word_res->word) {
       w_it.forward();
     }
     ASSERT_HOST(!w_it.cycled_list());
     delete w_it.extract();
   }
   // Remove the WERD_RES of the current word from the ROW_RES.
   WERD_RES_IT wr_it(&row()->word_res_list);
   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
     if (wr_it.data() != word_res) continue;
     // Found it: forget the pointer before the object is deleted below.
     word_res = nullptr;
     break;
   }
   ASSERT_HOST(!wr_it.cycled_list());
   delete wr_it.extract();
   ResetWordIterator();
 }

 // Makes the current word a fuzzy space if not already fuzzy (neither
 // W_FUZZY_SP nor W_FUZZY_NON set). If the current word is a combination,
 // the word following it in the row, which must be its part_of_combo, is
 // made fuzzy as well.
 void PAGE_RES_IT::MakeCurrentWordFuzzy() {
   WERD* real_word = word_res->word;
   if (real_word->flag(W_FUZZY_SP) || real_word->flag(W_FUZZY_NON)) {
     return;  // Already fuzzy.
   }
   real_word->set_flag(W_FUZZY_SP, true);
   if (!word_res->combination) {
     return;
   }
   // The next word should be the corresponding part of combo, but we have
   // already stepped past it, so find it by search.
   WERD_RES_IT wr_it(&row()->word_res_list);
   wr_it.mark_cycle_pt();
   while (!wr_it.cycled_list() && wr_it.data() != word_res) {
     wr_it.forward();
   }
   wr_it.forward();
   ASSERT_HOST(wr_it.data()->part_of_combo);
   real_word = wr_it.data()->word;
   ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
               !real_word->flag(W_FUZZY_NON));
   real_word->set_flag(W_FUZZY_SP, true);
 }

 /*************************************************************************
  * PAGE_RES_IT::start_page
  *
  * Set things up at the start of the page: all previous/current/next
  * pointers are cleared and the iterator is stepped forward twice, first to
  * load the "next" word and then to make it the current word.
  * Returns the first word, as returned by internal_forward.
  *************************************************************************/

 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
   block_res_it.set_to_list(&page_res->block_res_list);
   block_res_it.mark_cycle_pt();

   // Nothing has been visited yet.
   prev_block_res = nullptr;
   block_res = nullptr;
   next_block_res = nullptr;

   prev_row_res = nullptr;
   row_res = nullptr;
   next_row_res = nullptr;

   prev_word_res = nullptr;
   word_res = nullptr;
   next_word_res = nullptr;

   // The first step starts a new block and fills in the next_* pointers;
   // the second shifts them into the current position.
   internal_forward(true, empty_ok);
   return internal_forward(false, empty_ok);
 }

 // Recovers from operations on the current word, such as in InsertCloneWord
 // and DeleteCurrentWord.
 // Resets the word_res_it so that it is one past the next_word_res, as
 // it should be after internal_forward. If next_row_res != row_res,
 // then the next_word_res is in the next row, so there is no need to do
 // anything to word_res_it, but it is still a good idea to reset the pointers
 // word_res and prev_word_res, which are still in the current row.
 void PAGE_RES_IT::ResetWordIterator() {
   if (row_res == next_row_res) {
     // Reset the member iterator so it can move forward and detect the
     // cycled_list state correctly.
     word_res_it.move_to_first();
     for (word_res_it.mark_cycle_pt();
          !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
          word_res_it.forward()) {
       if (!word_res_it.data()->part_of_combo) {
         if (prev_row_res == row_res) prev_word_res = word_res;
         word_res = word_res_it.data();
       }
     }
     ASSERT_HOST(!word_res_it.cycled_list());
     word_res_it.forward();
   } else {
     // word_res_it is OK, but reset word_res and prev_word_res if needed.
     WERD_RES_IT wr_it(&row_res->word_res_list);
     for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
       if (!wr_it.data()->part_of_combo) {
         if (prev_row_res == row_res) prev_word_res = word_res;
         word_res = wr_it.data();
       }
     }
   }
 }

 /*************************************************************************
  * PAGE_RES_IT::internal_forward
  *
  * Find the next word on the page. If empty_ok is true, then non-text blocks
  * and text blocks with no text are visited as if they contain a single
  * imaginary word in a single imaginary row. (word() and row() both return nullptr
  * in such a block and the return value is nullptr.)
  * If empty_ok is false, the old behaviour is maintained. Each real word
  * is visited and empty and non-text blocks and rows are skipped.
  * new_block is used to initialize the iterators for a new block.
  * The iterator maintains pointers to block, row and word for the previous,
  * current and next words.  These are correct, regardless of block/row
  * boundaries. nullptr values denote start and end of the page.
  *
  * Returns the new current word (word_res), which is the word that was
  * "next" before the call.
  *************************************************************************/

 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
   bool new_row = false;

   // Shift the window: current becomes previous, next becomes current.
   prev_block_res = block_res;
   prev_row_res = row_res;
   prev_word_res = word_res;
   block_res = next_block_res;
   row_res = next_row_res;
   word_res = next_word_res;
   // The next_* pointers stay nullptr if the search below finds nothing.
   next_block_res = nullptr;
   next_row_res = nullptr;
   next_word_res = nullptr;

   while (!block_res_it.cycled_list()) {
     if (new_block) {
       new_block = false;
       row_res_it.set_to_list(&block_res_it.data()->row_res_list);
       row_res_it.mark_cycle_pt();
       if (row_res_it.empty() && empty_ok) {
         // A block with no rows is reported as a block with no row or word.
         next_block_res = block_res_it.data();
         break;
       }
       new_row = true;
     }
     while (!row_res_it.cycled_list()) {
       if (new_row) {
         new_row = false;
         word_res_it.set_to_list(&row_res_it.data()->word_res_list);
         word_res_it.mark_cycle_pt();
       }
       // Skip any part_of_combo words.
       while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
         word_res_it.forward();
       if (!word_res_it.cycled_list()) {
         next_block_res = block_res_it.data();
         next_row_res = row_res_it.data();
         next_word_res = word_res_it.data();
         // Leave word_res_it one past the word just recorded as next.
         word_res_it.forward();
         goto foundword;
       }
       // end of row reached
       row_res_it.forward();
       new_row = true;
     }
     // end of block reached
     block_res_it.forward();
     new_block = true;
   }
   foundword:
   // Update prev_word_best_choice pointer.
   // new_block is false whenever a word was found via the goto above, so the
   // nullptr case applies when the search ended without finding a word (or
   // never ran) with new_block still set, or when there is no previous word.
   if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
     *page_res->prev_word_best_choice =
       (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
   }
   return word_res;
 }

 /*************************************************************************
  * PAGE_RES_IT::restart_row()
  *
  * Move to the beginning (leftmost word) of the current row by restarting
  * the page and stepping forward until the current row is reached again.
  * Returns nullptr without moving if there is no current row.
  *************************************************************************/
 WERD_RES *PAGE_RES_IT::restart_row() {
   ROW_RES *target_row = this->row();
   if (target_row == nullptr) return nullptr;
   restart_page();
   while (this->row() != target_row) {
     forward();
   }
   return word();
 }

 /*************************************************************************
  * PAGE_RES_IT::forward_paragraph
  *
  * Move to the beginning of the next paragraph, allowing empty blocks.
  * Words are skipped while the next word is in the same block as the
  * current one and its row belongs to the same paragraph as the current
  * row; one final step then moves onto the first word that differs.
  *************************************************************************/

 WERD_RES *PAGE_RES_IT::forward_paragraph() {
   for (;;) {
     if (block_res != next_block_res) break;
     if (next_row_res == nullptr || next_row_res->row == nullptr) break;
     if (!(row_res->row->para() == next_row_res->row->para())) break;
     internal_forward(false, true);
   }
   return internal_forward(false, true);
 }

 /*************************************************************************
  * PAGE_RES_IT::forward_block
  *
  * Move to the beginning of the next block, allowing empty blocks.
  * Words are skipped while the next word is still in the current block;
  * one final step then moves onto the first word of the following block.
  * If the iterator is already past the end of the page (or the page is
  * empty), nothing is skipped and the result of a single forward step,
  * nullptr, is returned.
  *************************************************************************/

 WERD_RES *PAGE_RES_IT::forward_block() {
   // When block_res is nullptr the iterator is off the page and
   // next_block_res is nullptr too; internal_forward keeps both that way, so
   // without the nullptr test this loop would never terminate.
   while (block_res != nullptr && block_res == next_block_res) {
     internal_forward(false, true);
   }
   return internal_forward(false, true);
 }

 // Adds the current word's character and reject counts (taken from its
 // reject_map) to the page, block and row totals. If every character of the
 // word is rejected, the row's whole_word_rej_count grows by that count too.
 void PAGE_RES_IT::rej_stat_word() {
   const int16_t chars_in_word = word_res->reject_map.length ();
   page_res->char_count += chars_in_word;
   block_res->char_count += chars_in_word;
   row_res->char_count += chars_in_word;

   const int16_t rejects_in_word = word_res->reject_map.reject_count ();
   page_res->rej_count += rejects_in_word;
   block_res->rej_count += rejects_in_word;
   row_res->rej_count += rejects_in_word;

   const bool whole_word_rejected = chars_in_word == rejects_in_word;
   if (whole_word_rejected) {
     row_res->whole_word_rej_count += rejects_in_word;
   }
 }
TWERD::BLNormalize
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:800

WERD_RES::DebugTopChoice
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:505

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359

PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1322

BLOCK_RES::BLOCK_RES
BLOCK_RES()=default

WERD_RES::PiecesAllNatural
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1084

WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243

ROW_RES::word_res_list
WERD_RES_LIST word_res_list
Definition: pageres.h:147

WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:771

W_INVERSE
Definition: werd.h:43

WERD_RES::BothHyphens
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1036

kMaxWordGapRatio
const double kMaxWordGapRatio
Definition: pageres.cpp:62

WERD_RES::space_certainty
float space_certainty
Definition: pageres.h:316

WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:288

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260

WERD_RES::MergeAdjacentBlobs
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:980

BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:83

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:35

WERD_CHOICE::remove_unichar_id
void remove_unichar_id(int index)
Definition: ratngs.h:484

WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:308

GenericVector::size
int size() const
Definition: genericvector.h:71

GENERIC_2D_ARRAY::delete_matrix_pointers
void delete_matrix_pointers()
Definition: matrix.h:455

PAGE_RES_IT::start_page
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1510

TRUE
#define TRUE
Definition: capi.h:51

pdblock.h

TWERD
Definition: blobs.h:402

BLOCK_RES::x_height
float x_height
Definition: pageres.h:122

tesseract::PointerVector::clear
void clear()
Definition: genericvector.h:529

PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:80

WERD_RES::FakeWordFromRatings
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:904

kMaxLineSizeRatio
const double kMaxLineSizeRatio
Definition: pageres.cpp:60

BLOB_CHOICE
Definition: ratngs.h:49

MATRIX_COORD
Definition: matrix.h:605

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:754

pageres.h

WERD_RES::SetupBlobWidthsAndGaps
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:406

BlamerBundle::CopyResults
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:206

WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198

PAGE_RES_IT::forward_paragraph
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1652

ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:146

WERD_RES::guessed_caps_ht
bool guessed_caps_ht
Definition: pageres.h:309

WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:302

start_seam_list
void start_seam_list(TWERD *word, GenericVector< SEAM *> *seam_array)
Definition: seam.cpp:269

errcode.h

WERD_CHOICE::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:626

WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:219

TBOX::null_box
bool null_box() const
Definition: rect.h:50

blobs.h

kWordrecMaxNumJoinChunks
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:55

GenericVector::move
void move(GenericVector< T > *from)
Definition: genericvector.h:1050

host.h

WERD_RES::ComputeAdaptionThresholds
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:567

WERD_RES::italic
int8_t italic
Definition: pageres.h:301

WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:746

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:287

tesseract::BoxWord::ClipToOriginalWord
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92

WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452

MATRIX::ConsumeAndMakeBigger
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:63

W_FUZZY_NON
Definition: werd.h:42

STRING::string
const char * string() const
Definition: strngs.cpp:196

count
int count(LIST var_list)
Definition: oldlist.cpp:98

WERD_CHOICE::print
void print() const
Definition: ratngs.h:580

WERD_RES::CopySimpleFields
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:255

WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:319

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:159

WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:765

SEAM
Definition: seam.h:44

WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:424

tesseract::BoxWord::MergeBoxes
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131

GenericVector::remove
void remove(int index)
Definition: genericvector.h:766

WERD_RES::HyphenBoxesOverlap
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1047

WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1053

tesseract::BoxWord
Definition: boxword.h:37

BLOCK_RES::font_assigned
bool font_assigned
Definition: pageres.h:123

BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:118

BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:118

DENORM::block
const BLOCK * block() const
Definition: normalis.h:273

BLOCK_RES::row_res_list
ROW_RES_LIST row_res_list
Definition: pageres.h:128

TBOX
Definition: rect.h:34

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:432

ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:55

SEAM::JoinPieces
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216

SEAM::BreakPieces
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194

WERD_RES::Clear
void Clear()
Definition: pageres.cpp:1143

REJMAP::length
int32_t length() const
Definition: rejctmap.h:223

W_EOL
Definition: werd.h:35

GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:684

UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:880

BLOCK_RES::bold
bool bold
Definition: pageres.h:125

WERD_RES::odd_size
bool odd_size
Definition: pageres.h:300

WERD_RES::AlternativeChoiceAdjustmentsWorseThan
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:445

tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:262

WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1073

ROW_RES::ROW_RES
ROW_RES()=default

WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:304

WERD_CHOICE::UpdateStateForSplit
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:702

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209

PAGE_RES_IT::operator==
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1195

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:327

NewPermanentTessCallback
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:330

WERD_RES
Definition: pageres.h:169

WERD_RES::BothSpaces
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1065

WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127

UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:898

tesseract::PointerVector< WERD_RES >

BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:199

WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93

ROW::body_size
float body_size() const
Definition: ocrrow.h:73

WERD_RES::InitNonPointers
void InitNonPointers()
Definition: pageres.cpp:1100

WERD::space
uint8_t space()
Definition: werd.h:102

PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81

WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:313

blamer.h

UNICHARSET
Definition: unicharset.h:146

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:84

GenericVector::back
T & back() const
Definition: genericvector.h:730

WERD_RES::GetBlobsWidth
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:736

MATRIX_COORD::Valid
bool Valid(const MATRIX &m) const
Definition: matrix.h:615

PAGE_RES_IT::forward_block
WERD_RES * forward_block()
Definition: pageres.cpp:1667

WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:864

WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:307

WERD_RES::small_caps
bool small_caps
Definition: pageres.h:299

PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1533

tprintf.h

WERD_RES::correct_text
GenericVector< STRING > correct_text
Definition: pageres.h:275

WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:458

WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:363

SEAM::PrepareToInsertSeam
bool PrepareToInsertSeam(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int insert_index, bool modify)
Definition: seam.cpp:82

MATRIX::IncreaseBandSize
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54

tesscallback.h

TBOX::set_right
void set_right(int x)
Definition: rect.h:82

MATRIX_COORD::row
int row
Definition: matrix.h:634

tesseract::OEM_LSTM_ONLY
Definition: publictypes.h:270

GenericVector::clear
void clear()
Definition: genericvector.h:868

TBOX::width
int16_t width() const
Definition: rect.h:115

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:698

WERD_RES::FilterWordChoices
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:519

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:117

WERD_RES::SetupBlamerBundle
void SetupBlamerBundle()
Definition: pageres.cpp:399

WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:855

WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:297

TBOX::left
int16_t left() const
Definition: rect.h:72

PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1674

ROW::ascenders
float ascenders() const
Definition: ocrrow.h:82

WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126

PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1637

WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:486

GenericVector::insert
void insert(const T &t, int index)
Definition: genericvector.h:752

W_FUZZY_SP
Definition: werd.h:41

WERD_RES::denorm
DENORM denorm
Definition: pageres.h:204

GenericVector< int >

WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310

WERD_RES::operator=
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:192

BlamerBundle::ClearResults
void ClearResults()
Definition: blamer.h:185

POLY_BLOCK
Definition: polyblk.h:27

ROW::x_height
float x_height() const
Definition: ocrrow.h:64

BlamerBundle
Definition: blamer.h:100

WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1024

REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:229

WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:723

UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670

BLOB_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145

WERD_CHOICE
Definition: ratngs.h:273

GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:708

PAGE_RES::prev_word_best_choice
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:85

FALSE
#define FALSE
Definition: capi.h:52

WERD_RES::bold
int8_t bold
Definition: pageres.h:302

WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:883

W_SCRIPT_HAS_XHEIGHT
Definition: werd.h:37

TessResultCallback2::Run
virtual R Run(A1, A2)=0

ELIST_LINK::operator=
void operator=(const ELIST_LINK &)
Definition: elst.h:101

CR_NONE
Definition: pageres.h:161

WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:610

PAGE_RES::Init
void Init()
Definition: pageres.h:94

WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:296

TOP_CHOICE_PERM
Definition: ratngs.h:245

GenericVector::length
int length() const
Definition: genericvector.h:85

WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:751

WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:271

WERD_RES::ConditionalBlobMerge
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX &> *box_cb)
Definition: pageres.cpp:944

WERD_RES::InitPointers
void InitPointers()
Definition: pageres.cpp:1128

WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308

PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56

BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:533

ELISTIZE
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961

WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:929

ocrrow.h

WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:841

WERD
Definition: werd.h:59

tesseract::BoxWord::CopyFromNormalized
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56

ROW_RES
Definition: pageres.h:141

UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873

WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:306

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

ocrblock.h

ROW
Definition: ocrrow.h:36

GenericVector::empty
bool empty() const
Definition: genericvector.h:90

tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:478

WERD_CHOICE::adjust_factor
float adjust_factor() const
Definition: ratngs.h:306

W_BOL
Definition: werd.h:34

PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:79

GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220

POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:49

BLOCK
Definition: ocrblock.h:30

WERD_CHOICE::length
int length() const
Definition: ratngs.h:303

PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:677

WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:98

WERD_RES::caps_height
float caps_height
Definition: pageres.h:312

stepblob.h

WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:217

WERD::set_script_id
void set_script_id(int id)
Definition: werd.h:111

tesseract
Definition: baseapi.cpp:94

PAGE_RES_IT::cmp
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1201

WERD_RES::blob_gaps
GenericVector< int > blob_gaps
Definition: pageres.h:222

GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799

UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:888

tesseract::Tesseract
Definition: tesseractclass.h:173

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449

tesseract::OcrEngineMode
OcrEngineMode
Definition: publictypes.h:268

STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:379

WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358

WERD_RES::done
bool done
Definition: pageres.h:298

PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483

ROW::descenders
float descenders() const
Definition: ocrrow.h:85

TBOX::set_left
void set_left(int x)
Definition: rect.h:75

WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:336

PAGE_RES::PAGE_RES
PAGE_RES()
Definition: pageres.h:102

WERD_RES::SetupWordScript
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:390

PAGE_RES_IT
Definition: pageres.h:675

WERD_RES::combination
bool combination
Definition: pageres.h:334

BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:80

WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:282

WERD_RES::x_height
float x_height
Definition: pageres.h:311

kMaxWordSizeRatio
const double kMaxWordSizeRatio
Definition: pageres.cpp:58

WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1153

WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:349

STRING
Definition: strngs.h:45

C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125

BLOCK_RES
Definition: pageres.h:115

W_SCRIPT_IS_LATIN
Definition: werd.h:38

PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1450

polyblk.h

BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:119

PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1269

TWERD::MergeBlobs
void MergeBlobs(int start, int end)
Definition: blobs.cpp:882

WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231

W_REP_CHAR
Definition: werd.h:40

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:206

UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290

WERD_RES::ep_choice
WERD_CHOICE * ep_choice
Definition: pageres.h:286

GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:884

BLOCK_RES::italic
bool italic
Definition: pageres.h:126

WERD_RES::ClearRatings
void ClearRatings()
Definition: pageres.cpp:1186

tesseract::BoxWord::length
int length() const
Definition: boxword.h:83

WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246

WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:283

TessResultCallback2
Definition: blamer.h:43

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:541

BLOCK_RES::font_class
int16_t font_class
Definition: pageres.h:120

WERD_RES::ClearWordChoices
void ClearWordChoices()
Definition: pageres.cpp:1178

WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:335

TBOX::right
int16_t right() const
Definition: rect.h:79

GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:136

WERD_RES::blob_row
ROW * blob_row
Definition: pageres.h:200

WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:814

FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:731

WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:550

CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain

ROW_RES::char_count
int32_t char_count
Definition: pageres.h:144

MATRIX
Definition: matrix.h:575

TBLOB
Definition: blobs.h:268

BLOCK_RES::row_count
int16_t row_count
Definition: pageres.h:121

WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240

TWERD::PolygonalCopy
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:786

WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:871

TWERD::ComputeBoundingBoxes
void ComputeBoundingBoxes()
Definition: blobs.cpp:865

WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:801

WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:215

publictypes.h

WERD_RES::~WERD_RES
~WERD_RES()
Definition: pageres.cpp:1096

SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:67

ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:145

WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:464

WERD_RES::BothQuotes
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1014

BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77

BLOCK::pdblk
PDBLK pdblk
Definition: ocrblock.h:192

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235

MATRIX_COORD::col
int col
Definition: matrix.h:633

BlamerBundle::SetupNormTruthWord
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:150

TBOX::height
int16_t height() const
Definition: rect.h:108

WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:626

PermuterType
PermuterType
Definition: ratngs.h:242

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266

tesseract::BoxWord::InsertBox
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:148

GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:228

UNICHAR_SPACE
Definition: unicharset.h:35

WERD_CHOICE::blob_choices
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:290

WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:756

REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:275

seam.h

boxword.h

WERD_CHOICE::TotalOfStates
int TotalOfStates() const
Definition: ratngs.cpp:714

REJMAP::remove_pos
void remove_pos(int16_t pos)
Definition: rejctmap.cpp:311

ROW_RES::row
ROW * row
Definition: pageres.h:143

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:305

ROW::para
PARA * para() const
Definition: ocrrow.h:118

WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

WERD_RES::word
WERD * word
Definition: pageres.h:189