tessapi/3.x/a01248_source.html

 // File:        language_model.cpp

 // Description: Functions that utilize the knowledge about the properties,

 //              structure and statistics of the language to help recognition.

 // Author:      Daria Antonova

 // Created:     Mon Nov 11 11:26:43 PST 2009

 //

 // (C) Copyright 2009, Google Inc.

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 // http://www.apache.org/licenses/LICENSE-2.0

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.

 //


 #include <math.h>


 #include "language_model.h"


 #include "dawg.h"

 #include "freelist.h"

 #include "intproto.h"

 #include "helpers.h"

 #include "lm_state.h"

 #include "lm_pain_points.h"

 #include "matrix.h"

 #include "params.h"

 #include "params_training_featdef.h"


 // Fallback definition of log2() for C runtimes whose <math.h> lacks it.
 // Visual Studio 2015 (_MSC_VER 1900) and later ship log2() in <math.h>,
 // so the fallback is limited to older MSVC toolchains; defining it again on
 // newer ones clashes with the declaration provided by the runtime.
 #if (defined(_MSC_VER) && _MSC_VER < 1900) || defined(ANDROID)
 // Returns the base-2 logarithm of n, computed as log(n) / log(2).
 double log2(double n) {
   return log(n) / log(2.0);
 }
 #endif  // (_MSC_VER && _MSC_VER < 1900) || ANDROID


 namespace tesseract {


 // Out-of-class definition of the static class constant.
 // NOTE(review): judging by the name this is presumably an upper bound on the
 // average ngram cost; it is not referenced in this part of the file - confirm
 // against its users.
 const float LanguageModel::kMaxAvgNgramCost = 25.0f;


 // Constructs the language model.
 // Every tunable below is declared through one of the *_MEMBER / *_INIT_MEMBER
 // macros, each of which receives a default value, a description string and
 // dict->getCCUtil()->params(). After the tunables, the plain members are
 // initialised and the dawg bookkeeping objects are allocated (they are
 // released again in the destructor).
 //   fontinfo_table: stored in fontinfo_table_.
 //   dict: stored in dict_; asserted to be non-NULL in the body.
 // NOTE(review): dict is already dereferenced in the initializer list, i.e.
 // before the ASSERT_HOST in the body runs - confirm whether an earlier check
 // is wanted.
 LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,

                              Dict *dict)

   : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",

                dict->getCCUtil()->params()),

     BOOL_INIT_MEMBER(language_model_ngram_on, false,

                      "Turn on/off the use of character ngram model",

                      dict->getCCUtil()->params()),

     INT_MEMBER(language_model_ngram_order, 8,

                "Maximum order of the character ngram model",

                dict->getCCUtil()->params()),

     INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,

                "Maximum number of prunable (those for which"

                " PrunablePath() is true) entries in each viterbi list"

                " recorded in BLOB_CHOICEs",

                dict->getCCUtil()->params()),

     INT_MEMBER(language_model_viterbi_list_max_size, 500,

                "Maximum size of viterbi lists recorded in BLOB_CHOICEs",

                dict->getCCUtil()->params()),

     double_MEMBER(language_model_ngram_small_prob, 0.000001,

                   "To avoid overly small denominators use this as the "

                   "floor of the probability returned by the ngram model.",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_ngram_nonmatch_score, -40.0,

                   "Average classifier score of a non-matching unichar.",

                   dict->getCCUtil()->params()),

     BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,

                 "Use only the first UTF8 step of the given string"

                 " when computing log probabilities.",

                 dict->getCCUtil()->params()),

     double_MEMBER(language_model_ngram_scale_factor, 0.03,

                   "Strength of the character ngram model relative to the"

                   " character classifier ",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_ngram_rating_factor, 16.0,

                   "Factor to bring log-probs into the same range as ratings"

                   " when multiplied by outline length ",

                   dict->getCCUtil()->params()),

     BOOL_MEMBER(language_model_ngram_space_delimited_language, true,

                 "Words are delimited by space",

                 dict->getCCUtil()->params()),

     INT_MEMBER(language_model_min_compound_length, 3,

                "Minimum length of compound words",

                dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,

                   "Penalty for words not in the frequent word dictionary",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_non_dict_word, 0.15,

                   "Penalty for non-dictionary words",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_punc, 0.2,

                   "Penalty for inconsistent punctuation",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_case, 0.1,

                   "Penalty for inconsistent case",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_script, 0.5,

                   "Penalty for inconsistent script",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_chartype, 0.3,

                   "Penalty for inconsistent character type",

                   dict->getCCUtil()->params()),

     // TODO(daria, rays): enable font consistency checking

     // after improving font analysis.

     double_MEMBER(language_model_penalty_font, 0.00,

                   "Penalty for inconsistent font",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_spacing, 0.05,

                   "Penalty for inconsistent spacing",

                   dict->getCCUtil()->params()),

     double_MEMBER(language_model_penalty_increment, 0.01,

                   "Penalty increment",

                   dict->getCCUtil()->params()),

     INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",

                dict->getCCUtil()->params()),

     BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,

                      "Use sigmoidal score for certainty",

                      dict->getCCUtil()->params()),

   fontinfo_table_(fontinfo_table), dict_(dict),

   fixed_pitch_(false), max_char_wh_ratio_(0.0),

   acceptable_choice_found_(false) {

   ASSERT_HOST(dict_ != NULL);

   // The DawgArgs object is given a freshly allocated DawgPositionVector; the
   // destructor deletes dawg_args_->updated_dawgs explicitly.
   dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);

   very_beginning_active_dawgs_ = new DawgPositionVector();

   beginning_active_dawgs_ = new DawgPositionVector();

 }


 // Frees the heap objects that the constructor allocated.
 LanguageModel::~LanguageModel() {
   // updated_dawgs is reached through dawg_args_, so it has to be released
   // before the DawgArgs object itself.
   delete dawg_args_->updated_dawgs;
   delete dawg_args_;
   delete beginning_active_dawgs_;
   delete very_beginning_active_dawgs_;
 }


 // Resets the per-word state of the language model.
 //   prev_word: the preceding word (may be NULL); only consulted when the
 //              character ngram model is switched on.
 //   fixed_pitch, max_char_wh_ratio, rating_cert_scale: cached in the
 //              members of the same name.
 void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
                                 bool fixed_pitch, float max_char_wh_ratio,
                                 float rating_cert_scale) {
   // Cache the caller-supplied settings and clear the per-word flags.
   correct_segmentation_explored_ = false;
   acceptable_choice_found_ = false;
   rating_cert_scale_ = rating_cert_scale;
   max_char_wh_ratio_ = max_char_wh_ratio;
   fixed_pitch_ = fixed_pitch;

   // Refill both starting dawg position vectors from the dictionary.
   very_beginning_active_dawgs_->clear();
   dict_->init_active_dawgs(very_beginning_active_dawgs_, false);
   beginning_active_dawgs_->clear();
   dict_->default_dawgs(beginning_active_dawgs_, false);

   // The remaining work only concerns the character ngram model.
   if (!language_model_ngram_on) return;

   // prev_word_str_ becomes the text of prev_word (followed by a space for
   // space-delimited languages), or a single space if there is no usable
   // previous word.
   const bool have_prev =
       prev_word != NULL && prev_word->unichar_string() != NULL;
   if (have_prev) {
     prev_word_str_ = prev_word->unichar_string();
     if (language_model_ngram_space_delimited_language) {
       prev_word_str_ += ' ';
     }
   } else {
     prev_word_str_ = " ";
   }

   // Count the UTF-8 steps in prev_word_str_. A zero step stops the walk
   // early, in which case the assertion below fires.
   prev_word_unichar_step_len_ = 0;
   const char *cursor = prev_word_str_.string();
   const char *const limit = cursor + prev_word_str_.length();
   while (cursor != limit) {
     const int step = UNICHAR::utf8_step(cursor);
     if (step == 0) break;
     cursor += step;
     ++prev_word_unichar_step_len_;
   }
   ASSERT_HOST(cursor == limit);
 }


 static void ScanParentsForCaseMix(const UNICHARSET& unicharset,

                                   LanguageModelState* parent_node) {

   if (parent_node == NULL) return;

   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);

   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {

     ViterbiStateEntry* vse = vit.data();

     vse->competing_vse = NULL;

     UNICHAR_ID unichar_id = vse->curr_b->unichar_id();

     if (unicharset.get_isupper(unichar_id) ||

         unicharset.get_islower(unichar_id)) {

       UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);

       if (other_case == unichar_id) continue;  // Not in unicharset.

       // Find other case in same list. There could be multiple entries with

       // the same unichar_id, but in theory, they should all point to the

       // same BLOB_CHOICE, and that is what we will be using to decide

       // which to keep.

       ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);

       for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&

            vit2.data()->curr_b->unichar_id() != other_case;

            vit2.forward()) {}

       if (!vit2.cycled_list()) {

         vse->competing_vse = vit2.data();

       }

     }

   }

 }


 static bool HasBetterCaseVariant(const UNICHARSET& unicharset,

                                  const BLOB_CHOICE* choice,

                                  BLOB_CHOICE_LIST* choices) {

   UNICHAR_ID choice_id = choice->unichar_id();

   UNICHAR_ID other_case = unicharset.get_other_case(choice_id);

   if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)

     return false;  // Not upper or lower or not in unicharset.

   if (unicharset.SizesDistinct(choice_id, other_case))

     return false;  // Can be separated by size.

   BLOB_CHOICE_IT bc_it(choices);

   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {

     BLOB_CHOICE* better_choice = bc_it.data();

     if (better_choice->unichar_id() == other_case)

       return true;  // Found an earlier instance of other_case.

     else if (better_choice == choice)

       return false;  // Reached the original choice.

   }

   return false;  // Should never happen, but just in case.

 }


 // Combines every non-fragment BLOB_CHOICE in curr_list with the entries of
 // parent_node's viterbi list (or with no parent at the beginning of a word)
 // and calls AddViterbiStateEntry() for each combination that survives the
 // filters below.
 //   just_classified: forwarded to GetNextParentVSE(); when false only
 //       parents marked as updated are considered there.
 //   curr_col, curr_row: forwarded to AddViterbiStateEntry(); curr_row also
 //       selects the state in best_choice_bundle->beam and determines whether
 //       this is the end of the word.
 //   curr_list: the blob choices to process.
 //   parent_node: state holding the parent viterbi entries; NULL means the
 //       beginning of a word.
 //   pain_points, word_res, best_choice_bundle, blamer_bundle: forwarded to
 //       AddViterbiStateEntry().
 // Returns true if at least one AddViterbiStateEntry() call returned true.
 // Returns false without processing curr_list if parent_node is non-NULL but
 // SetTopParentLowerUpperDigit() reports no parents.
 bool LanguageModel::UpdateState(

     bool just_classified,

     int curr_col, int curr_row,

     BLOB_CHOICE_LIST *curr_list,

     LanguageModelState *parent_node,

     LMPainPoints *pain_points,

     WERD_RES *word_res,

     BestChoiceBundle *best_choice_bundle,

     BlamerBundle *blamer_bundle) {

   if (language_model_debug_level > 0) {

     tprintf("\nUpdateState: col=%d row=%d %s",

             curr_col, curr_row, just_classified ? "just_classified" : "");

     if (language_model_debug_level > 5)

       tprintf("(parent=%p)\n", parent_node);

     else

       tprintf("\n");

   }

   // Initialize helper variables.

   // The word ends here when curr_row is the last row of the ratings matrix.
   bool word_end = (curr_row+1 >= word_res->ratings->dimension());

   bool new_changed = false;

   float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;

   const UNICHARSET& unicharset = dict_->getUnicharset();

   BLOB_CHOICE *first_lower = NULL;

   BLOB_CHOICE *first_upper = NULL;

   BLOB_CHOICE *first_digit = NULL;

   // has_alnum_mix ends up true only if both the parent list and curr_list
   // contain a mix of alphas and digits.
   bool has_alnum_mix = false;

   if (parent_node != NULL) {

     int result = SetTopParentLowerUpperDigit(parent_node);

     if (result < 0) {

       if (language_model_debug_level > 0)

         tprintf("No parents found to process\n");

       return false;

     }

     if (result > 0)

       has_alnum_mix = true;

   }

   if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,

                              &first_digit))

     has_alnum_mix = false;;

   ScanParentsForCaseMix(unicharset, parent_node);

   if (language_model_debug_level > 3 && parent_node != NULL) {

     parent_node->Print("Parent viterbi list");

   }

   LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];


   // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.

   ViterbiStateEntry_IT vit;

   BLOB_CHOICE_IT c_it(curr_list);

   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {

     BLOB_CHOICE* choice = c_it.data();

     // TODO(antonova): make sure commenting this out is ok for ngram

     // model scoring (I think this was introduced to fix ngram model quirks).

     // Skip NULL unichars unless it is the only choice.

     //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;

     UNICHAR_ID unichar_id = choice->unichar_id();

     if (unicharset.get_fragment(unichar_id)) {

       continue;  // Skip fragments.

     }

     // Set top choice flags.

     LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;

     // The smallest-rating flag goes to the first choice and to every later
     // choice for as long as no entry has been added yet.
     if (c_it.at_first() || !new_changed)

       blob_choice_flags |= kSmallestRatingFlag;

     if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;

     if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;

     if (first_digit == choice) blob_choice_flags |= kDigitFlag;


     if (parent_node == NULL) {

       // Process the beginning of a word.

       // If there is a better case variant that is not distinguished by size,

       // skip this blob choice, as we have no choice but to accept the result

       // of the character classifier to distinguish between them, even if

       // followed by an upper case.

       // With words like iPoc, and other CamelBackWords, the lower-upper

       // transition can only be achieved if the classifier has the correct case

       // as the top choice, and leaving an initial I lower down the list

       // increases the chances of choosing IPoc simply because it doesn't

       // include such a transition. iPoc will beat iPOC and ipoc because

       // the other words are baseline/x-height inconsistent.

       if (HasBetterCaseVariant(unicharset, choice, curr_list))

         continue;

       // Upper counts as lower at the beginning of a word.

       if (blob_choice_flags & kUpperCaseFlag)

         blob_choice_flags |= kLowerCaseFlag;

       new_changed |= AddViterbiStateEntry(

           blob_choice_flags, denom, word_end, curr_col, curr_row,

           choice, curr_state, NULL, pain_points,

           word_res, best_choice_bundle, blamer_bundle);

     } else {

       // Get viterbi entries from each parent ViterbiStateEntry.

       vit.set_to_list(&parent_node->viterbi_state_entries);

       int vit_counter = 0;

       vit.mark_cycle_pt();

       ViterbiStateEntry* parent_vse = NULL;

       LanguageModelFlagsType top_choice_flags;

       // GetNextParentVSE() advances vit itself and returns NULL once the
       // parent list is exhausted.
       while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,

                                             c_it.data(), blob_choice_flags,

                                             unicharset, word_res, &vit,

                                             &top_choice_flags)) != NULL) {

         // Skip pruned entries and do not look at prunable entries if already

         // examined language_model_viterbi_list_max_num_prunable of those.

         if (PrunablePath(*parent_vse) &&

             (++vit_counter > language_model_viterbi_list_max_num_prunable ||

              (language_model_ngram_on && parent_vse->ngram_info->pruned))) {

           continue;

         }

         // If the parent has no alnum choice, (ie choice is the first in a

         // string of alnum), and there is a better case variant that is not

         // distinguished by size, skip this blob choice/parent, as with the

         // initial blob treatment above.

         if (!parent_vse->HasAlnumChoice(unicharset) &&

             HasBetterCaseVariant(unicharset, choice, curr_list))

           continue;

         // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()

         // looks good according to the Dawgs or character ngram model.

         new_changed |= AddViterbiStateEntry(

             top_choice_flags, denom, word_end, curr_col, curr_row,

             c_it.data(), curr_state, parent_vse, pain_points,

             word_res, best_choice_bundle, blamer_bundle);

       }

     }

   }

   return new_changed;

 }


 // Scans curr_list, ignoring fragments, and stores the first lower-case, the
 // first non-lower-case alpha and the first digit choice in *first_lower,
 // *first_upper and *first_digit respectively. Outputs that are already
 // non-NULL on entry are left alone; outputs still NULL after the scan are
 // set to the first non-fragment choice.
 // Returns true if the list holds a digit as well as at least one alpha,
 // judged before the fallback is applied.
 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
                                           BLOB_CHOICE **first_lower,
                                           BLOB_CHOICE **first_upper,
                                           BLOB_CHOICE **first_digit) const {
   const UNICHARSET &charset = dict_->getUnicharset();
   BLOB_CHOICE *leading = NULL;  // First non-fragment choice in the list.
   BLOB_CHOICE_IT it(curr_list);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     BLOB_CHOICE *blob = it.data();
     const UNICHAR_ID id = blob->unichar_id();
     if (charset.get_fragment(id)) continue;  // Fragments never qualify.
     if (leading == NULL) leading = blob;
     if (*first_lower == NULL && charset.get_islower(id)) {
       *first_lower = blob;
     }
     // Any alpha that is not lower case counts as upper case here.
     if (*first_upper == NULL && charset.get_isalpha(id) &&
         !charset.get_islower(id)) {
       *first_upper = blob;
     }
     if (*first_digit == NULL && charset.get_isdigit(id)) {
       *first_digit = blob;
     }
   }
   ASSERT_HOST(leading != NULL);
   const bool has_alpha = *first_lower != NULL || *first_upper != NULL;
   const bool mixed = has_alpha && *first_digit != NULL;
   // Fall back to the first non-fragment choice for categories not found.
   BLOB_CHOICE **slots[] = {first_lower, first_upper, first_digit};
   for (int i = 0; i < 3; ++i) {
     if (*slots[i] == NULL) *slots[i] = leading;
   }
   return mixed;
 }


 // Finds, among the viterbi entries of parent_node, the lowest-rated
 // lower-case, upper-case and digit entries as well as the lowest-rated entry
 // overall, and sets the matching bits in their top_choice_flags.
 // Returns -1 if parent_node is NULL or has no entries, 1 if the list holds
 // a digit together with at least one alpha, and 0 otherwise.
 int LanguageModel::SetTopParentLowerUpperDigit(
     LanguageModelState *parent_node) const {
   if (parent_node == NULL) return -1;
   const UNICHARSET &charset = dict_->getUnicharset();
   ViterbiStateEntry *best_lower = NULL;
   ViterbiStateEntry *best_upper = NULL;
   ViterbiStateEntry *best_digit = NULL;
   ViterbiStateEntry *best_overall = NULL;
   float best_lower_rating = 0.0f;
   float best_upper_rating = 0.0f;
   float best_digit_rating = 0.0f;
   float best_overall_rating = 0.0f;
   UNICHAR_ID best_overall_id = INVALID_UNICHAR_ID;

   ViterbiStateEntry_IT it(&parent_node->viterbi_state_entries);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     ViterbiStateEntry *entry = it.data();
     // An INVALID_UNICHAR_ID is handled like a zero-width joiner: walk up
     // the parent chain to the nearest real character and use its id and
     // rating. The flags are nevertheless set on the list entry itself.
     const ViterbiStateEntry *real = entry;
     while (real->curr_b->unichar_id() == INVALID_UNICHAR_ID &&
            real->parent_vse != NULL) {
       real = real->parent_vse;
     }
     const UNICHAR_ID id = real->curr_b->unichar_id();
     const float rating = real->curr_b->rating();

     if (id != INVALID_UNICHAR_ID) {
       if (charset.get_islower(id)) {
         if (best_lower == NULL || rating < best_lower_rating) {
           best_lower = entry;
           best_lower_rating = rating;
         }
       } else if (charset.get_isalpha(id)) {
         // A non-lower-case alpha is treated as upper case.
         if (best_upper == NULL || rating < best_upper_rating) {
           best_upper = entry;
           best_upper_rating = rating;
         }
       } else if (charset.get_isdigit(id)) {
         if (best_digit == NULL || rating < best_digit_rating) {
           best_digit = entry;
           best_digit_rating = rating;
         }
       }
     }
     if (best_overall == NULL || rating < best_overall_rating) {
       best_overall = entry;
       best_overall_rating = rating;
       best_overall_id = id;
     }
   }
   if (best_overall == NULL) return -1;  // The list was empty.

   // Decide on the mix before the fallbacks below fill in missing categories.
   const bool mixed =
       (best_lower != NULL || best_upper != NULL) && best_digit != NULL;

   // A category without an entry of its own is credited to the overall best.
   if (best_lower == NULL) best_lower = best_overall;
   if (best_upper == NULL) best_upper = best_overall;
   if (best_digit == NULL) best_digit = best_overall;
   best_lower->top_choice_flags |= kLowerCaseFlag;
   best_upper->top_choice_flags |= kUpperCaseFlag;
   best_digit->top_choice_flags |= kDigitFlag;
   best_overall->top_choice_flags |= kSmallestRatingFlag;

   // A compound marker top choice that carries any of the top alnum flags
   // receives all of them, so that words like I-295 can be chosen.
   if (best_overall_id != INVALID_UNICHAR_ID &&
       dict_->compound_marker(best_overall_id) &&
       (best_overall->top_choice_flags &
           (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
     best_overall->top_choice_flags |=
         kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
   }
   if (mixed) return 1;
   return 0;
 }


 // Advances *vse_it to the next parent entry that the blob choice bc may bind
 // to and returns that entry, or NULL once the list has been cycled. On
 // success the iterator is left one step past the returned entry, so the next
 // call resumes from there, and *top_choice_flags holds blob_choice_flags
 // combined with the parent's top_choice_flags.
 //   just_classified: when false, only parents marked as updated qualify.
 //   mixed_alnum: when true, digits never bind to alphas and vice versa.
 ViterbiStateEntry* LanguageModel::GetNextParentVSE(
     bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
     LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
     WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
     LanguageModelFlagsType* top_choice_flags) const {
   for (; !vse_it->cycled_list(); vse_it->forward()) {
     ViterbiStateEntry* candidate = vse_it->data();
     // A parent qualifies only if it has been updated or the current ratings
     // cell has just been classified.
     if (!(just_classified || candidate->updated)) continue;
     if (language_model_debug_level > 2)
       candidate->Print("Considering");

     // Upper counts as lower when the parent is non-alnum. The child's flags
     // survive only where the parent carries them too.
     LanguageModelFlagsType flags = blob_choice_flags;
     if ((blob_choice_flags & kUpperCaseFlag) &&
         !candidate->HasAlnumChoice(unicharset)) {
       flags |= kLowerCaseFlag;
     }
     flags &= candidate->top_choice_flags;
     *top_choice_flags = flags;

     const UNICHAR_ID child_id = bc->unichar_id();
     const BLOB_CHOICE* parent_b = candidate->curr_b;
     const UNICHAR_ID parent_id = parent_b->unichar_id();
     // Digits and alphas refuse to bind to each other if there is a mix in
     // both parent and current, or if no top choice flag is left.
     const bool weak_link = mixed_alnum || flags == 0;
     if (unicharset.get_isdigit(child_id) &&
         unicharset.get_isalpha(parent_id) && weak_link) {
       continue;  // Digits don't bind to alphas.
     }
     if (unicharset.get_isalpha(child_id) &&
         unicharset.get_isdigit(parent_id) && weak_link) {
       continue;  // Alphas don't bind to digits.
     }

     // A non-NULL competing_vse means the parent list holds a case mix of the
     // same alpha; it is then used to decide whether to bind bc to this
     // parent.
     const ViterbiStateEntry* rival = candidate->competing_vse;
     if (rival != NULL) {
       const BLOB_CHOICE* rival_b = rival->curr_b;
       const UNICHAR_ID rival_id = rival_b->unichar_id();
       const bool verbose = language_model_debug_level >= 5;
       if (verbose) {
         tprintf("Parent %s has competition %s\n",
                 unicharset.id_to_unichar(parent_id),
                 unicharset.id_to_unichar(rival_id));
       }
       // If the rival agrees with bc in position and size while this parent
       // does not, leave bc for the rival.
       if (unicharset.SizesDistinct(parent_id, rival_id) &&
           bc->PosAndSizeAgree(*rival_b, word_res->x_height, verbose) &&
           !bc->PosAndSizeAgree(*parent_b, word_res->x_height, verbose)) {
         continue;  // Competing blobchoice has a better vertical match.
       }
     }
     // Step past the accepted parent so the next call starts after it.
     vse_it->forward();
     return candidate;
   }
   return NULL;  // No parent left to offer.
 }


 // Tries to create a ViterbiStateEntry for blob choice b with parent
 // parent_vse (NULL at the beginning of a word) and to insert it, sorted,
 // into curr_state->viterbi_state_entries.
 //   top_choice_flags: top choice flags proposed for the new entry; the
 //       x-height flag is cleared here if the path is x-height inconsistent.
 //   denom: forwarded to GenerateNgramInfo() when the ngram model is on.
 //   word_end: true if b ends the word; UpdateBestChoice() is then invoked.
 //   curr_col, curr_row: forwarded to the dawg, ngram, consistency and
 //       associate helpers.
 //   pain_points, word_res, best_choice_bundle, blamer_bundle: forwarded to
 //       UpdateBestChoice(); word_res is also used by the consistency and
 //       associate helpers.
 // Returns true if a new entry was added to curr_state, false if the list is
 // full or the entry was pruned at any stage.
 // NOTE(review): dawg_info and ngram_info are deleted here on the two early
 // escape paths; after they have been handed to the ViterbiStateEntry
 // constructor only new_vse is deleted, so the entry presumably owns them -
 // confirm against ViterbiStateEntry.
 // NOTE(review): curr_state is NULL-checked for the list-full test only and
 // is dereferenced unconditionally further down.
 bool LanguageModel::AddViterbiStateEntry(

     LanguageModelFlagsType top_choice_flags,

     float denom,

     bool word_end,

     int curr_col, int curr_row,

     BLOB_CHOICE *b,

     LanguageModelState *curr_state,

     ViterbiStateEntry *parent_vse,

     LMPainPoints *pain_points,

     WERD_RES *word_res,

     BestChoiceBundle *best_choice_bundle,

     BlamerBundle *blamer_bundle) {

   ViterbiStateEntry_IT vit;

   if (language_model_debug_level > 1) {

     tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"

             " certainty=%.4f top_choice_flags=0x%x",

             dict_->getUnicharset().id_to_unichar(b->unichar_id()),

             b->rating(), b->certainty(), top_choice_flags);

     if (language_model_debug_level > 5)

       tprintf(" parent_vse=%p\n", parent_vse);

     else

       tprintf("\n");

   }

   // Check whether the list is full.

   if (curr_state != NULL &&

       curr_state->viterbi_state_entries_length >=

           language_model_viterbi_list_max_size) {

     if (language_model_debug_level > 1) {

       tprintf("AddViterbiStateEntry: viterbi list is full!\n");

     }

     return false;

   }


   // Invoke Dawg language model component.

   LanguageModelDawgInfo *dawg_info =

     GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);


   float outline_length =

       AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);

   // Invoke Ngram language model component.

   LanguageModelNgramInfo *ngram_info = NULL;

   if (language_model_ngram_on) {

     ngram_info = GenerateNgramInfo(

         dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),

         denom, curr_col, curr_row, outline_length, parent_vse);

     ASSERT_HOST(ngram_info != NULL);

   }

   // The entry is liked if it is on a dawg path or has an unpruned ngram info.
   bool liked_by_language_model = dawg_info != NULL ||

       (ngram_info != NULL && !ngram_info->pruned);

   // Quick escape if not liked by the language model, can't be consistent

   // xheight, and not top choice.

   if (!liked_by_language_model && top_choice_flags == 0) {

     if (language_model_debug_level > 1) {

       tprintf("Language model components very early pruned this entry\n");

     }

     delete ngram_info;

     delete dawg_info;

     return false;

   }


   // Check consistency of the path and set the relevant consistency_info.

   LMConsistencyInfo consistency_info(

     parent_vse != NULL ? &parent_vse->consistency_info : NULL);

   // Start with just the x-height consistency, as it provides significant

   // pruning opportunity.

   consistency_info.ComputeXheightConsistency(

       b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));

   // Turn off xheight consistent flag if not consistent.

   if (consistency_info.InconsistentXHeight()) {

     top_choice_flags &= ~kXhtConsistentFlag;

   }


   // Quick escape if not liked by the language model, not consistent xheight,

   // and not top choice.

   if (!liked_by_language_model && top_choice_flags == 0) {

     if (language_model_debug_level > 1) {

       tprintf("Language model components early pruned this entry\n");

     }

     delete ngram_info;

     delete dawg_info;

     return false;

   }


   // Compute the rest of the consistency info.

   FillConsistencyInfo(curr_col, word_end, b, parent_vse,

                       word_res, &consistency_info);

   if (dawg_info != NULL && consistency_info.invalid_punc) {

     consistency_info.invalid_punc = false;  // do not penalize dict words

   }


   // Compute cost of associating the blobs that represent the current unichar.

   AssociateStats associate_stats;

   ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,

                         parent_vse, word_res, &associate_stats);

   // Shape cost and the bad-shape marker accumulate along the path.
   if (parent_vse != NULL) {

     associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;

     associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;

   }


   // Create the new ViterbiStateEntry and compute the adjusted cost of the

   // path.

   ViterbiStateEntry *new_vse = new ViterbiStateEntry(

       parent_vse, b, 0.0, outline_length,

       consistency_info, associate_stats, top_choice_flags, dawg_info,

       ngram_info, (language_model_debug_level > 0) ?

           dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);

   new_vse->cost = ComputeAdjustedPathCost(new_vse);

   if (language_model_debug_level >= 3)

     tprintf("Adjusted cost = %g\n", new_vse->cost);


   // Invoke Top Choice language model component to make the final adjustments

   // to new_vse->top_choice_flags.

   if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {

     GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);

   }


   // If language model components did not like this unichar - return.

   bool keep = new_vse->top_choice_flags || liked_by_language_model;

   if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths

       consistency_info.inconsistent_script) {       // with inconsistent script

     keep = false;

   }

   if (!keep) {

     if (language_model_debug_level > 1) {

       tprintf("Language model components did not like this entry\n");

     }

     delete new_vse;

     return false;

   }


   // Discard this entry if it represents a prunable path and

   // language_model_viterbi_list_max_num_prunable such entries with a lower

   // cost have already been recorded.

   if (PrunablePath(*new_vse) &&

       (curr_state->viterbi_state_entries_prunable_length >=

        language_model_viterbi_list_max_num_prunable) &&

       new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {

     if (language_model_debug_level > 1) {

       tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",

               new_vse->cost,

               curr_state->viterbi_state_entries_prunable_max_cost);

     }

     delete new_vse;

     return false;

   }


   // Update best choice if needed.

   if (word_end) {

     UpdateBestChoice(new_vse, pain_points, word_res,

                      best_choice_bundle, blamer_bundle);

     // Discard the entry if UpdateBestChoice() found flaws in it.

     if (new_vse->cost >= WERD_CHOICE::kBadRating &&

         new_vse != best_choice_bundle->best_vse) {

       if (language_model_debug_level > 1) {

         tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);

       }

       delete new_vse;

       return false;

     }

   }


   // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.

   curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,

                                                false, new_vse);

   curr_state->viterbi_state_entries_length++;

   if (PrunablePath(*new_vse)) {

     curr_state->viterbi_state_entries_prunable_length++;

   }


   // Update curr_state->viterbi_state_entries_prunable_max_cost and clear

   // top_choice_flags of entries with a cost higher than new_vse->cost.

   if ((curr_state->viterbi_state_entries_prunable_length >=

        language_model_viterbi_list_max_num_prunable) ||

       new_vse->top_choice_flags) {

     ASSERT_HOST(!curr_state->viterbi_state_entries.empty());

     int prunable_counter = language_model_viterbi_list_max_num_prunable;

     vit.set_to_list(&(curr_state->viterbi_state_entries));

     for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {

       ViterbiStateEntry *curr_vse = vit.data();

       // Clear the appropriate top choice flags of the entries in the

       // list that have cost higher than new_vse->cost

       // (since they will not be top choices any more).

       if (curr_vse->top_choice_flags && curr_vse != new_vse &&

           curr_vse->cost > new_vse->cost) {

         curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);

       }

       if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;

       // Update curr_state->viterbi_state_entries_prunable_max_cost.

       // The counter reaches zero at the
       // language_model_viterbi_list_max_num_prunable-th prunable entry; its
       // cost becomes the new maximum.
       if (prunable_counter == 0) {

         curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;

         if (language_model_debug_level > 1) {

           tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",

                   curr_state->viterbi_state_entries_prunable_max_cost);

         }

         prunable_counter = -1;  // stop counting

       }

     }

   }


   // Print the newly created ViterbiStateEntry.

   if (language_model_debug_level > 2) {

     new_vse->Print("New");

     if (language_model_debug_level > 5)

       curr_state->Print("Updated viterbi list");

   }


   return true;

 }


 // Removes from new_vse->top_choice_flags every flag that is already held by
 // an entry of lms->viterbi_state_entries whose cost does not exceed
 // new_vse->cost. Entries are visited from the front of the list; the walk
 // ends at the first entry for which new_vse->cost >= entry cost no longer
 // holds, or as soon as new_vse has no flags left.
 // parent_vse is part of the interface but is not used by this function.
 void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
                                           const ViterbiStateEntry *parent_vse,
                                           LanguageModelState *lms) {
   ViterbiStateEntry_IT entry_it(&(lms->viterbi_state_entries));
   entry_it.mark_cycle_pt();
   while (!entry_it.cycled_list() && new_vse->top_choice_flags) {
     const ViterbiStateEntry *listed_vse = entry_it.data();
     // Only entries that are at least as cheap as new_vse may take flags away.
     if (!(new_vse->cost >= listed_vse->cost)) break;
     new_vse->top_choice_flags &= ~(listed_vse->top_choice_flags);
     entry_it.forward();
   }
   if (language_model_debug_level > 2) {
     tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
             new_vse->top_choice_flags);
   }
 }


 // Computes the dictionary (DAWG) state reached by appending blob choice b to
 // the path that ends at parent_vse (or to an empty path when parent_vse is
 // NULL). Returns a newly allocated LanguageModelDawgInfo when the extended
 // path is accepted, or NULL when it is not (this includes the case where the
 // parent path carries no dawg_info).
 //   word_end - true if b is the last character of the word.
 //   curr_col - only compared with 0 and forwarded to has_hyphen_end().
 //   curr_row - not used by this function.
 // Side effects: dawg_args_->active_dawgs and dawg_args_->permuter are
 // overwritten; on the LetterIsOkay() path active_dawgs is reset to NULL
 // before returning because it may point at a local vector.
 LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
     bool word_end,
     int curr_col, int curr_row,
     const BLOB_CHOICE &b,
     const ViterbiStateEntry *parent_vse) {
   const bool has_parent = (parent_vse != NULL);

   // Seed dawg_args_ from the parent path if there is one, otherwise from
   // very_beginning_active_dawgs_.
   if (has_parent) {
     if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
     dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
     dawg_args_->permuter = parent_vse->dawg_info->permuter;
   } else {
     dawg_args_->active_dawgs = very_beginning_active_dawgs_;
     dawg_args_->permuter = NO_PERM;
   }

   // Hyphenated words: the new info is built from the current active dawgs.
   if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
     if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
     return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                      COMPOUND_PERM);
   }

   // Compound words.
   if (dict_->compound_marker(b.unichar_id()) &&
       (!has_parent || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
     if (language_model_debug_level > 0) tprintf("Found compound marker\n");
     // A compound operator is rejected:
     //  - at the beginning or at the end of the word,
     //  - if the word already contains one,
     //  - if the part before it is shorter than
     //    language_model_min_compound_length.
     if (!has_parent) return NULL;
     if (word_end) return NULL;
     if (dawg_args_->permuter == COMPOUND_PERM) return NULL;
     if (parent_vse->length < language_model_min_compound_length) return NULL;

     // The path that ends just before the current character must be a
     // complete word in at least one word dawg.
     bool parent_is_word = false;
     for (int d = 0;
          d < parent_vse->dawg_info->active_dawgs->size() && !parent_is_word;
          ++d) {
       const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[d];
       const Dawg *pdawg = NULL;
       if (pos.dawg_index >= 0) pdawg = dict_->GetDawg(pos.dawg_index);
       if (pdawg == NULL || pos.back_to_punc) continue;
       parent_is_word = (pdawg->type() == DAWG_TYPE_WORD &&
                         pos.dawg_ref != NO_EDGE &&
                         pdawg->end_of_word(pos.dawg_ref));
     }
     if (!parent_is_word) return NULL;

     if (language_model_debug_level > 0) tprintf("Compound word found\n");
     return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
   }  // done dealing with compound words

   // Call LetterIsOkay() for each normalized id of the unichar.
   // Use the normalized IDs so that all shapes of ' can be allowed in words
   // like don't.
   const GenericVector<UNICHAR_ID>& normed_ids =
       dict_->getUnicharset().normed_ids(b.unichar_id());
   const int last_index = normed_ids.size() - 1;
   DawgPositionVector tmp_active_dawgs;
   for (int n = 0; n <= last_index; ++n) {
     if (language_model_debug_level > 2) {
       tprintf("Test Letter OK for unichar %d, normed %d\n",
               b.unichar_id(), normed_ids[n]);
     }
     // Only the final normalized id may terminate the word.
     dict_->LetterIsOkay(dawg_args_, normed_ids[n],
                         word_end && n == last_index);
     if (dawg_args_->permuter == NO_PERM) break;
     if (n < last_index) {
       // Feed the updated positions back in as input for the next id.
       tmp_active_dawgs = *dawg_args_->updated_dawgs;
       dawg_args_->active_dawgs = &tmp_active_dawgs;
     }
     if (language_model_debug_level > 2) {
       tprintf("Letter was OK for unichar %d, normed %d\n",
               b.unichar_id(), normed_ids[n]);
     }
   }
   // active_dawgs may point at tmp_active_dawgs, which dies with this scope.
   dawg_args_->active_dawgs = NULL;

   if (dawg_args_->permuter == NO_PERM) {
     if (language_model_debug_level > 3) {
       tprintf("Letter %s not OK!\n",
               dict_->getUnicharset().id_to_unichar(b.unichar_id()));
     }
     return NULL;
   }
   return new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
                                    dawg_args_->permuter);
 }


 // Builds the character n-gram state for the path obtained by appending
 // unichar to parent_vse (or to the previous word when parent_vse is NULL).
 // Returns a newly allocated LanguageModelNgramInfo holding the new context,
 // its length in unichar steps, the pruned flag and the accumulated costs.
 //   certainty, denom - forwarded to ComputeNgramCost().
 //   outline_length   - scales the n-gram-and-classifier cost of this step.
 //   curr_col, curr_row - not used by this function.
 // When parent_vse is not NULL its ngram_info is dereferenced without a NULL
 // check.
 LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
     const char *unichar, float certainty, float denom,
     int curr_col, int curr_row, float outline_length,
     const ViterbiStateEntry *parent_vse) {
   const bool has_parent = (parent_vse != NULL);
   const LanguageModelNgramInfo *parent_info =
       has_parent ? parent_vse->ngram_info : NULL;

   // Context comes from the parent path, or from the previous word at the
   // start of a path.
   const char *context_start = has_parent ? parent_info->context.string()
                                          : prev_word_str_.string();
   int context_step_len = has_parent ? parent_info->context_unichar_step_len
                                     : prev_word_unichar_step_len_;

   // Compute p(unichar | parent context).
   int unichar_step_len = 0;
   bool pruned = false;
   float ngram_cost;
   float ngram_and_classifier_cost =
       ComputeNgramCost(unichar, certainty, denom, context_start,
                        &unichar_step_len, &pruned, &ngram_cost);
   // Only ngram_and_classifier_cost is normalized by outline_length here.
   // The ngram_cost is used by the params_model, so it needs to be left as-is,
   // and the params model cost will be normalized by outline_length.
   ngram_and_classifier_cost *=
       outline_length / language_model_ngram_rating_factor;

   // Accumulate the parent's costs and inherit its pruned state.
   if (has_parent) {
     ngram_and_classifier_cost += parent_info->ngram_and_classifier_cost;
     ngram_cost += parent_info->ngram_cost;
     if (parent_info->pruned) pruned = true;
   }

   // Drop unichars from the front of the parent context so that context plus
   // the new unichar does not exceed language_model_ngram_order steps.
   int excess = (unichar_step_len + context_step_len -
                 language_model_ngram_order);
   if (excess > 0) {
     context_step_len -= excess;
     for (; excess > 0 && *context_start != '\0'; --excess) {
       context_start += UNICHAR::utf8_step(context_start);
     }
   }

   // Construct and return the new LanguageModelNgramInfo.
   LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
       context_start, context_step_len, pruned, ngram_cost,
       ngram_and_classifier_cost);
   ngram_info->context += unichar;
   ngram_info->context_unichar_step_len += unichar_step_len;
   assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
   return ngram_info;
 }


 // Computes the cost of emitting unichar after context by combining the
 // character n-gram probability with the classifier certainty.
 //   unichar          - NUL-terminated UTF-8 string of the candidate; it may
 //                      consist of several UTF-8 characters.
 //   certainty, denom - CertaintyScore(certainty) / denom is the classifier
 //                      term of the returned cost.
 //   context          - NUL-terminated UTF-8 string preceding unichar.
 //   unichar_step_len - incremented once per UTF-8 step examined. It is not
 //                      reset here and is also used as the divisor when
 //                      averaging, so the caller must initialize it.
 //   found_small_prob - set to true when the averaged probability is below
 //                      language_model_ngram_small_prob; never cleared here.
 //   ngram_cost       - receives -log2 of the (possibly clamped) probability.
 // Returns -log2(CertaintyScore(certainty) / denom) +
 //         *ngram_cost * language_model_ngram_scale_factor.
 float LanguageModel::ComputeNgramCost(const char *unichar,
                                       float certainty,
                                       float denom,
                                       const char *context,
                                       int *unichar_step_len,
                                       bool *found_small_prob,
                                       float *ngram_cost) {
   const char *context_ptr = context;
   char *modified_context = NULL;
   char *modified_context_end = NULL;
   const char *unichar_ptr = unichar;
   const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
   float prob = 0.0f;
   int step = 0;
   while (unichar_ptr < unichar_end &&
          (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
     // Query the model once per step and reuse the value for debug output.
     const double step_prob =
         dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
     if (language_model_debug_level > 1) {
       tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr, step_prob);
     }
     prob += step_prob;
     ++(*unichar_step_len);
     if (language_model_ngram_use_only_first_uft8_step) break;
     unichar_ptr += step;
     // If there are multiple UTF8 characters present in unichar, context is
     // updated to include the previously examined characters from unichar,
     // unless use_only_first_uft8_step is true.
     if (unichar_ptr < unichar_end) {
       if (modified_context == NULL) {
         // Room for the context, everything of unichar from the step just
         // examined onwards, and the terminating NUL.
         int context_len = strlen(context);
         modified_context =
           new char[context_len + strlen(unichar_ptr) + step + 1];
         strncpy(modified_context, context, context_len);
         modified_context_end = modified_context + context_len;
         context_ptr = modified_context;
       }
       // Append the step just examined; the buffer is re-terminated below.
       strncpy(modified_context_end, unichar_ptr - step, step);
       modified_context_end += step;
       *modified_context_end = '\0';
     }
   }
   // Normalize by the number of steps. When no step was counted (empty or
   // malformed unichar) the division is skipped: 0/0 would yield a NaN that
   // is not caught by the small-probability clamp below and would end up in
   // both returned costs. With prob left at 0 the clamp applies instead.
   if (*unichar_step_len > 0) {
     prob /= static_cast<float>(*unichar_step_len);
   }
   if (prob < language_model_ngram_small_prob) {
     if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
     *found_small_prob = true;
     prob = language_model_ngram_small_prob;
   }
   *ngram_cost = -1.0*log2(prob);
   float ngram_and_classifier_cost =
       -1.0*log2(CertaintyScore(certainty)/denom) +
       *ngram_cost * language_model_ngram_scale_factor;
   if (language_model_debug_level > 1) {
     tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
             unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
             ngram_and_classifier_cost);
   }
   delete[] modified_context;  // deleting NULL is a no-op
   return ngram_and_classifier_cost;
 }


 // Returns the normalization denominator for the classifier scores of one
 // blob position: the sum of CertaintyScore() over every choice in curr_list
 // plus an estimate for the unichars that have no choice in the list.
 // Returns 1.0f for an empty list.
 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
   if (curr_list->empty()) return 1.0f;

   BLOB_CHOICE_IT c_it(curr_list);
   float score_total = 0.0f;
   int len = 0;
   c_it.mark_cycle_pt();
   while (!c_it.cycled_list()) {
     ASSERT_HOST(c_it.data() != NULL);
     score_total += CertaintyScore(c_it.data()->certainty());
     ++len;
     c_it.forward();
   }
   assert(len != 0);

   // The ideal situation would be to have the classifier scores for
   // classifying each position as each of the characters in the unicharset.
   // Since we can not do this because of speed, we add a very crude estimate
   // of what these scores for the "missing" classifications would sum up to:
   // every unichar without a choice counts as
   // CertaintyScore(language_model_ngram_nonmatch_score).
   score_total += (dict_->getUnicharset().size() - len) *
     CertaintyScore(language_model_ngram_nonmatch_score);
   return score_total;
 }


 void LanguageModel::FillConsistencyInfo(

     int curr_col,

     bool word_end,

     BLOB_CHOICE *b,

     ViterbiStateEntry *parent_vse,

     WERD_RES *word_res,

     LMConsistencyInfo *consistency_info) {

   const UNICHARSET &unicharset = dict_->getUnicharset();

   UNICHAR_ID unichar_id = b->unichar_id();

   BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;


   // Check punctuation validity.

   if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;

   if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {

     if (dict_->compound_marker(unichar_id) && parent_b != NULL &&

         (unicharset.get_isalpha(parent_b->unichar_id()) ||

          unicharset.get_isdigit(parent_b->unichar_id()))) {

       // reset punc_ref for compound words

       consistency_info->punc_ref = NO_EDGE;

     } else {

       bool is_apos = dict_->is_apostrophe(unichar_id);

       bool prev_is_numalpha = (parent_b != NULL &&

           (unicharset.get_isalpha(parent_b->unichar_id()) ||

            unicharset.get_isdigit(parent_b->unichar_id())));

       UNICHAR_ID pattern_unichar_id =

         (unicharset.get_isalpha(unichar_id) ||

          unicharset.get_isdigit(unichar_id) ||

          (is_apos && prev_is_numalpha)) ?

         Dawg::kPatternUnicharID : unichar_id;

       if (consistency_info->punc_ref == NO_EDGE ||

           pattern_unichar_id != Dawg::kPatternUnicharID ||

           dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=

           Dawg::kPatternUnicharID) {

         NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),

                                               consistency_info->punc_ref);

         consistency_info->punc_ref =

           (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(

               node, pattern_unichar_id, word_end) : NO_EDGE;

         if (consistency_info->punc_ref == NO_EDGE) {

           consistency_info->invalid_punc = true;

         }

       }

     }

   }


   // Update case related counters.

   if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {

     // Reset counters if we are dealing with a compound word.

     consistency_info->num_lower = 0;

     consistency_info->num_non_first_upper = 0;

   }

   else if (unicharset.get_islower(unichar_id)) {

     consistency_info->num_lower++;

   } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {

     if (unicharset.get_isupper(parent_b->unichar_id()) ||

         consistency_info->num_lower > 0 ||

         consistency_info->num_non_first_upper > 0) {

       consistency_info->num_non_first_upper++;

     }

   }


   // Initialize consistency_info->script_id (use script of unichar_id

   // if it is not Common, use script id recorded by the parent otherwise).

   // Set inconsistent_script to true if the script of the current unichar

   // is not consistent with that of the parent.

   consistency_info->script_id = unicharset.get_script(unichar_id);

   // Hiragana and Katakana can mix with Han.

   if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {

     if ((unicharset.hiragana_sid() != unicharset.null_sid() &&

          consistency_info->script_id == unicharset.hiragana_sid()) ||

         (unicharset.katakana_sid() != unicharset.null_sid() &&

          consistency_info->script_id == unicharset.katakana_sid())) {

       consistency_info->script_id = dict_->getUnicharset().han_sid();

     }

   }


   if (parent_vse != NULL &&

       (parent_vse->consistency_info.script_id !=

        dict_->getUnicharset().common_sid())) {

     int parent_script_id = parent_vse->consistency_info.script_id;

     // If script_id is Common, use script id of the parent instead.

     if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {

       consistency_info->script_id = parent_script_id;

     }

     if (consistency_info->script_id != parent_script_id) {

       consistency_info->inconsistent_script = true;

     }

   }


   // Update chartype related counters.

   if (unicharset.get_isalpha(unichar_id)) {

     consistency_info->num_alphas++;

   } else if (unicharset.get_isdigit(unichar_id)) {

     consistency_info->num_digits++;

   } else if (!unicharset.get_ispunctuation(unichar_id)) {

     consistency_info->num_other++;

   }


   // Check font and spacing consistency.

   if (fontinfo_table_->size() > 0 && parent_b != NULL) {

     int fontinfo_id = -1;

     if (parent_b->fontinfo_id() == b->fontinfo_id() ||

         parent_b->fontinfo_id2() == b->fontinfo_id()) {

       fontinfo_id = b->fontinfo_id();

     } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||

                 parent_b->fontinfo_id2() == b->fontinfo_id2()) {

       fontinfo_id = b->fontinfo_id2();

     }

     if(language_model_debug_level > 1) {

       tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",

               (parent_b->fontinfo_id() >= 0) ?

                   fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,

               (parent_b->fontinfo_id2() >= 0) ?

                   fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",

               (b->fontinfo_id() >= 0) ?

                   fontinfo_table_->get(b->fontinfo_id()).name : "",

               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",

               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",

               fontinfo_id);

     }

     if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info

       bool expected_gap_found = false;

       float expected_gap;

       int temp_gap;

       if (fontinfo_id >= 0) {  // found a common font

         ASSERT_HOST(fontinfo_id < fontinfo_table_->size());

         if (fontinfo_table_->get(fontinfo_id).get_spacing(

             parent_b->unichar_id(), unichar_id, &temp_gap)) {

           expected_gap = temp_gap;

           expected_gap_found = true;

         }

       } else {

         consistency_info->inconsistent_font = true;

         // Get an average of the expected gaps in each font

         int num_addends = 0;

         expected_gap = 0;

         int temp_fid;

         for (int i = 0; i < 4; ++i) {

           if (i == 0) {

             temp_fid = parent_b->fontinfo_id();

           } else if (i == 1) {

             temp_fid = parent_b->fontinfo_id2();

           } else if (i == 2) {

             temp_fid = b->fontinfo_id();

           } else {

             temp_fid = b->fontinfo_id2();

           }

           ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());

           if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(

               parent_b->unichar_id(), unichar_id, &temp_gap)) {

             expected_gap += temp_gap;

             num_addends++;

           }

         }

         expected_gap_found = (num_addends > 0);

         if (num_addends > 0) {

           expected_gap /= static_cast<float>(num_addends);

         }

       }

       if (expected_gap_found) {

         float actual_gap =

             static_cast<float>(word_res->GetBlobsGap(curr_col-1));

         float gap_ratio = expected_gap / actual_gap;

         // TODO(rays) The gaps seem to be way off most of the time, saved by

         // the error here that the ratio was compared to 1/2, when it should

         // have been 0.5f. Find the source of the gaps discrepancy and put

         // the 0.5f here in place of 0.0f.

         // Test on 2476595.sj, pages 0 to 6. (In French.)

         if (gap_ratio < 0.0f || gap_ratio > 2.0f) {

           consistency_info->num_inconsistent_spaces++;

         }

         if (language_model_debug_level > 1) {

           tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",

                   unicharset.id_to_unichar(parent_b->unichar_id()),

                   parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),

                   unichar_id, curr_col, expected_gap, actual_gap);

         }

       }

     }

   }

 }


 // Returns the cost of the path ending at vse (which must not be NULL).
 // If params_model_ is initialized, the cost is the model's cost for the
 // features of the path multiplied by vse->outline_length. Otherwise a
 // multiplicative adjustment (starting at 1.0) is built from dictionary,
 // length and shape penalties and applied to the n-gram-and-classifier cost
 // (when language_model_ngram_on) or to vse->ratings_sum.
 float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
   ASSERT_HOST(vse != NULL);

   if (params_model_.Initialized()) {
     float features[PTRAIN_NUM_FEATURE_TYPES];
     ExtractFeaturesFromPath(*vse, features);
     float cost = params_model_.ComputeCost(features);
     if (language_model_debug_level > 3) {
       tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
       if (language_model_debug_level >= 5) {
         for (int type = 0; type < PTRAIN_NUM_FEATURE_TYPES; ++type) {
           tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[type],
                   features[type]);
         }
       }
     }
     return cost * vse->outline_length;
   }

   // No params model: accumulate hand-tuned penalties.
   const bool in_dict = (vse->dawg_info != NULL);
   float adjustment = 1.0f;
   if (!in_dict || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
     adjustment += language_model_penalty_non_freq_dict_word;
   }
   if (!in_dict) {
     adjustment += language_model_penalty_non_dict_word;
     // Non-dictionary paths are penalized further for every unichar beyond
     // language_model_min_compound_length.
     if (vse->length > language_model_min_compound_length) {
       adjustment += ((vse->length - language_model_min_compound_length) *
           language_model_penalty_increment);
     }
   }
   if (vse->associate_stats.shape_cost > 0) {
     adjustment += vse->associate_stats.shape_cost /
         static_cast<float>(vse->length);
   }

   if (language_model_ngram_on) {
     ASSERT_HOST(vse->ngram_info != NULL);
     return vse->ngram_info->ngram_and_classifier_cost * adjustment;
   }
   // The consistency adjustment is only applied when the ngram model is off.
   adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                              vse->consistency_info);
   return vse->ratings_sum * adjustment;
 }


 // Builds the WERD_CHOICE for the path ending at vse and offers it to
 // word_res as a raw choice and as a cooked choice. If it becomes
 // word_res->best_choice, best_choice_bundle, acceptable_choice_found_ and
 // the dictionary's hyphen state are updated.
 // When blamer_bundle is not NULL the features of the path are recorded as a
 // hypothesis; while blamer_bundle->GuidedSegsearchStillGoing() the word is
 // deleted right after that and nothing else is done.
 // The constructed word is either deleted here, or handed to
 // word_res->LogNewCookedChoice().
 // pain_points is not used by this function.
 void LanguageModel::UpdateBestChoice(
     ViterbiStateEntry *vse,
     LMPainPoints *pain_points,
     WERD_RES *word_res,
     BestChoiceBundle *best_choice_bundle,
     BlamerBundle *blamer_bundle) {
   bool on_truth_path;
   WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
                                     blamer_bundle, &on_truth_path);
   ASSERT_HOST(word != NULL);
   if (dict_->stopper_debug_level >= 1) {
     STRING word_text;
     word->string_and_lengths(&word_text, NULL);
     vse->Print(word_text.string());
   }
   if (language_model_debug_level > 0) {
     word->print("UpdateBestChoice() constructed word");
   }

   // Record features from the current path if necessary.
   ParamsTrainingHypothesis hypothesis;
   if (blamer_bundle != NULL) {
     if (vse->dawg_info != NULL) {
       vse->dawg_info->permuter = static_cast<PermuterType>(word->permuter());
     }
     ExtractFeaturesFromPath(*vse, hypothesis.features);
     word->string_and_lengths(&(hypothesis.str), NULL);
     hypothesis.cost = vse->cost;  // record cost for error rate computations
     if (language_model_debug_level > 0) {
       tprintf("Raw features extracted from %s (cost=%g) [ ",
               hypothesis.str.string(), hypothesis.cost);
       for (int type = 0; type < PTRAIN_NUM_FEATURE_TYPES; ++type) {
         tprintf("%g ", hypothesis.features[type]);
       }
       tprintf("]\n");
     }
     // Record the current hypothesis in the blamer bundle.
     blamer_bundle->AddHypothesis(hypothesis);
     if (on_truth_path) {
       blamer_bundle->UpdateBestRating(word->rating());
     }
     if (blamer_bundle->GuidedSegsearchStillGoing()) {
       // The word was constructed solely for blamer_bundle->AddHypothesis, so
       // we no longer need it.
       delete word;
       return;
     }
   }

   if (word_res->chopped_word != NULL &&
       !word_res->chopped_word->blobs.empty()) {
     word->SetScriptPositions(false, word_res->chopped_word);
   }

   // Update and log new raw_choice if needed.
   if (word_res->raw_choice == NULL ||
       word->rating() < word_res->raw_choice->rating()) {
     if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) {
       tprintf("Updated raw choice\n");
     }
   }

   // Set the modified rating for best choice to vse->cost and log best choice.
   word->set_rating(vse->cost);
   // Call LogNewChoice() for best choice from Dict::adjust_word() since it
   // computes adjust_factor that is used by the adaption code (e.g. by
   // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
   // Note: the rating of the word is not adjusted.
   dict_->adjust_word(word, vse->dawg_info == NULL,
                      vse->consistency_info.xht_decision, 0.0,
                      false, language_model_debug_level > 0);

   // Hand ownership of the word over to the word_res.
   const bool word_kept = word_res->LogNewCookedChoice(
       dict_->tessedit_truncate_wordchoice_log,
       dict_->stopper_debug_level >= 1, word);
   if (!word_kept) {
     // The word was so bad that it was deleted.
     return;
   }

   if (word_res->best_choice == word) {
     // Word was the new best.
     if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
         AcceptablePath(*vse)) {
       acceptable_choice_found_ = true;
     }
     // Update best_choice_bundle.
     best_choice_bundle->updated = true;
     best_choice_bundle->best_vse = vse;
     if (language_model_debug_level > 0) {
       tprintf("Updated best choice\n");
       word->print_state("New state ");
     }
     // Update hyphen state if we are dealing with a dictionary word.
     if (vse->dawg_info != NULL) {
       if (dict_->has_hyphen_end(*word)) {
         // NOTE(review): dawg_args_->active_dawgs is dereferenced without a
         // NULL check, while GenerateDawgInfo() sets it to NULL on one of its
         // return paths - confirm it is always valid here.
         dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs));
       } else {
         dict_->reset_hyphen_vars(true);
       }
     }

     if (blamer_bundle != NULL) {
       blamer_bundle->set_best_choice_is_dict_and_top_choice(
           vse->dawg_info != NULL && vse->top_choice_flags);
     }
   }

   if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
     word->DisplaySegmentation(word_res->chopped_word);
   }
 }


 void LanguageModel::ExtractFeaturesFromPath(

     const ViterbiStateEntry &vse, float features[]) {

   memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);

   // Record dictionary match info.

   int len = vse.length <= kMaxSmallWordUnichars ? 0 :

       vse.length <= kMaxMediumWordUnichars ? 1 : 2;

   if (vse.dawg_info != NULL) {

     int permuter = vse.dawg_info->permuter;

     if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {

       if (vse.consistency_info.num_digits == vse.length) {

         features[PTRAIN_DIGITS_SHORT+len] = 1.0;

       } else {

         features[PTRAIN_NUM_SHORT+len] = 1.0;

       }

     } else if (permuter == DOC_DAWG_PERM) {

       features[PTRAIN_DOC_SHORT+len] = 1.0;

     } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||

         permuter == COMPOUND_PERM) {

       features[PTRAIN_DICT_SHORT+len] = 1.0;

     } else if (permuter == FREQ_DAWG_PERM) {

       features[PTRAIN_FREQ_SHORT+len] = 1.0;

     }

   }

   // Record shape cost feature (normalized by path length).

   features[PTRAIN_SHAPE_COST_PER_CHAR] =

       vse.associate_stats.shape_cost / static_cast<float>(vse.length);

   // Record ngram cost. (normalized by the path length).

   features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;

   if (vse.ngram_info != NULL) {

     features[PTRAIN_NGRAM_COST_PER_CHAR] =

         vse.ngram_info->ngram_cost / static_cast<float>(vse.length);

   }

   // Record consistency-related features.

   // Disabled this feature for due to its poor performance.

   // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();

   features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();

   features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;

   features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?

       vse.consistency_info.NumInconsistentChartype() : 0.0;

   features[PTRAIN_NUM_BAD_SPACING] =

       vse.consistency_info.NumInconsistentSpaces();

   // Disabled this feature for now due to its poor performance.

   // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;


   // Classifier-related features.

   features[PTRAIN_RATING_PER_CHAR] =

       vse.ratings_sum / static_cast<float>(vse.outline_length);

 }


 // Builds and returns a newly allocated WERD_CHOICE for the path that ends at
 // vse by following parent_vse pointers back to the start of the path.
 //   word_res      - supplies the unicharset and the ratings matrix.
 //   fixpt         - forwarded to Dict::NoDangerousAmbig().
 //   blamer_bundle - may be NULL; used only to compute *truth_path.
 //   truth_path    - may be NULL; otherwise set to true iff blamer_bundle is
 //                   not NULL, the path length equals its correct
 //                   segmentation length and every matrix position on the
 //                   path is reported correct by the bundle.
 // Side effect: vse->associate_stats.full_wh_ratio_var is recomputed around
 // the whole-word mean and vse->associate_stats.shape_cost is adjusted to
 // match (only when full_wh_ratio_var was non-zero on entry).
 WERD_CHOICE *LanguageModel::ConstructWord(
     ViterbiStateEntry *vse,
     WERD_RES *word_res,
     DANGERR *fixpt,
     BlamerBundle *blamer_bundle,
     bool *truth_path) {
   if (truth_path != NULL) {
     const bool length_matches =
         (blamer_bundle != NULL &&
          vse->length == blamer_bundle->correct_segmentation_length());
     *truth_path = length_matches;
   }

   bool compound = dict_->hyphenated();  // treat hyphenated words as compound

   // Re-compute the variance of the width-to-height ratios (since we now
   // can compute the mean over the whole word). The old variance is taken
   // out of the shape cost first and the new one is added back at the end.
   float full_wh_ratio_mean = 0.0f;
   if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
     vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
     full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                           static_cast<float>(vse->length));
     vse->associate_stats.full_wh_ratio_var = 0.0f;
   }

   // Construct a WERD_CHOICE by tracing parent pointers, filling the word
   // from its last position to its first.
   WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
   word->set_length(vse->length);
   int total_blobs = 0;
   ViterbiStateEntry *path_vse = vse;
   BLOB_CHOICE *path_b = vse->curr_b;
   int i = vse->length - 1;
   while (i >= 0) {
     if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
         !blamer_bundle->MatrixPositionCorrect(i, path_b->matrix_cell())) {
       *truth_path = false;
     }
     // The number of blobs used for this choice is row - col + 1.
     int num_blobs = path_b->matrix_cell().row - path_b->matrix_cell().col + 1;
     total_blobs += num_blobs;
     word->set_blob_choice(i, num_blobs, path_b);

     // Update the width-to-height ratio variance. Useful for non-space
     // delimited languages to ensure that the blobs are of uniform width.
     // Skip leading and trailing punctuation when computing the variance.
     if (full_wh_ratio_mean != 0.0f) {
       const bool interior =
           (path_vse != vse && path_vse->parent_vse != NULL);
       if (interior ||
           !dict_->getUnicharset().get_ispunctuation(path_b->unichar_id())) {
         vse->associate_stats.full_wh_ratio_var +=
           pow(full_wh_ratio_mean - path_vse->associate_stats.full_wh_ratio, 2);
         if (language_model_debug_level > 2) {
           tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                   full_wh_ratio_mean, path_vse->associate_stats.full_wh_ratio);
         }
       }
     }

     // Mark the word as compound if compound permuter was set for any of
     // the unichars on the path (usually this will happen for unichars
     // that are compounding operators, like "-" and "/").
     if (!compound) {
       compound = (path_vse->dawg_info != NULL &&
                   path_vse->dawg_info->permuter == COMPOUND_PERM);
     }

     // Step to the parent entry; i is left untouched when the path ends.
     path_vse = path_vse->parent_vse;
     if (path_vse == NULL) break;
     path_b = path_vse->curr_b;
     --i;
   }
   ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
   ASSERT_HOST(total_blobs == word_res->ratings->dimension());
   // Re-adjust shape cost to include the updated width-to-height variance.
   if (full_wh_ratio_mean != 0.0f) {
     vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
   }

   word->set_rating(vse->ratings_sum);
   word->set_certainty(vse->min_certainty);
   word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                       vse->consistency_info.BodyMaxXHeight());

   // Pick the permuter: dictionary info wins, then an unpruned ngram path,
   // then a top choice path.
   PermuterType permuter = NO_PERM;
   if (vse->dawg_info != NULL) {
     permuter = compound ? COMPOUND_PERM : vse->dawg_info->permuter;
   } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
     permuter = NGRAM_PERM;
   } else if (vse->top_choice_flags) {
     permuter = TOP_CHOICE_PERM;
   }
   word->set_permuter(permuter);

   word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                             word_res->ratings));
   return word;
 }


 }  // namespace tesseract

UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:838

lm_pain_points.h

tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:81

tesseract::LanguageModelNgramInfo::context_unichar_step_len
int context_unichar_step_len
Definition: lm_state.h:81

COMPOUND_PERM
Definition: ratngs.h:253

WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:273

WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596

GenericVector::size
int size() const
Definition: genericvector.h:72

tesseract::ViterbiStateEntry::consistency_info
LMConsistencyInfo consistency_info
Definition: lm_state.h:173

tesseract::LanguageModel::dict_
Dict * dict_
Definition: language_model.h:375

UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472

tesseract::ParamsTrainingHypothesis::cost
float cost
Definition: params_training_featdef.h:118

tesseract::LanguageModel::GetTopLowerUpperDigit
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:385

tesseract::LanguageModel::language_model_viterbi_list_max_size
int language_model_viterbi_list_max_size
Definition: language_model.h:317

tesseract::ParamsModel::Initialized
bool Initialized()
Definition: params_model.h:43

tesseract::LanguageModelState::viterbi_state_entries_prunable_length
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:212

WERD_CHOICE::DisplaySegmentation
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:747

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

tesseract::LMConsistencyInfo::script_id
int script_id
Definition: lm_consistency.h:126

params_training_featdef.h

WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528

WERD_RES
Definition: pageres.h:155

tesseract::LanguageModel::ExtractFeaturesFromPath
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1340

UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:837

tesseract::LanguageModelNgramInfo::pruned
bool pruned
Definition: lm_state.h:86

WERD_CHOICE::set_certainty
void set_certainty(float new_val)
Definition: ratngs.h:369

tesseract::LanguageModel::language_model_ngram_nonmatch_score
double language_model_ngram_nonmatch_score
Definition: language_model.h:322

tesseract::LanguageModel::language_model_ngram_small_prob
double language_model_ngram_small_prob
Definition: language_model.h:320

tesseract::BestChoiceBundle::best_vse
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:237

tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0

tesseract::LanguageModelDawgInfo::permuter
PermuterType permuter
Definition: lm_state.h:69

WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:215

tesseract::LMConsistencyInfo::num_non_first_upper
int num_non_first_upper
Definition: lm_consistency.h:124

tesseract::ParamsTrainingHypothesis
Definition: params_training_featdef.h:106

tesseract::LanguageModel::language_model_ngram_rating_factor
double language_model_ngram_rating_factor
Definition: language_model.h:331

tesseract::DawgPosition::dawg_ref
EDGE_REF dawg_ref
Definition: dawg.h:362

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::LanguageModel::kDigitFlag
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:48

tesseract::LanguageModel::prev_word_unichar_step_len_
int prev_word_unichar_step_len_
Definition: language_model.h:393

tesseract::LanguageModel::fixed_pitch_
bool fixed_pitch_
Definition: language_model.h:382

tesseract::AssociateStats::full_wh_ratio
float full_wh_ratio
Definition: associate.h:56

tesseract::LanguageModel::fontinfo_table_
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:371

tesseract::LMConsistencyInfo::num_inconsistent_spaces
int num_inconsistent_spaces
Definition: lm_consistency.h:128

tesseract::PTRAIN_NUM_BAD_CASE
Definition: params_training_featdef.h:63

tesseract::ViterbiStateEntry::outline_length
float outline_length
Definition: lm_state.h:172

tesseract::LanguageModel::language_model_ngram_order
int language_model_ngram_order
Definition: language_model.h:312

USER_DAWG_PERM
Definition: ratngs.h:251

WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:201

tesseract::LanguageModel::CertaintyScore
float CertaintyScore(float cert)
Definition: language_model.h:104

WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612

tesseract::DawgPositionVector
Definition: dawg.h:369

UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:831

tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition: params_training_featdef.h:70

tprintf
#define tprintf(...)
Definition: tprintf.h:31

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::AssociateStats::full_wh_ratio_var
float full_wh_ratio_var
Definition: associate.h:59

tesseract::ViterbiStateEntry::updated
bool updated
Definition: lm_state.h:188

tesseract::ViterbiStateEntry::curr_b
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:160

UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:836

WERD_CHOICE::set_x_heights
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:339

PermuterType
PermuterType
Definition: ratngs.h:240

tesseract::LanguageModelNgramInfo
Definition: lm_state.h:74

tesseract::AssociateStats
Definition: associate.h:36

BlamerBundle::UpdateBestRating
void UpdateBestRating(float rating)
Definition: blamer.h:122

tesseract::LanguageModel::prev_word_str_
STRING prev_word_str_
Definition: language_model.h:392

tesseract::ViterbiStateEntry
Definition: lm_state.h:95

WERD_CHOICE::set_permuter
void set_permuter(uinT8 perm)
Definition: ratngs.h:372

tesseract::LanguageModel::wordrec_display_segmentations
int wordrec_display_segmentations
Definition: language_model.h:354

tesseract::Dict::reset_hyphen_vars
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32

tesseract::LanguageModel::~LanguageModel
~LanguageModel()
Definition: language_model.cpp:131

tesseract::LMConsistencyInfo::inconsistent_script
bool inconsistent_script
Definition: lm_consistency.h:127

NO_PERM
Definition: ratngs.h:241

tesseract::Dict
Definition: dict.h:86

tesseract::LanguageModel::language_model_ngram_space_delimited_language
bool language_model_ngram_space_delimited_language
Definition: language_model.h:333

tesseract::LanguageModelState::Print
void Print(const char *msg)
Definition: lm_state.cpp:70

tesseract::LMConsistencyInfo::punc_ref
EDGE_REF punc_ref
Definition: lm_consistency.h:122

tesseract::ParamsModel::ComputeCost
float ComputeCost(const float features[]) const
Definition: params_model.cpp:78

tesseract::LanguageModel::acceptable_choice_found_
bool acceptable_choice_found_
Definition: language_model.h:408

WERD_CHOICE::set_length
void set_length(int len)
Definition: ratngs.h:378

tesseract::LanguageModel::language_model_penalty_non_freq_dict_word
double language_model_penalty_non_freq_dict_word
Definition: language_model.h:338

tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523

USER_PATTERN_PERM
Definition: ratngs.h:248

STRING::length
inT32 length() const
Definition: strngs.cpp:188

tesseract::LanguageModel::language_model_debug_level
int language_model_debug_level
Definition: language_model.h:308

BLOB_CHOICE
Definition: ratngs.h:48

BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304

UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783

WERD_CHOICE
Definition: ratngs.h:271

tesseract::LanguageModel::language_model_ngram_on
bool language_model_ngram_on
Definition: language_model.h:310

DOC_DAWG_PERM
Definition: ratngs.h:250

tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540

tesseract::ViterbiStateEntry::ngram_info
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:186

tesseract::PTRAIN_RATING_PER_CHAR
Definition: params_training_featdef.h:68

tesseract::DawgArgs
Definition: dict.h:77

tesseract::LanguageModel::ConstructWord
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1389

tesseract::ViterbiStateEntry::dawg_info
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:182

WERD_RES::x_height
float x_height
Definition: pageres.h:295

BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:247

tesseract::LanguageModel::GenerateNgramInfo
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:878

tesseract::PTRAIN_DIGITS_SHORT
Definition: params_training_featdef.h:41

WERD_CHOICE::print_state
void print_state(const char *msg) const
Definition: ratngs.cpp:738

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

tesseract::AssociateStats::bad_shape
bool bad_shape
Definition: associate.h:55

tesseract::LanguageModel::UpdateState
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:255

freelist.h

tesseract::LanguageModel::SetTopParentLowerUpperDigit
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:425

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125

tesseract::LanguageModel::PrunablePath
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:291

tesseract::ViterbiStateEntry::parent_vse
ViterbiStateEntry * parent_vse
Definition: lm_state.h:161

tesseract::PTRAIN_NUM_BAD_CHAR_TYPE
Definition: params_training_featdef.h:65

tesseract::ParamsTrainingHypothesis::str
STRING str
Definition: params_training_featdef.h:117

tesseract::LanguageModel::language_model_ngram_scale_factor
double language_model_ngram_scale_factor
Definition: language_model.h:328

tesseract::LanguageModel::kSmallestRatingFlag
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:45

tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51

tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.

tesseract::LanguageModel::beginning_active_dawgs_
DawgPositionVector * beginning_active_dawgs_
Definition: language_model.h:396

tesseract::LMConsistencyInfo::BodyMaxXHeight
float BodyMaxXHeight() const
Definition: lm_consistency.h:111

UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682

tesseract::PTRAIN_XHEIGHT_CONSISTENCY
Definition: params_training_featdef.h:64

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

tesseract::LanguageModelDawgInfo
Definition: lm_state.h:61

tesseract::ViterbiStateEntry::top_choice_flags
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:178

tesseract::LanguageModel::dawg_args_
DawgArgs * dawg_args_
Definition: language_model.h:356

WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732

tesseract::LMConsistencyInfo::invalid_punc
bool invalid_punc
Definition: lm_consistency.h:123

tesseract::LanguageModel::kMaxAvgNgramCost
static const float kMaxAvgNgramCost
Definition: language_model.h:53

lm_state.h

BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:79

tesseract::LanguageModel::language_model_penalty_non_dict_word
double language_model_penalty_non_dict_word
Definition: language_model.h:340

tesseract::LMConsistencyInfo::num_other
int num_other
Definition: lm_consistency.h:120

BlamerBundle::GuidedSegsearchStillGoing
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501

UnicityTable
Definition: fontinfo.h:28

BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316

tesseract::LanguageModelNgramInfo::ngram_and_classifier_cost
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:90

BLOB_CHOICE::fontinfo_id
inT16 fontinfo_id() const
Definition: ratngs.h:85

intproto.h

tesseract::LanguageModelNgramInfo::context
STRING context
Definition: lm_state.h:78

tesseract::PTRAIN_DOC_SHORT
Definition: params_training_featdef.h:49

BlamerBundle::set_best_choice_is_dict_and_top_choice
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135

tesseract::AssociateStats::full_wh_ratio_total
float full_wh_ratio_total
Definition: associate.h:57

UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611

tesseract::LanguageModel::rating_cert_scale_
float rating_cert_scale_
Definition: language_model.h:366

tesseract::PTRAIN_NUM_SHORT
Definition: params_training_featdef.h:45

tesseract::ViterbiStateEntry::HasAlnumChoice
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:145

tesseract::PTRAIN_FREQ_SHORT
Definition: params_training_featdef.h:57

name
name_table name
Definition: GlyphLessFont.c:308

tesseract::LanguageModel::FillConsistencyInfo
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1016

UNICHARSET::id_to_unichar
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107

tesseract::ViterbiStateEntry::associate_stats
AssociateStats associate_stats
Definition: lm_state.h:174

tesseract::DawgPosition
Definition: dawg.h:342

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

tesseract::LMConsistencyInfo::NumInconsistentChartype
int NumInconsistentChartype() const
Definition: lm_consistency.h:90

tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152

FREQ_DAWG_PERM
Definition: ratngs.h:252

tesseract::PTRAIN_NGRAM_COST_PER_CHAR
Definition: params_training_featdef.h:61

NUMBER_PERM
Definition: ratngs.h:247

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:343

tesseract::LMConsistencyInfo::xht_decision
XHeightConsistencyEnum xht_decision
Definition: lm_consistency.h:137

tesseract::LanguageModel::max_char_wh_ratio_
float max_char_wh_ratio_
Definition: language_model.h:385

tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:408

tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125

tesseract::LanguageModel::GetNextParentVSE
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:502

tesseract::LMConsistencyInfo::NumInconsistentCase
int NumInconsistentCase() const
Definition: lm_consistency.h:87

WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:224

INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301

tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:612

tesseract::LanguageModel::very_beginning_active_dawgs_
DawgPositionVector * very_beginning_active_dawgs_
Definition: language_model.h:395

tesseract::LanguageModelDawgInfo::active_dawgs
DawgPositionVector * active_dawgs
Definition: lm_state.h:68

tesseract::LanguageModel::GenerateTopChoiceInfo
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:771

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

tesseract::LanguageModel::kUpperCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:47

tesseract::LanguageModel::AddViterbiStateEntry
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:563

tesseract::ViterbiStateEntry::Compare
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:130

tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350

tesseract::DawgPosition::dawg_index
inT8 dawg_index
Definition: dawg.h:361

BLOB_CHOICE::fontinfo_id2
inT16 fontinfo_id2() const
Definition: ratngs.h:88

tesseract::LMConsistencyInfo::NumInconsistentSpaces
int NumInconsistentSpaces() const
Definition: lm_consistency.h:99

params.h

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

tesseract::LMConsistencyInfo::ComputeXheightConsistency
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
Definition: lm_consistency.cpp:29

UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631

tesseract::LanguageModel::correct_segmentation_explored_
bool correct_segmentation_explored_
Definition: language_model.h:410

UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477

tesseract::Dawg
Definition: dawg.h:118

WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427

BlamerBundle::MatrixPositionCorrect
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131

tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363

tesseract::LMConsistencyInfo::num_lower
int num_lower
Definition: lm_consistency.h:125

tesseract::LanguageModel::kLowerCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:46

tesseract::BestChoiceBundle::fixpt
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:231

tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable
int language_model_viterbi_list_max_num_prunable
Definition: language_model.h:315

tesseract::LanguageModelState::viterbi_state_entries_prunable_max_cost
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:213

tesseract::PTRAIN_SHAPE_COST_PER_CHAR
Definition: params_training_featdef.h:60

tesseract::DawgPosition::back_to_punc
bool back_to_punc
Definition: dawg.h:366

helpers.h

tesseract::LanguageModelState::viterbi_state_entries_length
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:215

GenericVector::empty
bool empty() const
Definition: genericvector.h:84

tesseract::BestChoiceBundle::beam
PointerVector< LanguageModelState > beam
Definition: lm_state.h:235

mixed
Definition: cluster.h:45

tesseract::ViterbiStateEntry::competing_vse
ViterbiStateEntry * competing_vse
Definition: lm_state.h:164

tesseract::AssociateUtils::ComputeOutlineLength
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:82

tesseract::AssociateStats::shape_cost
float shape_cost
Definition: associate.h:54

tesseract::LanguageModel::language_model_min_compound_length
int language_model_min_compound_length
Definition: language_model.h:335

WERD_CHOICE::set_blob_choice
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290

BlamerBundle::AddHypothesis
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154

UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:832

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

tesseract
Definition: baseapi.cpp:83

language_model.h

tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:96

tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:127

tesseract::LanguageModel::kXhtConsistentFlag
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:49

tesseract::LanguageModel::ComputeConsistencyAdjustment
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:127

UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449

NGRAM_PERM
Definition: ratngs.h:246

tesseract::LMPainPoints
Definition: lm_pain_points.h:53

BLOB_CHOICE::PosAndSizeAgree
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132

tesseract::LMConsistencyInfo::BodyMinXHeight
float BodyMinXHeight() const
Definition: lm_consistency.h:106

tesseract::LanguageModel::GenerateDawgInfo
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:787

tesseract::LMConsistencyInfo::InconsistentXHeight
int InconsistentXHeight() const
Definition: lm_consistency.h:102

tesseract::LanguageModelFlagsType
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37

tesseract::LanguageModel::InitForWord
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
Definition: language_model.cpp:138

tesseract::LanguageModel::AcceptablePath
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:301

TOP_CHOICE_PERM
Definition: ratngs.h:243

UNICHARSET
Definition: unicharset.h:139

tesseract::BestChoiceBundle::updated
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:229

tesseract::DawgPositionVector::clear
void clear()
Definition: dawg.h:381

tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return the i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406

double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310

tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:82

WERD_CHOICE::print
void print() const
Definition: ratngs.h:563

tesseract::ViterbiStateEntry::cost
float cost
Definition: lm_state.h:157

BlamerBundle
Definition: blamer.h:88

tesseract::LMConsistencyInfo::num_punc
int num_punc
Definition: lm_consistency.h:119

matrix.h

tesseract::LMConsistencyInfo::num_alphas
int num_alphas
Definition: lm_consistency.h:117

STRING
Definition: strngs.h:44

tesseract::ViterbiStateEntry::Print
void Print(const char *msg) const
Definition: lm_state.cpp:27

tesseract::LanguageModelState
Struct to store information maintained by various language model components.
Definition: lm_state.h:197

NULL
#define NULL
Definition: host.h:144

tesseract::DAWG_TYPE_WORD
Definition: dawg.h:73

tesseract::LanguageModel::ComputeAssociateStats
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:272

tesseract::Dawg::edge_letter
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

tesseract::Dict::set_hyphen_word
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49

tesseract::LanguageModel::ComputeNgramCost
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:935

tesseract::LanguageModelNgramInfo::ngram_cost
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:88

tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:618

WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:205

NODE_REF
inT64 NODE_REF
Definition: dawg.h:55

tesseract::LanguageModelState::viterbi_state_entries
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:210

tesseract::LMConsistencyInfo::num_digits
int num_digits
Definition: lm_consistency.h:118

tesseract::LanguageModel::ComputeDenom
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:995

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142

GenericVector< UNICHAR_ID >

tesseract::ViterbiStateEntry::min_certainty
float min_certainty
Definition: lm_state.h:169

UNICHARSET::size
int size() const
Definition: unicharset.h:297

MATRIX_COORD::col
int col
Definition: matrix.h:345

tesseract::LanguageModel::params_model_
ParamsModel params_model_
Definition: language_model.h:413

tesseract::PTRAIN_NUM_BAD_SPACING
Definition: params_training_featdef.h:66

STRING::string
const char * string() const
Definition: strngs.cpp:193

tesseract::ViterbiStateEntry::length
int length
Definition: lm_state.h:171

MATRIX_COORD::row
int row
Definition: matrix.h:346

BlamerBundle::correct_segmentation_length
int correct_segmentation_length() const
Definition: blamer.h:126

dawg.h

BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:82

tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116

tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step
bool language_model_ngram_use_only_first_uft8_step
Definition: language_model.h:325

tesseract::LanguageModel::ComputeAdjustedPathCost
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1198

BLOB_CHOICE::matrix_cell
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114

tesseract::PTRAIN_DICT_SHORT
Definition: params_training_featdef.h:53

tesseract::LanguageModel::LanguageModel
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
Definition: language_model.cpp:45

tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412

tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625

BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

tesseract::LanguageModel::UpdateBestChoice
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1240

tesseract::ParamsTrainingHypothesis::features
float features[PTRAIN_NUM_FEATURE_TYPES]
Definition: params_training_featdef.h:116

WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:366

tesseract::BestChoiceBundle
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:219

WERD_CHOICE::set_dangerous_ambig_found_
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:363

tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:83

tesseract::LanguageModel::language_model_penalty_increment
double language_model_penalty_increment
Definition: language_model.h:353

tesseract::LMConsistencyInfo::inconsistent_font
bool inconsistent_font
Definition: lm_consistency.h:129

tesseract::ViterbiStateEntry::ratings_sum
float ratings_sum
Definition: lm_state.h:168

tesseract::LMConsistencyInfo
Definition: lm_consistency.h:38