tesseract  5.0.0-alpha-619-ge9db
language_model.h
Go to the documentation of this file.
1 // File: language_model.h
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help segmentation
5 // search.
6 // Author: Daria Antonova
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
22 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23 
24 #include <cmath> // for exp
25 #include "associate.h" // for AssociateStats (ptr only), AssociateUtils
26 #include "dawg.h" // for DawgPositionVector
27 #include "dict.h" // for DawgArgs, Dict
28 #include "lm_consistency.h" // for LMConsistencyInfo
29 #include "lm_state.h" // for ViterbiStateEntry, LanguageModelFlagsType
30 #include "params.h" // for DoubleParam, double_VAR_H, IntParam, Boo...
31 #include "params_model.h" // for ParamsModel
32 #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...
33 #include "stopper.h" // for DANGERR
34 #include <tesseract/strngs.h> // for STRING
35 
36 class UNICHARSET;
37 class WERD_RES;
38 
39 struct BlamerBundle;
40 
41 template <typename T> class UnicityTable;
42 
43 namespace tesseract {
44 
45 class LMPainPoints;
46 struct FontInfo;
47 
48 // This class that contains the data structures and functions necessary
49 // to represent and use the knowledge about the language.
51  public:
52  // Masks for keeping track of top choices that should not be pruned out.
56  static const LanguageModelFlagsType kDigitFlag = 0x8;
58 
59  // Denominator for normalizing per-letter ngram cost when deriving
60  // penalty adjustments.
61  static const float kMaxAvgNgramCost;
62 
63  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
65 
66  // Fills the given floats array with features extracted from path represented
67  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
68  // for feature information.
69  // Note: the function assumes that features points to an array of size
70  // PTRAIN_NUM_FEATURE_TYPES.
71  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
72  float features[]);
73 
74  // Updates data structures that are used for the duration of the segmentation
75  // search on the current word;
76  void InitForWord(const WERD_CHOICE *prev_word,
77  bool fixed_pitch, float max_char_wh_ratio,
78  float rating_cert_scale);
79 
80  // Updates language model state of the given BLOB_CHOICE_LIST (from
81  // the ratings matrix) a its parent. Updates pain_points if new
82  // problematic points are found in the segmentation graph.
83  //
84  // At most language_model_viterbi_list_size are kept in each
85  // LanguageModelState.viterbi_state_entries list.
86  // At most language_model_viterbi_list_max_num_prunable of those are prunable
87  // (non-dictionary) paths.
88  // The entries that represent dictionary word paths are kept at the front
89  // of the list.
90  // The list ordered by cost that is computed collectively by several
91  // language model components (currently dawg and ngram components).
92  bool UpdateState(
93  bool just_classified,
94  int curr_col, int curr_row,
95  BLOB_CHOICE_LIST *curr_list,
96  LanguageModelState *parent_node,
97  LMPainPoints *pain_points,
98  WERD_RES *word_res,
99  BestChoiceBundle *best_choice_bundle,
100  BlamerBundle *blamer_bundle);
101 
102  // Returns true if an acceptable best choice was discovered.
104  inline void SetAcceptableChoiceFound(bool val) {
106  }
107  // Returns the reference to ParamsModel.
109 
110  protected:
111 
112  inline float CertaintyScore(float cert) {
114  // cert is assumed to be between 0 and -dict_->certainty_scale.
115  // If you enable language_model_use_sigmoidal_certainty, you
116  // need to adjust language_model_ngram_nonmatch_score as well.
117  cert = -cert / dict_->certainty_scale;
118  return 1.0f / (1.0f + exp(10.0f * cert));
119  } else {
120  return (-1.0f / cert);
121  }
122  }
123 
124  inline float ComputeAdjustment(int num_problems, float penalty) {
125  if (num_problems == 0) return 0.0f;
126  if (num_problems == 1) return penalty;
127  return (penalty + (language_model_penalty_increment *
128  static_cast<float>(num_problems-1)));
129  }
130 
131  // Computes the adjustment to the ratings sum based on the given
132  // consistency_info. The paths with invalid punctuation, inconsistent
133  // case and character type are penalized proportionally to the number
134  // of inconsistencies on the path.
136  const LanguageModelDawgInfo *dawg_info,
137  const LMConsistencyInfo &consistency_info) {
138  if (dawg_info != nullptr) {
139  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
141  (consistency_info.inconsistent_script ?
143  }
144  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
146  ComputeAdjustment(consistency_info.NumInconsistentCase(),
148  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
150  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
152  (consistency_info.inconsistent_script ?
154  (consistency_info.inconsistent_font ?
156  }
157 
158  // Returns an adjusted ratings sum that includes inconsistency penalties,
159  // penalties for non-dictionary paths and paths with dips in ngram
160  // probability.
162 
163  // Finds the first lower and upper case letter and first digit in curr_list.
164  // Uses the first character in the list in place of empty results.
165  // Returns true if both alpha and digits are found.
166  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
167  BLOB_CHOICE **first_lower,
168  BLOB_CHOICE **first_upper,
169  BLOB_CHOICE **first_digit) const;
170  // Forces there to be at least one entry in the overall set of the
171  // viterbi_state_entries of each element of parent_node that has the
172  // top_choice_flag set for lower, upper and digit using the same rules as
173  // GetTopLowerUpperDigit, setting the flag on the first found suitable
174  // candidate, whether or not the flag is set on some other parent.
175  // Returns 1 if both alpha and digits are found among the parents, -1 if no
176  // parents are found at all (a legitimate case), and 0 otherwise.
177  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
178 
179  // Finds the next ViterbiStateEntry with which the given unichar_id can
180  // combine sensibly, taking into account any mixed alnum/mixed case
181  // situation, and whether this combination has been inspected before.
183  bool just_classified, bool mixed_alnum,
184  const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
185  const UNICHARSET& unicharset, WERD_RES* word_res,
186  ViterbiStateEntry_IT* vse_it,
187  LanguageModelFlagsType* top_choice_flags) const;
188  // Helper function that computes the cost of the path composed of the
189  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
190  // If the new path looks good enough, adds a new ViterbiStateEntry to the
191  // list of viterbi entries in the given BLOB_CHOICE and returns true.
193  LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
194  int curr_col, int curr_row, BLOB_CHOICE *b,
195  LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
196  LMPainPoints *pain_points, WERD_RES *word_res,
197  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
198 
199  // Determines whether a potential entry is a true top choice and
200  // updates changed accordingly.
201  //
202  // Note: The function assumes that b, top_choice_flags and changed
203  // are not nullptr.
205  const ViterbiStateEntry *parent_vse,
206  LanguageModelState *lms);
207 
208  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
209  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
210  // with updated active dawgs, constraints and permuter.
211  //
212  // Note: the caller is responsible for deleting the returned pointer.
214  int curr_col, int curr_row,
215  const BLOB_CHOICE &b,
216  const ViterbiStateEntry *parent_vse);
217 
218  // Computes p(unichar | parent context) and records it in ngram_cost.
219  // If b.unichar_id() is an unlikely continuation of the parent context
220  // sets found_small_prob to true and returns nullptr.
221  // Otherwise creates a new LanguageModelNgramInfo entry containing the
222  // updated context (that includes b.unichar_id() at the end) and returns it.
223  //
224  // Note: the caller is responsible for deleting the returned pointer.
226  const char *unichar, float certainty, float denom,
227  int curr_col, int curr_row, float outline_length,
228  const ViterbiStateEntry *parent_vse);
229 
230  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
231  // for the given unichar in the given context. If there are multiple
232  // unichars at one position - takes the average of their probabilities.
233  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
234  // since probability_in_context() can only handle one at a time (while
235  // unicharset might contain ngrams and glyphs composed from multiple UTF8
236  // characters).
237  float ComputeNgramCost(const char *unichar, float certainty, float denom,
238  const char *context, int *unichar_step_len,
239  bool *found_small_prob, float *ngram_prob);
240 
241  // Computes the normalization factors for the classifier confidences
242  // (used by ComputeNgramCost()).
243  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
244 
245  // Fills the given consistenty_info based on parent_vse.consistency_info
246  // and on the consistency of the given unichar_id with parent_vse.
247  void FillConsistencyInfo(
248  int curr_col, bool word_end, BLOB_CHOICE *b,
249  ViterbiStateEntry *parent_vse,
250  WERD_RES *word_res,
251  LMConsistencyInfo *consistency_info);
252 
253  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
254  // on the path represented by the given BLOB_CHOICE and language model
255  // state entries (lmse, dse). The path is re-constructed by following
256  // the parent pointers in the the lang model state entries). If the
257  // constructed WERD_CHOICE is better than the best/raw choice recorded
258  // in the best_choice_bundle, this function updates the corresponding
259  // fields and sets best_choice_bunldle->updated to true.
261  LMPainPoints *pain_points,
262  WERD_RES *word_res,
263  BestChoiceBundle *best_choice_bundle,
264  BlamerBundle *blamer_bundle);
265 
266  // Constructs a WERD_CHOICE by tracing parent pointers starting with
267  // the given LanguageModelStateEntry. Returns the constructed word.
268  // Updates best_char_choices, certainties and state if they are not
269  // nullptr (best_char_choices and certainties are assumed to have the
270  // length equal to lmse->length).
271  // The caller is responsible for freeing memory associated with the
272  // returned WERD_CHOICE.
274  WERD_RES *word_res,
275  DANGERR *fixpt,
276  BlamerBundle *blamer_bundle,
277  bool *truth_path);
278 
279  // Wrapper around AssociateUtils::ComputeStats().
280  inline void ComputeAssociateStats(int col, int row,
281  float max_char_wh_ratio,
282  ViterbiStateEntry *parent_vse,
283  WERD_RES *word_res,
284  AssociateStats *associate_stats) {
286  col, row,
287  (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
288  (parent_vse != nullptr) ? parent_vse->length : 0,
289  fixed_pitch_, max_char_wh_ratio,
290  word_res, language_model_debug_level > 2, associate_stats);
291  }
292 
293  // Returns true if the path with such top_choice_flags and dawg_info
294  // could be pruned out (i.e. is neither a system/user/frequent dictionary
295  // nor a top choice path).
296  // In non-space delimited languages all paths can be "somewhat" dictionary
297  // words. In such languages we can not do dictionary-driven path pruning,
298  // so paths with non-empty dawg_info are considered prunable.
299  inline bool PrunablePath(const ViterbiStateEntry &vse) {
300  if (vse.top_choice_flags) return false;
301  if (vse.dawg_info != nullptr &&
304  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
305  return true;
306  }
307 
308  // Returns true if the given ViterbiStateEntry represents an acceptable path.
309  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
310  return (vse.dawg_info != nullptr || vse.Consistent() ||
311  (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
312  }
313 
314  public:
315  // Parameters.
316  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
318  "Turn on/off the use of character ngram model");
320  "Maximum order of the character ngram model");
322  "Maximum number of prunable (those for which PrunablePath() is"
323  " true) entries in each viterbi list recorded in BLOB_CHOICEs");
325  "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
327  "To avoid overly small denominators use this as the floor"
328  " of the probability returned by the ngram model");
330  "Average classifier score of a non-matching unichar");
332  "Use only the first UTF8 step of the given string"
333  " when computing log probabilities");
335  "Strength of the character ngram model relative to the"
336  " character classifier ");
338  "Factor to bring log-probs into the same range as ratings"
339  " when multiplied by outline length ");
341  "Words are delimited by space");
343  "Minimum length of compound words");
344  // Penalties used for adjusting path costs and final word rating.
346  "Penalty for words not in the frequent word dictionary");
348  "Penalty for non-dictionary words");
350  "Penalty for inconsistent punctuation");
352  "Penalty for inconsistent case");
354  "Penalty for inconsistent script");
356  "Penalty for inconsistent character type");
358  "Penalty for inconsistent font");
360  "Penalty for inconsistent spacing");
361  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
362  INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
364  "Use sigmoidal score for certainty");
365 
366 
367  protected:
368  // Member Variables.
369 
370  // Temporary DawgArgs struct that is re-used across different words to
371  // avoid dynamic memory re-allocation (should be cleared before each use).
373  // Scaling for recovering blob outline length from rating and certainty.
374  float rating_cert_scale_ = 0.0f;
375 
376  // The following variables are set at construction time.
377 
378  // Pointer to fontinfo table (not owned by LanguageModel).
380 
381  // Pointer to Dict class, that is used for querying the dictionaries
382  // (the pointer is not owned by LanguageModel).
383  Dict* dict_ = nullptr;
384 
385  // TODO(daria): the following variables should become LanguageModel params
386  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
387  //
388  // Set to true if we are dealing with fixed pitch text
389  // (set to assume_fixed_pitch_char_segment).
390  bool fixed_pitch_ = false;
391  // Max char width-to-height ratio allowed
392  // (set to segsearch_max_char_wh_ratio).
393  float max_char_wh_ratio_ = 0.0f;
394 
395  // The following variables are initialized with InitForWord().
396 
397  // String representation of the classification of the previous word
398  // (since this is only used by the character ngram model component,
399  // only the last language_model_ngram_order of the word are stored).
402  // Active dawg vector.
405  // Set to true if acceptable choice was discovered.
406  // Note: it would be nice to use this to terminate the search once an
407  // acceptable choices is found. However we do not do that and once an
408  // acceptable choice is found we finish looking for alternative choices
409  // in the current segmentation graph and then exit the search (no more
410  // classifications are done after an acceptable choice is found).
411  // This is needed in order to let the search find the words very close to
412  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
413  // choices. This way the stopper will know that the best choice is not
414  // ambiguous (i.e. there are best choices in the best choice list that have
415  // ratings close to the very best one) and will be less likely to mis-adapt.
417  // Set to true if a choice representing correct segmentation was explored.
419 
420  // Params models containing weights for for computing ViterbiStateEntry costs.
422 };
423 
424 } // namespace tesseract
425 
426 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
tesseract::LanguageModel::prev_word_str_
STRING prev_word_str_
Definition: language_model.h:400
tesseract::LanguageModel::AddViterbiStateEntry
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:561
tesseract::ViterbiStateEntry::top_choice_flags
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:192
strngs.h
tesseract::ViterbiStateEntry::dawg_info
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:166
tesseract::LanguageModel::getParamsModel
ParamsModel & getParamsModel()
Definition: language_model.h:108
tesseract::LanguageModel::AcceptableChoiceFound
bool AcceptableChoiceFound()
Definition: language_model.h:103
tesseract::ViterbiStateEntry::length
int length
number of characters on the path
Definition: lm_state.h:185
tesseract::LanguageModelNgramInfo::pruned
bool pruned
Definition: lm_state.h:82
tesseract::LanguageModel::language_model_ngram_on
bool language_model_ngram_on
Definition: language_model.h:318
dict.h
tesseract::ViterbiStateEntry::Consistent
bool Consistent() const
Definition: lm_state.h:139
tesseract::LanguageModel::wordrec_display_segmentations
int wordrec_display_segmentations
Definition: language_model.h:362
tesseract::LanguageModel::params_model_
ParamsModel params_model_
Definition: language_model.h:421
tesseract::LanguageModel::language_model_penalty_non_dict_word
double language_model_penalty_non_dict_word
Definition: language_model.h:348
WERD_CHOICE
Definition: ratngs.h:261
tesseract::LanguageModel::correct_segmentation_explored_
bool correct_segmentation_explored_
Definition: language_model.h:418
tesseract::LanguageModel::language_model_penalty_script
double language_model_penalty_script
Definition: language_model.h:354
tesseract::LanguageModel::kXhtConsistentFlag
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:57
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::LanguageModel::max_char_wh_ratio_
float max_char_wh_ratio_
Definition: language_model.h:393
params.h
tesseract::ViterbiStateEntry
Definition: lm_state.h:91
tesseract::LMConsistencyInfo::NumInconsistentSpaces
int NumInconsistentSpaces() const
Definition: lm_consistency.h:99
tesseract::LanguageModel::language_model_penalty_spacing
double language_model_penalty_spacing
Definition: language_model.h:360
tesseract::LMConsistencyInfo::inconsistent_script
bool inconsistent_script
Definition: lm_consistency.h:136
tesseract::LanguageModel::language_model_penalty_non_freq_dict_word
double language_model_penalty_non_freq_dict_word
Definition: language_model.h:346
tesseract::LanguageModel::GenerateNgramInfo
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:877
tesseract::LanguageModel::very_beginning_active_dawgs_
DawgPositionVector very_beginning_active_dawgs_
Definition: language_model.h:403
tesseract::LanguageModel::dict_
Dict * dict_
Definition: language_model.h:383
STRING
Definition: strngs.h:45
tesseract::LanguageModel::~LanguageModel
~LanguageModel()
Definition: language_model.cpp:134
tesseract::LanguageModel::ComputeAdjustedPathCost
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1199
WERD_RES
Definition: pageres.h:160
tesseract::LanguageModelDawgInfo
Definition: lm_state.h:61
tesseract::DawgArgs
Definition: dict.h:80
tesseract::LanguageModel::SetAcceptableChoiceFound
void SetAcceptableChoiceFound(bool val)
Definition: language_model.h:104
stopper.h
tesseract::LanguageModel
Definition: language_model.h:50
tesseract::LanguageModel::dawg_args_
DawgArgs dawg_args_
Definition: language_model.h:364
double_VAR_H
#define double_VAR_H(name, val, comment)
Definition: params.h:298
tesseract::AssociateStats
Definition: associate.h:36
ratngs.h
lm_consistency.h
tesseract::LanguageModel::UpdateState
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:253
tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step
bool language_model_ngram_use_only_first_uft8_step
Definition: language_model.h:333
tesseract::LanguageModel::language_model_ngram_small_prob
double language_model_ngram_small_prob
Definition: language_model.h:328
tesseract::LanguageModel::kMaxAvgNgramCost
static const float kMaxAvgNgramCost
Definition: language_model.h:61
tesseract::LanguageModel::LanguageModel
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
Definition: language_model.cpp:53
tesseract::LanguageModel::language_model_ngram_rating_factor
double language_model_ngram_rating_factor
Definition: language_model.h:339
tesseract::LanguageModel::rating_cert_scale_
float rating_cert_scale_
Definition: language_model.h:374
tesseract::BestChoiceBundle
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:222
dawg.h
tesseract::LanguageModel::language_model_ngram_space_delimited_language
bool language_model_ngram_space_delimited_language
Definition: language_model.h:341
tesseract::ViterbiStateEntry::ngram_info
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:170
tesseract::LMConsistencyInfo::NumInconsistentCase
int NumInconsistentCase() const
Definition: lm_consistency.h:87
tesseract::LanguageModel::GetTopLowerUpperDigit
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:383
tesseract::ViterbiStateEntry::associate_stats
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:188
tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:627
UNICHARSET
Definition: unicharset.h:145
tesseract::LMPainPoints
Definition: lm_pain_points.h:56
tesseract::LanguageModel::language_model_penalty_case
double language_model_penalty_case
Definition: language_model.h:352
tesseract::LanguageModelFlagsType
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
tesseract::ParamsModel
Definition: params_model.h:31
tesseract::LanguageModel::ComputeAdjustment
float ComputeAdjustment(int num_problems, float penalty)
Definition: language_model.h:124
tesseract
Definition: baseapi.h:65
tesseract::LanguageModelState
Struct to store information maintained by various language model components.
Definition: lm_state.h:200
tesseract::LanguageModel::language_model_ngram_order
int language_model_ngram_order
Definition: language_model.h:320
tesseract::LanguageModel::ComputeDenom
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:994
INT_VAR_H
#define INT_VAR_H(name, val, comment)
Definition: params.h:292
tesseract::LanguageModel::GenerateDawgInfo
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:785
tesseract::LanguageModel::fixed_pitch_
bool fixed_pitch_
Definition: language_model.h:390
tesseract::LanguageModel::kUpperCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:55
tesseract::LanguageModel::prev_word_unichar_step_len_
int prev_word_unichar_step_len_
Definition: language_model.h:401
tesseract::DawgPositionVector
Definition: dawg.h:373
tesseract::LanguageModel::acceptable_choice_found_
bool acceptable_choice_found_
Definition: language_model.h:416
tesseract::LanguageModel::fontinfo_table_
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:379
GenericVector< DANGERR_INFO >
tesseract::LanguageModel::language_model_use_sigmoidal_certainty
bool language_model_use_sigmoidal_certainty
Definition: language_model.h:364
tesseract::LanguageModel::language_model_penalty_increment
double language_model_penalty_increment
Definition: language_model.h:361
tesseract::LanguageModelDawgInfo::permuter
PermuterType permuter
Definition: lm_state.h:65
tesseract::LanguageModel::ComputeConsistencyAdjustment
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:135
UnicityTable
Definition: fontinfo.h:30
tesseract::LanguageModel::GenerateTopChoiceInfo
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:769
tesseract::LanguageModelNgramInfo
Definition: lm_state.h:70
tesseract::Dict
Definition: dict.h:91
tesseract::LanguageModel::GetNextParentVSE
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:500
tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable
int language_model_viterbi_list_max_num_prunable
Definition: language_model.h:323
tesseract::LMConsistencyInfo
Definition: lm_consistency.h:38
tesseract::LanguageModel::ExtractFeaturesFromPath
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1341
tesseract::LanguageModel::ConstructWord
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1390
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::LanguageModel::language_model_penalty_chartype
double language_model_penalty_chartype
Definition: language_model.h:356
tesseract::LanguageModel::CertaintyScore
float CertaintyScore(float cert)
Definition: language_model.h:112
BOOL_VAR_H
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:294
params_model.h
tesseract::LanguageModel::language_model_min_compound_length
int language_model_min_compound_length
Definition: language_model.h:343
tesseract::LanguageModel::language_model_viterbi_list_max_size
int language_model_viterbi_list_max_size
Definition: language_model.h:325
tesseract::LanguageModel::kLowerCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:54
tesseract::LanguageModel::language_model_ngram_nonmatch_score
double language_model_ngram_nonmatch_score
Definition: language_model.h:330
tesseract::LanguageModel::FillConsistencyInfo
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1015
tesseract::AssociateUtils::ComputeStats
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34
tesseract::LanguageModel::SetTopParentLowerUpperDigit
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:423
tesseract::LanguageModel::kSmallestRatingFlag
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:53
tesseract::LanguageModel::InitForWord
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
Definition: language_model.cpp:136
tesseract::LanguageModel::kDigitFlag
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:56
tesseract::LanguageModel::beginning_active_dawgs_
DawgPositionVector beginning_active_dawgs_
Definition: language_model.h:404
tesseract::LanguageModel::ComputeAssociateStats
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:280
tesseract::LanguageModel::language_model_penalty_font
double language_model_penalty_font
Definition: language_model.h:358
tesseract::LanguageModel::language_model_debug_level
int language_model_debug_level
Definition: language_model.h:316
tesseract::LanguageModel::ComputeNgramCost
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:934
associate.h
BlamerBundle
Definition: blamer.h:103
tesseract::LanguageModel::PrunablePath
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:299
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::LanguageModel::language_model_penalty_punc
double language_model_penalty_punc
Definition: language_model.h:350
USER_DAWG_PERM
Definition: ratngs.h:241
lm_state.h
tesseract::LanguageModel::AcceptablePath
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:309
tesseract::LMConsistencyInfo::NumInconsistentChartype
int NumInconsistentChartype() const
Definition: lm_consistency.h:90
tesseract::LanguageModel::UpdateBestChoice
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1241
tesseract::LMConsistencyInfo::inconsistent_font
bool inconsistent_font
Definition: lm_consistency.h:137
tesseract::LanguageModel::language_model_ngram_scale_factor
double language_model_ngram_scale_factor
Definition: language_model.h:336
tesseract::LMConsistencyInfo::NumInconsistentPunc
int NumInconsistentPunc() const
Definition: lm_consistency.h:84