tesseract  5.0.0-alpha-619-ge9db
lm_state.h
Go to the documentation of this file.
1 // File: lm_state.h
3 // Description: Structures and functionality for capturing the state of
4 // segmentation search guided by the language model.
5 // Author: Rika Antonova
6 //
7 // (C) Copyright 2012, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
21 #define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
22 
23 #include "associate.h" // for AssociateStats
24 #include "dawg.h" // for DawgPositionVector
25 #include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
26 #include <tesseract/genericvector.h> // for PointerVector
27 #include "lm_consistency.h" // for LMConsistencyInfo
28 #include "ratngs.h" // for BLOB_CHOICE, PermuterType
29 #include "stopper.h" // for DANGERR
30 #include <tesseract/strngs.h> // for STRING
31 #include <tesseract/unichar.h> // for UNICHAR_ID
32 #include "unicharset.h" // for UNICHARSET
33 
34 namespace tesseract {
35 
37 using LanguageModelFlagsType = unsigned char;
38 
57 
63  : active_dawgs(*a), permuter(pt) {}
66 };
67 
71  LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
82  bool pruned;
84  float ngram_cost;
87 };
88 
91 struct ViterbiStateEntry : public ELIST_LINK {
93  BLOB_CHOICE *b, float c, float ol,
94  const LMConsistencyInfo &ci,
95  const AssociateStats &as,
99  const char *debug_uch)
100  : curr_b(b), parent_vse(pe), competing_vse(nullptr),
101  dawg_info(d), ngram_info(n),
102  cost(c),
103  ratings_sum(b->rating()),
104  min_certainty(b->certainty()),
105  adapted(b->IsAdapted()),
106  length(1),
107  outline_length(ol),
108  consistency_info(ci),
109  associate_stats(as),
110  top_choice_flags(tcf),
111  updated(true) {
112  debug_str = (debug_uch == nullptr) ? nullptr : new STRING();
113  if (pe != nullptr) {
114  ratings_sum += pe->ratings_sum;
115  if (pe->min_certainty < min_certainty) {
117  }
118  adapted += pe->adapted;
119  length += pe->length;
121  if (debug_uch != nullptr) *debug_str += *(pe->debug_str);
122  }
123  if (debug_str != nullptr && debug_uch != nullptr) *debug_str += debug_uch;
124  }
126  delete dawg_info;
127  delete ngram_info;
128  delete debug_str;
129  }
// qsort-style comparator over ViterbiStateEntry costs.
// e1 and e2 each point to a `const ViterbiStateEntry *` (pointer to pointer),
// so they are cast and dereferenced once before the costs are compared.
// Returns -1 when the first entry's cost is strictly lower, otherwise 1.
// NOTE(review): this never returns 0, so two entries with equal cost compare
// as "greater" in both argument orders; confirm callers rely on that before
// changing it.
132  static int Compare(const void *e1, const void *e2) {
133  const ViterbiStateEntry *ve1 =
134  *static_cast<const ViterbiStateEntry *const *>(e1);
135  const ViterbiStateEntry *ve2 =
136  *static_cast<const ViterbiStateEntry *const *>(e2);
137  return (ve1->cost < ve2->cost) ? -1 : 1;
138  }
// Reports whether the path ending at this entry is considered consistent.
// When dawg_info is set and consistency_info counts no case inconsistencies,
// the path is accepted immediately without consulting the remaining checks;
// otherwise the verdict is delegated to consistency_info.Consistent().
139  inline bool Consistent() const {
140  if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
141  return true;
142  }
143  return consistency_info.Consistent();
144  }
// Returns true when curr_b's unichar id is classified by `unicharset` as
// alphabetic or as a digit. Returns false when curr_b is nullptr or the
// unichar is neither.
147  bool HasAlnumChoice(const UNICHARSET& unicharset) {
148  if (curr_b == nullptr) return false;  // no blob choice attached to this entry
149  UNICHAR_ID unichar_id = curr_b->unichar_id();
150  if (unicharset.get_isalpha(unichar_id) ||
151  unicharset.get_isdigit(unichar_id))
152  return true;
153  return false;
154  }
// Declaration only; the body is defined out of line in lm_state.cpp.
// Presumably prints this entry for debugging with `msg` as a label - confirm
// against the definition.
155  void Print(const char *msg) const;
156 
163 
167 
171 
175 
178  float cost;
179 
182  float ratings_sum;
184  int adapted;
185  int length;
189 
193 
194  bool updated;
195 };
196 
198 
199 struct LanguageModelState {
206 
208  void Clear();
209 
210  void Print(const char *msg);
211 
213  ViterbiStateEntry_LIST viterbi_state_entries;
219 };
220 
// Builds a bundle whose beam holds one LanguageModelState per matrix column.
// matrix_dimension: number of LanguageModelState objects to allocate.
// Starts with updated == false and no best_vse selected.
223  explicit BestChoiceBundle(int matrix_dimension)
224  : updated(false), best_vse(nullptr) {
225  beam.reserve(matrix_dimension);  // size the vector once before filling it
226  for (int i = 0; i < matrix_dimension; ++i)
// Each state is heap-allocated and handed to beam; presumably the
// PointerVector deletes its elements on destruction - confirm in
// genericvector.h.
227  beam.push_back(new LanguageModelState);
228  }
230 
232  bool updated;
241 };
242 
243 } // namespace tesseract
244 
245 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
elst.h
tesseract::ViterbiStateEntry::top_choice_flags
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:192
strngs.h
tesseract::ViterbiStateEntry::dawg_info
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:166
tesseract::ViterbiStateEntry::Print
void Print(const char *msg) const
Definition: lm_state.cpp:26
tesseract::ViterbiStateEntry::length
int length
number of characters on the path
Definition: lm_state.h:185
tesseract::LanguageModelNgramInfo::pruned
bool pruned
Definition: lm_state.h:82
tesseract::ViterbiStateEntry::outline_length
float outline_length
length of the outline so far
Definition: lm_state.h:186
tesseract::ViterbiStateEntry::Consistent
bool Consistent() const
Definition: lm_state.h:139
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
tesseract::LanguageModelNgramInfo::context
STRING context
Definition: lm_state.h:74
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::ViterbiStateEntry
Definition: lm_state.h:91
tesseract::PointerVector
Definition: genericvector.h:417
STRING
Definition: strngs.h:45
tesseract::ViterbiStateEntry::cost
float cost
Definition: lm_state.h:178
tesseract::ViterbiStateEntry::~ViterbiStateEntry
~ViterbiStateEntry()
Definition: lm_state.h:125
tesseract::LanguageModelDawgInfo
Definition: lm_state.h:61
stopper.h
tesseract::LanguageModelNgramInfo::ngram_cost
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:84
tesseract::LMConsistencyInfo::Consistent
bool Consistent() const
Definition: lm_consistency.h:94
tesseract::ViterbiStateEntry::debug_str
STRING * debug_str
Definition: lm_state.h:174
tesseract::ViterbiStateEntry::competing_vse
ViterbiStateEntry * competing_vse
Definition: lm_state.h:162
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
ELISTIZEH
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:907
tesseract::BestChoiceBundle::best_vse
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:240
tesseract::AssociateStats
Definition: associate.h:36
ratngs.h
lm_consistency.h
tesseract::LanguageModelDawgInfo::LanguageModelDawgInfo
LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
Definition: lm_state.h:62
tesseract::ViterbiStateEntry::ViterbiStateEntry
ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol, const LMConsistencyInfo &ci, const AssociateStats &as, LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n, const char *debug_uch)
Definition: lm_state.h:92
genericvector.h
tesseract::BestChoiceBundle::~BestChoiceBundle
~BestChoiceBundle()
Definition: lm_state.h:229
tesseract::BestChoiceBundle::beam
PointerVector< LanguageModelState > beam
Definition: lm_state.h:238
tesseract::BestChoiceBundle::updated
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:232
tesseract::BestChoiceBundle
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:222
unicharset.h
dawg.h
tesseract::LanguageModelState::viterbi_state_entries_length
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:218
tesseract::ViterbiStateEntry::ngram_info
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:170
tesseract::LMConsistencyInfo::NumInconsistentCase
int NumInconsistentCase() const
Definition: lm_consistency.h:87
tesseract::LanguageModelState::Print
void Print(const char *msg)
Definition: lm_state.cpp:69
tesseract::BestChoiceBundle::BestChoiceBundle
BestChoiceBundle(int matrix_dimension)
Definition: lm_state.h:223
tesseract::LanguageModelState::LanguageModelState
LanguageModelState()
Definition: lm_state.h:201
tesseract::ViterbiStateEntry::associate_stats
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:188
UNICHARSET
Definition: unicharset.h:145
tesseract::ViterbiStateEntry::curr_b
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:158
tesseract::LanguageModelFlagsType
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
tesseract::LanguageModelNgramInfo::LanguageModelNgramInfo
LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
Definition: lm_state.h:71
tesseract::LanguageModelState::viterbi_state_entries_prunable_max_cost
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:216
tesseract
Definition: baseapi.h:65
tesseract::LanguageModelNgramInfo::ngram_and_classifier_cost
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:86
tesseract::LanguageModelState
Struct to store information maintained by various language model components.
Definition: lm_state.h:200
tesseract::BestChoiceBundle::fixpt
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:234
tesseract::ViterbiStateEntry::consistency_info
LMConsistencyInfo consistency_info
path consistency info
Definition: lm_state.h:187
tesseract::LanguageModelState::viterbi_state_entries
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:213
tesseract::ViterbiStateEntry::updated
bool updated
set to true if the entry has just been created/updated
Definition: lm_state.h:194
tesseract::ViterbiStateEntry::parent_vse
ViterbiStateEntry * parent_vse
Definition: lm_state.h:159
tesseract::LanguageModelState::Clear
void Clear()
Clears the viterbi search state back to its initial conditions.
Definition: lm_state.cpp:62
tesseract::DawgPositionVector
Definition: dawg.h:373
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::LanguageModelDawgInfo::active_dawgs
DawgPositionVector active_dawgs
Definition: lm_state.h:64
GenericVector< DANGERR_INFO >
tesseract::LanguageModelDawgInfo::permuter
PermuterType permuter
Definition: lm_state.h:65
tesseract::ViterbiStateEntry::Compare
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:132
tesseract::LanguageModelNgramInfo
Definition: lm_state.h:70
tesseract::ViterbiStateEntry::HasAlnumChoice
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:147
tesseract::ViterbiStateEntry::adapted
int adapted
number of BLOB_CHOICES from adapted templates
Definition: lm_state.h:184
tesseract::LMConsistencyInfo
Definition: lm_consistency.h:38
BLOB_CHOICE
Definition: ratngs.h:49
unichar.h
tesseract::ViterbiStateEntry::min_certainty
float min_certainty
minimum certainty on the path
Definition: lm_state.h:183
tesseract::LanguageModelNgramInfo::context_unichar_step_len
int context_unichar_step_len
Definition: lm_state.h:77
tesseract::ViterbiStateEntry::ratings_sum
float ratings_sum
sum of ratings of character on the path
Definition: lm_state.h:182
ELIST_LINK
Definition: elst.h:74
tesseract::LanguageModelState::~LanguageModelState
~LanguageModelState()
Definition: lm_state.h:205
associate.h
tesseract::LanguageModelState::viterbi_state_entries_prunable_length
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:215