tesseract  5.0.0-alpha-619-ge9db
language_model.cpp
Go to the documentation of this file.
1 // File: language_model.cpp
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help recognition.
5 // Author: Daria Antonova
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "language_model.h"
21 #include <cassert> // for assert
22 #include <cmath> // for log2, pow
23 #include "blamer.h" // for BlamerBundle
24 #include "ccutil.h" // for CCUtil
25 #include "dawg.h" // for NO_EDGE, Dawg, Dawg::kPatternUn...
26 #include "errcode.h" // for ASSERT_HOST
27 #include "lm_state.h" // for ViterbiStateEntry, ViterbiState...
28 #include "matrix.h" // for MATRIX_COORD
29 #include "pageres.h" // for WERD_RES
30 #include "params.h" // for IntParam, BoolParam, DoubleParam
31 #include "params_training_featdef.h" // for ParamsTrainingHypothesis, PTRAI...
32 #include "tprintf.h" // for tprintf
33 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
34 #include "unicharset.h" // for UNICHARSET
35 #include "unicity_table.h" // for UnicityTable
36 
37 template <typename T> class GenericVector;
38 template <typename T> class UnicityTable;
39 
40 namespace tesseract {
41 
42 class LMPainPoints;
43 struct FontInfo;
44 
#if defined(ANDROID)
// Fallback for Android toolchains whose <cmath> lacks log2():
// uses the identity log2(n) = ln(n) / ln(2).
static inline double log2(double n) {
 return log(n) / log(2.0);
}
#endif // ANDROID
50 
51 const float LanguageModel::kMaxAvgNgramCost = 25.0f;
52 
54  Dict *dict)
55  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
56  dict->getCCUtil()->params()),
57  BOOL_INIT_MEMBER(language_model_ngram_on, false,
58  "Turn on/off the use of character ngram model",
59  dict->getCCUtil()->params()),
60  INT_MEMBER(language_model_ngram_order, 8,
61  "Maximum order of the character ngram model",
62  dict->getCCUtil()->params()),
63  INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
64  "Maximum number of prunable (those for which"
65  " PrunablePath() is true) entries in each viterbi list"
66  " recorded in BLOB_CHOICEs",
67  dict->getCCUtil()->params()),
68  INT_MEMBER(language_model_viterbi_list_max_size, 500,
69  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
70  dict->getCCUtil()->params()),
71  double_MEMBER(language_model_ngram_small_prob, 0.000001,
72  "To avoid overly small denominators use this as the "
73  "floor of the probability returned by the ngram model.",
74  dict->getCCUtil()->params()),
75  double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
76  "Average classifier score of a non-matching unichar.",
77  dict->getCCUtil()->params()),
78  BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
79  "Use only the first UTF8 step of the given string"
80  " when computing log probabilities.",
81  dict->getCCUtil()->params()),
82  double_MEMBER(language_model_ngram_scale_factor, 0.03,
83  "Strength of the character ngram model relative to the"
84  " character classifier ",
85  dict->getCCUtil()->params()),
86  double_MEMBER(language_model_ngram_rating_factor, 16.0,
87  "Factor to bring log-probs into the same range as ratings"
88  " when multiplied by outline length ",
89  dict->getCCUtil()->params()),
90  BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
91  "Words are delimited by space", dict->getCCUtil()->params()),
92  INT_MEMBER(language_model_min_compound_length, 3,
93  "Minimum length of compound words",
94  dict->getCCUtil()->params()),
95  double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
96  "Penalty for words not in the frequent word dictionary",
97  dict->getCCUtil()->params()),
98  double_MEMBER(language_model_penalty_non_dict_word, 0.15,
99  "Penalty for non-dictionary words",
100  dict->getCCUtil()->params()),
101  double_MEMBER(language_model_penalty_punc, 0.2,
102  "Penalty for inconsistent punctuation",
103  dict->getCCUtil()->params()),
104  double_MEMBER(language_model_penalty_case, 0.1,
105  "Penalty for inconsistent case",
106  dict->getCCUtil()->params()),
107  double_MEMBER(language_model_penalty_script, 0.5,
108  "Penalty for inconsistent script",
109  dict->getCCUtil()->params()),
110  double_MEMBER(language_model_penalty_chartype, 0.3,
111  "Penalty for inconsistent character type",
112  dict->getCCUtil()->params()),
113  // TODO(daria, rays): enable font consistency checking
114  // after improving font analysis.
115  double_MEMBER(language_model_penalty_font, 0.00,
116  "Penalty for inconsistent font",
117  dict->getCCUtil()->params()),
118  double_MEMBER(language_model_penalty_spacing, 0.05,
119  "Penalty for inconsistent spacing",
120  dict->getCCUtil()->params()),
121  double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
122  dict->getCCUtil()->params()),
123  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
124  dict->getCCUtil()->params()),
125  BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
126  "Use sigmoidal score for certainty",
127  dict->getCCUtil()->params()),
128  dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
129  fontinfo_table_(fontinfo_table),
130  dict_(dict) {
131  ASSERT_HOST(dict_ != nullptr);
132 }
133 
135 
137  bool fixed_pitch, float max_char_wh_ratio,
138  float rating_cert_scale) {
139  fixed_pitch_ = fixed_pitch;
140  max_char_wh_ratio_ = max_char_wh_ratio;
141  rating_cert_scale_ = rating_cert_scale;
142  acceptable_choice_found_ = false;
144 
145  // Initialize vectors with beginning DawgInfos.
150 
151  // Fill prev_word_str_ with the last language_model_ngram_order
152  // unichars from prev_word.
154  if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
155  prev_word_str_ = prev_word->unichar_string();
157  } else {
158  prev_word_str_ = " ";
159  }
160  const char *str_ptr = prev_word_str_.c_str();
161  const char *str_end = str_ptr + prev_word_str_.length();
162  int step;
164  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
165  str_ptr += step;
167  }
168  ASSERT_HOST(str_ptr == str_end);
169  }
170 }
171 
176 static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
177  LanguageModelState* parent_node) {
178  if (parent_node == nullptr) return;
179  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
180  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
181  ViterbiStateEntry* vse = vit.data();
182  vse->competing_vse = nullptr;
183  UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
184  if (unicharset.get_isupper(unichar_id) ||
185  unicharset.get_islower(unichar_id)) {
186  UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
187  if (other_case == unichar_id) continue; // Not in unicharset.
188  // Find other case in same list. There could be multiple entries with
189  // the same unichar_id, but in theory, they should all point to the
190  // same BLOB_CHOICE, and that is what we will be using to decide
191  // which to keep.
192  ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
193  for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
194  vit2.data()->curr_b->unichar_id() != other_case;
195  vit2.forward()) {}
196  if (!vit2.cycled_list()) {
197  vse->competing_vse = vit2.data();
198  }
199  }
200  }
201 }
202 
207 static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
208  const BLOB_CHOICE* choice,
209  BLOB_CHOICE_LIST* choices) {
210  UNICHAR_ID choice_id = choice->unichar_id();
211  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
212  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
213  return false; // Not upper or lower or not in unicharset.
214  if (unicharset.SizesDistinct(choice_id, other_case))
215  return false; // Can be separated by size.
216  BLOB_CHOICE_IT bc_it(choices);
217  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
218  BLOB_CHOICE* better_choice = bc_it.data();
219  if (better_choice->unichar_id() == other_case)
220  return true; // Found an earlier instance of other_case.
221  else if (better_choice == choice)
222  return false; // Reached the original choice.
223  }
224  return false; // Should never happen, but just in case.
225 }
226 
254  bool just_classified,
255  int curr_col, int curr_row,
256  BLOB_CHOICE_LIST *curr_list,
257  LanguageModelState *parent_node,
258  LMPainPoints *pain_points,
259  WERD_RES *word_res,
260  BestChoiceBundle *best_choice_bundle,
261  BlamerBundle *blamer_bundle) {
262  if (language_model_debug_level > 0) {
263  tprintf("\nUpdateState: col=%d row=%d %s",
264  curr_col, curr_row, just_classified ? "just_classified" : "");
266  tprintf("(parent=%p)\n", parent_node);
267  else
268  tprintf("\n");
269  }
270  // Initialize helper variables.
271  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
272  bool new_changed = false;
273  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
274  const UNICHARSET& unicharset = dict_->getUnicharset();
275  BLOB_CHOICE *first_lower = nullptr;
276  BLOB_CHOICE *first_upper = nullptr;
277  BLOB_CHOICE *first_digit = nullptr;
278  bool has_alnum_mix = false;
279  if (parent_node != nullptr) {
280  int result = SetTopParentLowerUpperDigit(parent_node);
281  if (result < 0) {
283  tprintf("No parents found to process\n");
284  return false;
285  }
286  if (result > 0)
287  has_alnum_mix = true;
288  }
289  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
290  &first_digit))
291  has_alnum_mix = false;;
292  ScanParentsForCaseMix(unicharset, parent_node);
293  if (language_model_debug_level > 3 && parent_node != nullptr) {
294  parent_node->Print("Parent viterbi list");
295  }
296  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
297 
298  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
299  ViterbiStateEntry_IT vit;
300  BLOB_CHOICE_IT c_it(curr_list);
301  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
302  BLOB_CHOICE* choice = c_it.data();
303  // TODO(antonova): make sure commenting this out if ok for ngram
304  // model scoring (I think this was introduced to fix ngram model quirks).
305  // Skip nullptr unichars unless it is the only choice.
306  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
307  UNICHAR_ID unichar_id = choice->unichar_id();
308  if (unicharset.get_fragment(unichar_id)) {
309  continue; // Skip fragments.
310  }
311  // Set top choice flags.
312  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
313  if (c_it.at_first() || !new_changed)
314  blob_choice_flags |= kSmallestRatingFlag;
315  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
316  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
317  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
318 
319  if (parent_node == nullptr) {
320  // Process the beginning of a word.
321  // If there is a better case variant that is not distinguished by size,
322  // skip this blob choice, as we have no choice but to accept the result
323  // of the character classifier to distinguish between them, even if
324  // followed by an upper case.
325  // With words like iPoc, and other CamelBackWords, the lower-upper
326  // transition can only be achieved if the classifier has the correct case
327  // as the top choice, and leaving an initial I lower down the list
328  // increases the chances of choosing IPoc simply because it doesn't
329  // include such a transition. iPoc will beat iPOC and ipoc because
330  // the other words are baseline/x-height inconsistent.
331  if (HasBetterCaseVariant(unicharset, choice, curr_list))
332  continue;
333  // Upper counts as lower at the beginning of a word.
334  if (blob_choice_flags & kUpperCaseFlag)
335  blob_choice_flags |= kLowerCaseFlag;
336  new_changed |= AddViterbiStateEntry(
337  blob_choice_flags, denom, word_end, curr_col, curr_row,
338  choice, curr_state, nullptr, pain_points,
339  word_res, best_choice_bundle, blamer_bundle);
340  } else {
341  // Get viterbi entries from each parent ViterbiStateEntry.
342  vit.set_to_list(&parent_node->viterbi_state_entries);
343  int vit_counter = 0;
344  vit.mark_cycle_pt();
345  ViterbiStateEntry* parent_vse = nullptr;
346  LanguageModelFlagsType top_choice_flags;
347  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
348  c_it.data(), blob_choice_flags,
349  unicharset, word_res, &vit,
350  &top_choice_flags)) != nullptr) {
351  // Skip pruned entries and do not look at prunable entries if already
352  // examined language_model_viterbi_list_max_num_prunable of those.
353  if (PrunablePath(*parent_vse) &&
355  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
356  continue;
357  }
358  // If the parent has no alnum choice, (ie choice is the first in a
359  // string of alnum), and there is a better case variant that is not
360  // distinguished by size, skip this blob choice/parent, as with the
361  // initial blob treatment above.
362  if (!parent_vse->HasAlnumChoice(unicharset) &&
363  HasBetterCaseVariant(unicharset, choice, curr_list))
364  continue;
365  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
366  // looks good according to the Dawgs or character ngram model.
367  new_changed |= AddViterbiStateEntry(
368  top_choice_flags, denom, word_end, curr_col, curr_row,
369  c_it.data(), curr_state, parent_vse, pain_points,
370  word_res, best_choice_bundle, blamer_bundle);
371  }
372  }
373  }
374  return new_changed;
375 }
376 
383 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
384  BLOB_CHOICE **first_lower,
385  BLOB_CHOICE **first_upper,
386  BLOB_CHOICE **first_digit) const {
387  BLOB_CHOICE_IT c_it(curr_list);
388  const UNICHARSET &unicharset = dict_->getUnicharset();
389  BLOB_CHOICE *first_unichar = nullptr;
390  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
391  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
392  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
393  if (first_unichar == nullptr) first_unichar = c_it.data();
394  if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
395  *first_lower = c_it.data();
396  }
397  if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
398  !unicharset.get_islower(unichar_id)) {
399  *first_upper = c_it.data();
400  }
401  if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
402  *first_digit = c_it.data();
403  }
404  }
405  ASSERT_HOST(first_unichar != nullptr);
406  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
407  *first_digit != nullptr;
408  if (*first_lower == nullptr) *first_lower = first_unichar;
409  if (*first_upper == nullptr) *first_upper = first_unichar;
410  if (*first_digit == nullptr) *first_digit = first_unichar;
411  return mixed;
412 }
413 
424  LanguageModelState *parent_node) const {
425  if (parent_node == nullptr) return -1;
426  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
427  ViterbiStateEntry* top_lower = nullptr;
428  ViterbiStateEntry* top_upper = nullptr;
429  ViterbiStateEntry* top_digit = nullptr;
430  ViterbiStateEntry* top_choice = nullptr;
431  float lower_rating = 0.0f;
432  float upper_rating = 0.0f;
433  float digit_rating = 0.0f;
434  float top_rating = 0.0f;
435  const UNICHARSET &unicharset = dict_->getUnicharset();
436  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
437  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
438  ViterbiStateEntry* vse = vit.data();
439  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
440  // back to the real character if needed.
441  ViterbiStateEntry* unichar_vse = vse;
442  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
443  float rating = unichar_vse->curr_b->rating();
444  while (unichar_id == INVALID_UNICHAR_ID &&
445  unichar_vse->parent_vse != nullptr) {
446  unichar_vse = unichar_vse->parent_vse;
447  unichar_id = unichar_vse->curr_b->unichar_id();
448  rating = unichar_vse->curr_b->rating();
449  }
450  if (unichar_id != INVALID_UNICHAR_ID) {
451  if (unicharset.get_islower(unichar_id)) {
452  if (top_lower == nullptr || lower_rating > rating) {
453  top_lower = vse;
454  lower_rating = rating;
455  }
456  } else if (unicharset.get_isalpha(unichar_id)) {
457  if (top_upper == nullptr || upper_rating > rating) {
458  top_upper = vse;
459  upper_rating = rating;
460  }
461  } else if (unicharset.get_isdigit(unichar_id)) {
462  if (top_digit == nullptr || digit_rating > rating) {
463  top_digit = vse;
464  digit_rating = rating;
465  }
466  }
467  }
468  if (top_choice == nullptr || top_rating > rating) {
469  top_choice = vse;
470  top_rating = rating;
471  top_id = unichar_id;
472  }
473  }
474  if (top_choice == nullptr) return -1;
475  bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
476  top_digit != nullptr;
477  if (top_lower == nullptr) top_lower = top_choice;
478  top_lower->top_choice_flags |= kLowerCaseFlag;
479  if (top_upper == nullptr) top_upper = top_choice;
480  top_upper->top_choice_flags |= kUpperCaseFlag;
481  if (top_digit == nullptr) top_digit = top_choice;
482  top_digit->top_choice_flags |= kDigitFlag;
483  top_choice->top_choice_flags |= kSmallestRatingFlag;
484  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
485  (top_choice->top_choice_flags &
487  // If the compound marker top choice carries any of the top alnum flags,
488  // then give it all of them, allowing words like I-295 to be chosen.
489  top_choice->top_choice_flags |=
491  }
492  return mixed ? 1 : 0;
493 }
494 
501  bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
502  LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
503  WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
504  LanguageModelFlagsType* top_choice_flags) const {
505  for (; !vse_it->cycled_list(); vse_it->forward()) {
506  ViterbiStateEntry* parent_vse = vse_it->data();
507  // Only consider the parent if it has been updated or
508  // if the current ratings cell has just been classified.
509  if (!just_classified && !parent_vse->updated) continue;
511  parent_vse->Print("Considering");
512  // If the parent is non-alnum, then upper counts as lower.
513  *top_choice_flags = blob_choice_flags;
514  if ((blob_choice_flags & kUpperCaseFlag) &&
515  !parent_vse->HasAlnumChoice(unicharset)) {
516  *top_choice_flags |= kLowerCaseFlag;
517  }
518  *top_choice_flags &= parent_vse->top_choice_flags;
519  UNICHAR_ID unichar_id = bc->unichar_id();
520  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
521  UNICHAR_ID parent_id = parent_b->unichar_id();
522  // Digits do not bind to alphas if there is a mix in both parent and current
523  // or if the alpha is not the top choice.
524  if (unicharset.get_isdigit(unichar_id) &&
525  unicharset.get_isalpha(parent_id) &&
526  (mixed_alnum || *top_choice_flags == 0))
527  continue; // Digits don't bind to alphas.
528  // Likewise alphas do not bind to digits if there is a mix in both or if
529  // the digit is not the top choice.
530  if (unicharset.get_isalpha(unichar_id) &&
531  unicharset.get_isdigit(parent_id) &&
532  (mixed_alnum || *top_choice_flags == 0))
533  continue; // Alphas don't bind to digits.
534  // If there is a case mix of the same alpha in the parent list, then
535  // competing_vse is non-null and will be used to determine whether
536  // or not to bind the current blob choice.
537  if (parent_vse->competing_vse != nullptr) {
538  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
539  UNICHAR_ID other_id = competing_b->unichar_id();
540  if (language_model_debug_level >= 5) {
541  tprintf("Parent %s has competition %s\n",
542  unicharset.id_to_unichar(parent_id),
543  unicharset.id_to_unichar(other_id));
544  }
545  if (unicharset.SizesDistinct(parent_id, other_id)) {
546  // If other_id matches bc wrt position and size, and parent_id, doesn't,
547  // don't bind to the current parent.
548  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
550  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
552  continue; // Competing blobchoice has a better vertical match.
553  }
554  }
555  vse_it->forward();
556  return parent_vse; // This one is good!
557  }
558  return nullptr; // Ran out of possibilities.
559 }
560 
562  LanguageModelFlagsType top_choice_flags,
563  float denom,
564  bool word_end,
565  int curr_col, int curr_row,
566  BLOB_CHOICE *b,
567  LanguageModelState *curr_state,
568  ViterbiStateEntry *parent_vse,
569  LMPainPoints *pain_points,
570  WERD_RES *word_res,
571  BestChoiceBundle *best_choice_bundle,
572  BlamerBundle *blamer_bundle) {
573  ViterbiStateEntry_IT vit;
574  if (language_model_debug_level > 1) {
575  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
576  " certainty=%.4f top_choice_flags=0x%x",
578  b->rating(), b->certainty(), top_choice_flags);
580  tprintf(" parent_vse=%p\n", parent_vse);
581  else
582  tprintf("\n");
583  }
584  ASSERT_HOST(curr_state != nullptr);
585  // Check whether the list is full.
586  if (curr_state->viterbi_state_entries_length >=
588  if (language_model_debug_level > 1) {
589  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
590  }
591  return false;
592  }
593 
594  // Invoke Dawg language model component.
595  LanguageModelDawgInfo *dawg_info =
596  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
597 
598  float outline_length =
600  // Invoke Ngram language model component.
601  LanguageModelNgramInfo *ngram_info = nullptr;
603  ngram_info = GenerateNgramInfo(
605  denom, curr_col, curr_row, outline_length, parent_vse);
606  ASSERT_HOST(ngram_info != nullptr);
607  }
608  bool liked_by_language_model = dawg_info != nullptr ||
609  (ngram_info != nullptr && !ngram_info->pruned);
610  // Quick escape if not liked by the language model, can't be consistent
611  // xheight, and not top choice.
612  if (!liked_by_language_model && top_choice_flags == 0) {
613  if (language_model_debug_level > 1) {
614  tprintf("Language model components very early pruned this entry\n");
615  }
616  delete ngram_info;
617  delete dawg_info;
618  return false;
619  }
620 
621  // Check consistency of the path and set the relevant consistency_info.
622  LMConsistencyInfo consistency_info(
623  parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
624  // Start with just the x-height consistency, as it provides significant
625  // pruning opportunity.
626  consistency_info.ComputeXheightConsistency(
628  // Turn off xheight consistent flag if not consistent.
629  if (consistency_info.InconsistentXHeight()) {
630  top_choice_flags &= ~kXhtConsistentFlag;
631  }
632 
633  // Quick escape if not liked by the language model, not consistent xheight,
634  // and not top choice.
635  if (!liked_by_language_model && top_choice_flags == 0) {
636  if (language_model_debug_level > 1) {
637  tprintf("Language model components early pruned this entry\n");
638  }
639  delete ngram_info;
640  delete dawg_info;
641  return false;
642  }
643 
644  // Compute the rest of the consistency info.
645  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
646  word_res, &consistency_info);
647  if (dawg_info != nullptr && consistency_info.invalid_punc) {
648  consistency_info.invalid_punc = false; // do not penalize dict words
649  }
650 
651  // Compute cost of associating the blobs that represent the current unichar.
652  AssociateStats associate_stats;
653  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
654  parent_vse, word_res, &associate_stats);
655  if (parent_vse != nullptr) {
656  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
657  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
658  }
659 
660  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
661  auto *new_vse = new ViterbiStateEntry(
662  parent_vse, b, 0.0, outline_length,
663  consistency_info, associate_stats, top_choice_flags, dawg_info,
664  ngram_info, (language_model_debug_level > 0) ?
665  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
666  new_vse->cost = ComputeAdjustedPathCost(new_vse);
668  tprintf("Adjusted cost = %g\n", new_vse->cost);
669 
670  // Invoke Top Choice language model component to make the final adjustments
671  // to new_vse->top_choice_flags.
672  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
673  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
674  }
675 
676  // If language model components did not like this unichar - return.
677  bool keep = new_vse->top_choice_flags || liked_by_language_model;
678  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
679  consistency_info.inconsistent_script) { // with inconsistent script
680  keep = false;
681  }
682  if (!keep) {
683  if (language_model_debug_level > 1) {
684  tprintf("Language model components did not like this entry\n");
685  }
686  delete new_vse;
687  return false;
688  }
689 
690  // Discard this entry if it represents a prunable path and
691  // language_model_viterbi_list_max_num_prunable such entries with a lower
692  // cost have already been recorded.
693  if (PrunablePath(*new_vse) &&
696  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
697  if (language_model_debug_level > 1) {
698  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
699  new_vse->cost,
701  }
702  delete new_vse;
703  return false;
704  }
705 
706  // Update best choice if needed.
707  if (word_end) {
708  UpdateBestChoice(new_vse, pain_points, word_res,
709  best_choice_bundle, blamer_bundle);
710  // Discard the entry if UpdateBestChoice() found flaws in it.
711  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
712  new_vse != best_choice_bundle->best_vse) {
713  if (language_model_debug_level > 1) {
714  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
715  }
716  delete new_vse;
717  return false;
718  }
719  }
720 
721  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
722  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
723  false, new_vse);
724  curr_state->viterbi_state_entries_length++;
725  if (PrunablePath(*new_vse)) {
727  }
728 
729  // Update lms->viterbi_state_entries_prunable_max_cost and clear
730  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
731  if ((curr_state->viterbi_state_entries_prunable_length >=
733  new_vse->top_choice_flags) {
734  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
735  int prunable_counter = language_model_viterbi_list_max_num_prunable;
736  vit.set_to_list(&(curr_state->viterbi_state_entries));
737  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
738  ViterbiStateEntry *curr_vse = vit.data();
739  // Clear the appropriate top choice flags of the entries in the
740  // list that have cost higher thank new_entry->cost
741  // (since they will not be top choices any more).
742  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
743  curr_vse->cost > new_vse->cost) {
744  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
745  }
746  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
747  // Update curr_state->viterbi_state_entries_prunable_max_cost.
748  if (prunable_counter == 0) {
749  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
750  if (language_model_debug_level > 1) {
751  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
753  }
754  prunable_counter = -1; // stop counting
755  }
756  }
757  }
758 
759  // Print the newly created ViterbiStateEntry.
760  if (language_model_debug_level > 2) {
761  new_vse->Print("New");
763  curr_state->Print("Updated viterbi list");
764  }
765 
766  return true;
767 }
768 
770  const ViterbiStateEntry *parent_vse,
771  LanguageModelState *lms) {
772  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
773  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
774  new_vse->cost >= vit.data()->cost; vit.forward()) {
775  // Clear the appropriate flags if the list already contains
776  // a top choice entry with a lower cost.
777  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
778  }
779  if (language_model_debug_level > 2) {
780  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
781  new_vse->top_choice_flags);
782  }
783 }
784 
786  bool word_end,
787  int curr_col, int curr_row,
788  const BLOB_CHOICE &b,
789  const ViterbiStateEntry *parent_vse) {
790  // Initialize active_dawgs from parent_vse if it is not nullptr.
791  // Otherwise use very_beginning_active_dawgs_.
792  if (parent_vse == nullptr) {
795  } else {
796  if (parent_vse->dawg_info == nullptr) return nullptr; // not a dict word path
798  dawg_args_.permuter = parent_vse->dawg_info->permuter;
799  }
800 
801  // Deal with hyphenated words.
802  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
803  b.unichar_id(), curr_col == 0)) {
804  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
806  }
807 
808  // Deal with compound words.
809  if (dict_->compound_marker(b.unichar_id()) &&
810  (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
811  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
812  // Do not allow compound operators at the beginning and end of the word.
813  // Do not allow more than one compound operator per word.
814  // Do not allow compounding of words with lengths shorter than
815  // language_model_min_compound_length
816  if (parent_vse == nullptr || word_end ||
819  return nullptr;
820 
821  int i;
822  // Check a that the path terminated before the current character is a word.
823  bool has_word_ending = false;
824  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
825  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
826  const Dawg *pdawg = pos.dawg_index < 0
827  ? nullptr : dict_->GetDawg(pos.dawg_index);
828  if (pdawg == nullptr || pos.back_to_punc) continue;;
829  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
830  pdawg->end_of_word(pos.dawg_ref)) {
831  has_word_ending = true;
832  break;
833  }
834  }
835  if (!has_word_ending) return nullptr;
836 
837  if (language_model_debug_level > 0) tprintf("Compound word found\n");
839  } // done dealing with compound words
840 
841  LanguageModelDawgInfo *dawg_info = nullptr;
842 
843  // Call LetterIsOkay().
844  // Use the normalized IDs so that all shapes of ' can be allowed in words
845  // like don't.
846  const GenericVector<UNICHAR_ID>& normed_ids =
848  DawgPositionVector tmp_active_dawgs;
849  for (int i = 0; i < normed_ids.size(); ++i) {
851  tprintf("Test Letter OK for unichar %d, normed %d\n",
852  b.unichar_id(), normed_ids[i]);
853  dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
854  word_end && i == normed_ids.size() - 1);
855  if (dawg_args_.permuter == NO_PERM) {
856  break;
857  } else if (i < normed_ids.size() - 1) {
858  tmp_active_dawgs = *dawg_args_.updated_dawgs;
859  dawg_args_.active_dawgs = &tmp_active_dawgs;
860  }
862  tprintf("Letter was OK for unichar %d, normed %d\n",
863  b.unichar_id(), normed_ids[i]);
864  }
865  dawg_args_.active_dawgs = nullptr;
866  if (dawg_args_.permuter != NO_PERM) {
869  } else if (language_model_debug_level > 3) {
870  tprintf("Letter %s not OK!\n",
872  }
873 
874  return dawg_info;
875 }
876 
// NOTE(review): Doxygen-extracted listing; the original source line numbers
// are embedded at the start of each line, and the signature line (877,
// "LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(") plus lines
// 890, 908 and 914 are missing from this extraction — confirm against the
// real language_model.cpp before editing code.
//
// Builds the LanguageModelNgramInfo for appending `unichar` to the path
// ending at parent_vse: computes the combined ngram + classifier cost via
// ComputeNgramCost(), accumulates the parent's costs, trims the stored
// context to the ngram order, and propagates the parent's pruned flag.
878  const char *unichar, float certainty, float denom,
879  int curr_col, int curr_row, float outline_length,
880  const ViterbiStateEntry *parent_vse) {
881  // Initialize parent context.
// With no parent, the context is the previously recognized word; otherwise
// it is the context accumulated along the parent Viterbi path.
882  const char *pcontext_ptr = "";
883  int pcontext_unichar_step_len = 0;
884  if (parent_vse == nullptr) {
885  pcontext_ptr = prev_word_str_.c_str();
886  pcontext_unichar_step_len = prev_word_unichar_step_len_;
887  } else {
888  pcontext_ptr = parent_vse->ngram_info->context.c_str();
889  pcontext_unichar_step_len =
// NOTE(review): source line 890 missing here — presumably
// parent_vse->ngram_info->context_unichar_step_len; confirm.
891  }
892  // Compute p(unichar | parent context).
893  int unichar_step_len = 0;
894  bool pruned = false;
895  float ngram_cost;
896  float ngram_and_classifier_cost =
897  ComputeNgramCost(unichar, certainty, denom,
898  pcontext_ptr, &unichar_step_len,
899  &pruned, &ngram_cost);
900  // Normalize just the ngram_and_classifier_cost by outline_length.
901  // The ngram_cost is used by the params_model, so it needs to be left as-is,
902  // and the params model cost will be normalized by outline_length.
903  ngram_and_classifier_cost *=
904  outline_length / language_model_ngram_rating_factor;
905  // Add the ngram_cost of the parent.
906  if (parent_vse != nullptr) {
907  ngram_and_classifier_cost +=
// NOTE(review): source line 908 missing here — presumably
// parent_vse->ngram_info->ngram_and_classifier_cost; confirm.
909  ngram_cost += parent_vse->ngram_info->ngram_cost;
910  }
911 
912  // Shorten parent context string by unichar_step_len unichars.
913  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
// NOTE(review): source line 914 missing here — presumably
// language_model_ngram_order); confirm (the assert below bounds the
// context length by language_model_ngram_order).
915  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
916  while (num_remove > 0 && *pcontext_ptr != '\0') {
917  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
918  --num_remove;
919  }
920 
921  // Decide whether to prune this ngram path and update changed accordingly.
// A path stays pruned once any ancestor was pruned.
922  if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;
923 
924  // Construct and return the new LanguageModelNgramInfo.
// Ownership of the heap-allocated info is passed to the caller.
925  auto *ngram_info = new LanguageModelNgramInfo(
926  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
927  ngram_and_classifier_cost);
928  ngram_info->context += unichar;
929  ngram_info->context_unichar_step_len += unichar_step_len;
930  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
931  return ngram_info;
932 }
933 
// Computes the cost of appending `unichar` after `context`:
//   -log2(CertaintyScore(certainty)/denom) + ngram_cost * scale_factor,
// where ngram_cost = -log2 of the (averaged) dict_->ProbabilityInContext()
// over the UTF-8 steps of `unichar`. Outputs the number of UTF-8 steps in
// *unichar_step_len, sets *found_small_prob when the averaged probability
// falls below language_model_ngram_small_prob, and returns the combined
// ngram-and-classifier cost.
// NOTE(review): Doxygen extraction — source lines 956 and 979 are missing
// below; confirm against the real language_model.cpp before editing code.
934 float LanguageModel::ComputeNgramCost(const char *unichar,
935  float certainty,
936  float denom,
937  const char *context,
938  int *unichar_step_len,
939  bool *found_small_prob,
940  float *ngram_cost) {
941  const char *context_ptr = context;
// modified_context is lazily heap-allocated only when `unichar` spans
// multiple UTF-8 characters; freed at the end of this function.
942  char *modified_context = nullptr;
943  char *modified_context_end = nullptr;
944  const char *unichar_ptr = unichar;
945  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
946  float prob = 0.0f;
947  int step = 0;
// Walk `unichar` one UTF-8 character at a time, summing per-step
// probabilities (averaged after the loop).
948  while (unichar_ptr < unichar_end &&
949  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
950  if (language_model_debug_level > 1) {
951  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
952  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
953  }
954  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
955  ++(*unichar_step_len);
// NOTE(review): source line 956 missing here — presumably the early
// break for language_model_ngram_use_only_first_uft8_step; confirm.
957  unichar_ptr += step;
958  // If there are multiple UTF8 characters present in unichar, context is
959  // updated to include the previously examined characters from str,
960  // unless use_only_first_uft8_step is true.
961  if (unichar_ptr < unichar_end) {
962  if (modified_context == nullptr) {
963  size_t context_len = strlen(context);
964  modified_context =
965  new char[context_len + strlen(unichar_ptr) + step + 1];
966  memcpy(modified_context, context, context_len);
967  modified_context_end = modified_context + context_len;
968  context_ptr = modified_context;
969  }
970  strncpy(modified_context_end, unichar_ptr - step, step);
971  modified_context_end += step;
972  *modified_context_end = '\0';
973  }
974  }
975  prob /= static_cast<float>(*unichar_step_len); // normalize
976  if (prob < language_model_ngram_small_prob) {
977  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
978  *found_small_prob = true;
// NOTE(review): source line 979 missing here — presumably clamps prob to
// language_model_ngram_small_prob so log2 below stays finite; confirm.
980  }
981  *ngram_cost = -1.0*log2(prob);
982  float ngram_and_classifier_cost =
983  -1.0*log2(CertaintyScore(certainty)/denom) +
984  *ngram_cost * language_model_ngram_scale_factor;
985  if (language_model_debug_level > 1) {
986  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
987  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
988  ngram_and_classifier_cost);
989  }
990  delete[] modified_context;
991  return ngram_and_classifier_cost;
992 }
993 
// Computes the normalization denominator for ngram costs: the sum of
// CertaintyScore() over all choices in curr_list, plus a crude estimate for
// the unicharset classes not present in the list. Returns 1.0 for an empty
// list so callers can divide safely.
// NOTE(review): Doxygen extraction — source line 1010 (the per-missing-class
// factor multiplied into the sum below) is missing; confirm against the real
// language_model.cpp before editing code.
994 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
995  if (curr_list->empty()) return 1.0f;
996  float denom = 0.0f;
997  int len = 0;
998  BLOB_CHOICE_IT c_it(curr_list);
999  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1000  ASSERT_HOST(c_it.data() != nullptr);
1001  ++len;
1002  denom += CertaintyScore(c_it.data()->certainty());
1003  }
1004  assert(len != 0);
1005  // The ideal situation would be to have the classifier scores for
1006  // classifying each position as each of the characters in the unicharset.
1007  // Since we can not do this because of speed, we add a very crude estimate
1008  // of what these scores for the "missing" classifications would sum up to.
1009  denom += (dict_->getUnicharset().size() - len) *
1011 
1012  return denom;
1013 }
1014 
// NOTE(review): Doxygen extraction — the signature line (1015, presumably
// "void LanguageModel::FillConsistencyInfo(") and source lines 1047-1048 and
// 1082 are missing; confirm against the real language_model.cpp before
// editing code.
//
// Updates consistency_info for appending blob choice `b` to the path ending
// at parent_vse: punctuation-pattern validity (via the punctuation dawg),
// case counters, script consistency, character-type counters, and font /
// inter-blob spacing consistency.
1016  int curr_col,
1017  bool word_end,
1018  BLOB_CHOICE *b,
1019  ViterbiStateEntry *parent_vse,
1020  WERD_RES *word_res,
1021  LMConsistencyInfo *consistency_info) {
1022  const UNICHARSET &unicharset = dict_->getUnicharset();
1023  UNICHAR_ID unichar_id = b->unichar_id();
1024  BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
1025 
1026  // Check punctuation validity.
1027  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1028  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
1029  if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
1030  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1031  unicharset.get_isdigit(parent_b->unichar_id()))) {
1032  // reset punc_ref for compound words
1033  consistency_info->punc_ref = NO_EDGE;
1034  } else {
// Alphanumeric characters (and an apostrophe following one) are collapsed
// to the generic pattern unichar when walking the punctuation dawg.
1035  bool is_apos = dict_->is_apostrophe(unichar_id);
1036  bool prev_is_numalpha = (parent_b != nullptr &&
1037  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1038  unicharset.get_isdigit(parent_b->unichar_id())));
1039  UNICHAR_ID pattern_unichar_id =
1040  (unicharset.get_isalpha(unichar_id) ||
1041  unicharset.get_isdigit(unichar_id) ||
1042  (is_apos && prev_is_numalpha)) ?
1043  Dawg::kPatternUnicharID : unichar_id;
1044  if (consistency_info->punc_ref == NO_EDGE ||
1045  pattern_unichar_id != Dawg::kPatternUnicharID ||
1046  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
// NOTE(review): source lines 1047-1048 missing here — presumably the
// comparison against pattern_unichar_id and the node lookup feeding the
// edge_char_of() call below; confirm.
1049  consistency_info->punc_ref);
1050  consistency_info->punc_ref =
1051  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1052  node, pattern_unichar_id, word_end) : NO_EDGE;
1053  if (consistency_info->punc_ref == NO_EDGE) {
1054  consistency_info->invalid_punc = true;
1055  }
1056  }
1057  }
1058  }
1059 
1060  // Update case related counters.
1061  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
1062  // Reset counters if we are dealing with a compound word.
1063  consistency_info->num_lower = 0;
1064  consistency_info->num_non_first_upper = 0;
1065  }
1066  else if (unicharset.get_islower(unichar_id)) {
1067  consistency_info->num_lower++;
1068  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
// An upper-case char counts as "non-first upper" when it follows another
// upper-case char or appears after any lower-case/non-first-upper chars.
1069  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1070  consistency_info->num_lower > 0 ||
1071  consistency_info->num_non_first_upper > 0) {
1072  consistency_info->num_non_first_upper++;
1073  }
1074  }
1075 
1076  // Initialize consistency_info->script_id (use script of unichar_id
1077  // if it is not Common, use script id recorded by the parent otherwise).
1078  // Set inconsistent_script to true if the script of the current unichar
1079  // is not consistent with that of the parent.
1080  consistency_info->script_id = unicharset.get_script(unichar_id);
1081  // Hiragana and Katakana can mix with Han.
// NOTE(review): source line 1082 missing here — presumably the enclosing
// "if (... han_sid() != null_sid())" guard matching the extra closing
// brace after this block; confirm.
1083  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1084  consistency_info->script_id == unicharset.hiragana_sid()) ||
1085  (unicharset.katakana_sid() != unicharset.null_sid() &&
1086  consistency_info->script_id == unicharset.katakana_sid())) {
1087  consistency_info->script_id = dict_->getUnicharset().han_sid();
1088  }
1089  }
1090 
1091  if (parent_vse != nullptr &&
1092  (parent_vse->consistency_info.script_id !=
1093  dict_->getUnicharset().common_sid())) {
1094  int parent_script_id = parent_vse->consistency_info.script_id;
1095  // If script_id is Common, use script id of the parent instead.
1096  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1097  consistency_info->script_id = parent_script_id;
1098  }
1099  if (consistency_info->script_id != parent_script_id) {
1100  consistency_info->inconsistent_script = true;
1101  }
1102  }
1103 
1104  // Update chartype related counters.
1105  if (unicharset.get_isalpha(unichar_id)) {
1106  consistency_info->num_alphas++;
1107  } else if (unicharset.get_isdigit(unichar_id)) {
1108  consistency_info->num_digits++;
1109  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1110  consistency_info->num_other++;
1111  }
1112 
1113  // Check font and spacing consistency.
1114  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
// Find a font id shared between the parent and current blob choices
// (each choice carries up to two candidate font ids).
1115  int fontinfo_id = -1;
1116  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1117  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1118  fontinfo_id = b->fontinfo_id();
1119  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1120  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1121  fontinfo_id = b->fontinfo_id2();
1122  }
1123  if(language_model_debug_level > 1) {
1124  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1125  (parent_b->fontinfo_id() >= 0) ?
1126  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1127  (parent_b->fontinfo_id2() >= 0) ?
1128  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1129  (b->fontinfo_id() >= 0) ?
1130  fontinfo_table_->get(b->fontinfo_id()).name : "",
1131  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1132  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1133  fontinfo_id);
1134  }
1135  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1136  bool expected_gap_found = false;
1137  float expected_gap = 0.0f;
1138  int temp_gap;
1139  if (fontinfo_id >= 0) { // found a common font
1140  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1141  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1142  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1143  expected_gap = temp_gap;
1144  expected_gap_found = true;
1145  }
1146  } else {
1147  consistency_info->inconsistent_font = true;
1148  // Get an average of the expected gaps in each font
1149  int num_addends = 0;
1150  int temp_fid;
// Iterate over the four candidate fonts: parent's two ids, then the
// current blob's two ids.
1151  for (int i = 0; i < 4; ++i) {
1152  if (i == 0) {
1153  temp_fid = parent_b->fontinfo_id();
1154  } else if (i == 1) {
1155  temp_fid = parent_b->fontinfo_id2();
1156  } else if (i == 2) {
1157  temp_fid = b->fontinfo_id();
1158  } else {
1159  temp_fid = b->fontinfo_id2();
1160  }
1161  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1162  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1163  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1164  expected_gap += temp_gap;
1165  num_addends++;
1166  }
1167  }
1168  if (num_addends > 0) {
1169  expected_gap /= static_cast<float>(num_addends);
1170  expected_gap_found = true;
1171  }
1172  }
1173  if (expected_gap_found) {
1174  int actual_gap = word_res->GetBlobsGap(curr_col-1);
1175  if (actual_gap == 0) {
1176  consistency_info->num_inconsistent_spaces++;
1177  } else {
1178  float gap_ratio = expected_gap / actual_gap;
1179  // TODO(rays) The gaps seem to be way off most of the time, saved by
1180  // the error here that the ratio was compared to 1/2, when it should
1181  // have been 0.5f. Find the source of the gaps discrepancy and put
1182  // the 0.5f here in place of 0.0f.
1183  // Test on 2476595.sj, pages 0 to 6. (In French.)
1184  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1185  consistency_info->num_inconsistent_spaces++;
1186  }
1187  }
1188  if (language_model_debug_level > 1) {
1189  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1190  unicharset.id_to_unichar(parent_b->unichar_id()),
1191  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1192  unichar_id, curr_col, expected_gap, actual_gap);
1193  }
1194  }
1195  }
1196  }
1197 }
1198 
// NOTE(review): Doxygen extraction — the signature line (1199, per the index
// "float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse)")
// and source lines 1217, 1220-1221, 1223 and 1230 are missing; confirm
// against the real language_model.cpp before editing code.
//
// Returns the adjusted cost of the path ending at vse. When the params model
// is initialized, the cost is its feature-based prediction scaled by outline
// length; otherwise it is the ngram (or ratings-sum) cost scaled by penalty
// adjustments for non-dictionary words and shape cost.
1200  ASSERT_HOST(vse != nullptr);
1201  if (params_model_.Initialized()) {
1202  float features[PTRAIN_NUM_FEATURE_TYPES];
1203  ExtractFeaturesFromPath(*vse, features);
1204  float cost = params_model_.ComputeCost(features);
1205  if (language_model_debug_level > 3) {
1206  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1207  if (language_model_debug_level >= 5) {
1208  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1209  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1210  }
1211  }
1212  }
1213  return cost * vse->outline_length;
1214  } else {
1215  float adjustment = 1.0f;
1216  if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
// NOTE(review): source line 1217 missing — presumably adds
// language_model_penalty_non_freq_dict_word to adjustment; confirm.
1218  }
1219  if (vse->dawg_info == nullptr) {
// NOTE(review): source lines 1220-1221 and 1223 missing — presumably the
// language_model_penalty_non_dict_word adjustment and its per-character
// length-based scaling; confirm.
1222  adjustment += ((vse->length - language_model_min_compound_length) *
1224  }
1225  }
1226  if (vse->associate_stats.shape_cost > 0) {
1227  adjustment += vse->associate_stats.shape_cost /
1228  static_cast<float>(vse->length);
1229  }
// NOTE(review): source line 1230 missing — presumably
// "if (language_model_ngram_on) {" opening the branch below; confirm.
1231  ASSERT_HOST(vse->ngram_info != nullptr);
1232  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1233  } else {
1234  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1235  vse->consistency_info);
1236  return vse->ratings_sum * adjustment;
1237  }
1238  }
1239 }
1240 
// NOTE(review): Doxygen extraction — the signature line (1241, presumably
// "void LanguageModel::UpdateBestChoice(") and source lines 1287, 1304,
// 1311, 1325 and 1332 are missing; confirm against the real
// language_model.cpp before editing code.
//
// Constructs a WERD_CHOICE from the path ending at vse, records training /
// blamer hypotheses, updates word_res's raw and best choices, and refreshes
// hyphen state and best_choice_bundle when the word becomes the new best.
1242  ViterbiStateEntry *vse,
1243  LMPainPoints *pain_points,
1244  WERD_RES *word_res,
1245  BestChoiceBundle *best_choice_bundle,
1246  BlamerBundle *blamer_bundle) {
1247  bool truth_path;
1248  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1249  blamer_bundle, &truth_path);
1250  ASSERT_HOST(word != nullptr);
1251  if (dict_->stopper_debug_level >= 1) {
1252  STRING word_str;
1253  word->string_and_lengths(&word_str, nullptr);
1254  vse->Print(word_str.c_str());
1255  }
1256  if (language_model_debug_level > 0) {
1257  word->print("UpdateBestChoice() constructed word");
1258  }
1259  // Record features from the current path if necessary.
1260  ParamsTrainingHypothesis curr_hyp;
1261  if (blamer_bundle != nullptr) {
1262  if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
1263  static_cast<PermuterType>(word->permuter());
1264  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1265  word->string_and_lengths(&(curr_hyp.str), nullptr);
1266  curr_hyp.cost = vse->cost; // record cost for error rate computations
1267  if (language_model_debug_level > 0) {
1268  tprintf("Raw features extracted from %s (cost=%g) [ ",
1269  curr_hyp.str.c_str(), curr_hyp.cost);
1270  for (float feature : curr_hyp.features) {
1271  tprintf("%g ", feature);
1272  }
1273  tprintf("]\n");
1274  }
1275  // Record the current hypothesis in params_training_bundle.
1276  blamer_bundle->AddHypothesis(curr_hyp);
1277  if (truth_path)
1278  blamer_bundle->UpdateBestRating(word->rating());
1279  }
1280  if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1281  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1282  // we no longer need it.
1283  delete word;
1284  return;
1285  }
1286  if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty())
// NOTE(review): source line 1287 missing — presumably the
// word->SetScriptPositions(...) statement guarded by the if above; confirm.
1288  // Update and log new raw_choice if needed.
1289  if (word_res->raw_choice == nullptr ||
1290  word->rating() < word_res->raw_choice->rating()) {
1291  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1292  tprintf("Updated raw choice\n");
1293  }
1294  // Set the modified rating for best choice to vse->cost and log best choice.
1295  word->set_rating(vse->cost);
1296  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1297  // computes adjust_factor that is used by the adaption code (e.g. by
1298  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1299  // Note: the rating of the word is not adjusted.
1300  dict_->adjust_word(word, vse->dawg_info == nullptr,
1301  vse->consistency_info.xht_decision, 0.0,
1302  false, language_model_debug_level > 0);
1303  // Hand ownership of the word over to the word_res.
// NOTE(review): source line 1304 missing — presumably the
// "if (!word_res->LogNewCookedChoice(..." call whose arguments continue on
// the next line; confirm.
1305  dict_->stopper_debug_level >= 1, word)) {
1306  // The word was so bad that it was deleted.
1307  return;
1308  }
1309  if (word_res->best_choice == word) {
1310  // Word was the new best.
// NOTE(review): source line 1311 missing — presumably the condition
// (AcceptableChoice check) completed by "AcceptablePath(*vse)) {" below;
// confirm.
1312  AcceptablePath(*vse)) {
1313  acceptable_choice_found_ = true;
1314  }
1315  // Update best_choice_bundle.
1316  best_choice_bundle->updated = true;
1317  best_choice_bundle->best_vse = vse;
1318  if (language_model_debug_level > 0) {
1319  tprintf("Updated best choice\n");
1320  word->print_state("New state ");
1321  }
1322  // Update hyphen state if we are dealing with a dictionary word.
1323  if (vse->dawg_info != nullptr) {
1324  if (dict_->has_hyphen_end(*word)) {
// NOTE(review): source line 1325 missing — presumably records the hyphen
// word state in dict_; confirm.
1326  } else {
1327  dict_->reset_hyphen_vars(true);
1328  }
1329  }
1330 
1331  if (blamer_bundle != nullptr) {
// NOTE(review): source line 1332 missing — presumably the blamer_bundle
// call whose argument list continues on the next line; confirm.
1333  vse->dawg_info != nullptr && vse->top_choice_flags);
1334  }
1335  }
1336  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1337  word->DisplaySegmentation(word_res->chopped_word);
1338  }
1339 }
1340 
// NOTE(review): Doxygen extraction — the signature line (1341, presumably
// "void LanguageModel::ExtractFeaturesFromPath(") and source lines
// 1376-1377, 1379 and 1381 are missing; confirm against the real
// language_model.cpp before editing code.
//
// Fills features[] (of size PTRAIN_NUM_FEATURE_TYPES) with the params-model
// training features of the path in vse: dictionary-match indicators bucketed
// by word length, shape cost, ngram cost, consistency counters and the
// per-outline-length rating.
1342  const ViterbiStateEntry &vse, float features[]) {
1343  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1344  // Record dictionary match info.
// len buckets the word length: 0 = short, 1 = medium, 2 = long.
1345  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1346  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1347  if (vse.dawg_info != nullptr) {
1348  int permuter = vse.dawg_info->permuter;
1349  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1350  if (vse.consistency_info.num_digits == vse.length) {
1351  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1352  } else {
1353  features[PTRAIN_NUM_SHORT+len] = 1.0;
1354  }
1355  } else if (permuter == DOC_DAWG_PERM) {
1356  features[PTRAIN_DOC_SHORT+len] = 1.0;
1357  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1358  permuter == COMPOUND_PERM) {
1359  features[PTRAIN_DICT_SHORT+len] = 1.0;
1360  } else if (permuter == FREQ_DAWG_PERM) {
1361  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1362  }
1363  }
1364  // Record shape cost feature (normalized by path length).
1365  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1366  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1367  // Record ngram cost. (normalized by the path length).
1368  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1369  if (vse.ngram_info != nullptr) {
1370  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1371  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1372  }
1373  // Record consistency-related features.
1374  // Disabled this feature for now due to its poor performance.
1375  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
// NOTE(review): source lines 1376-1377 and 1379 missing — presumably the
// PTRAIN_NUM_BAD_CASE assignment and the value expression for the
// PTRAIN_NUM_BAD_CHAR_TYPE ternary below; confirm.
1378  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
1380  features[PTRAIN_NUM_BAD_SPACING] =
// NOTE(review): source line 1381 missing — presumably
// vse.consistency_info.NumInconsistentSpaces(); confirm.
1382  // Disabled this feature for now due to its poor performance.
1383  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1384 
1385  // Classifier-related features.
1386  features[PTRAIN_RATING_PER_CHAR] =
1387  vse.ratings_sum / static_cast<float>(vse.outline_length);
1388 }
1389 
// NOTE(review): Doxygen extraction — the signature line (1390, presumably
// "WERD_CHOICE *LanguageModel::ConstructWord(") and source lines 1411, 1436,
// 1459 and 1465 are missing; confirm against the real language_model.cpp
// before editing code.
//
// Builds a WERD_CHOICE by walking parent pointers from vse back to the start
// of the path, recording each blob choice, re-computing the width-to-height
// ratio variance, setting the permuter, and checking for dangerous
// ambiguities. Sets *truth_path when the path matches the blamer's correct
// segmentation. The returned word is heap-allocated; caller takes ownership.
1391  ViterbiStateEntry *vse,
1392  WERD_RES *word_res,
1393  DANGERR *fixpt,
1394  BlamerBundle *blamer_bundle,
1395  bool *truth_path) {
1396  if (truth_path != nullptr) {
1397  *truth_path =
1398  (blamer_bundle != nullptr &&
1399  vse->length == blamer_bundle->correct_segmentation_length());
1400  }
1401  BLOB_CHOICE *curr_b = vse->curr_b;
1402  ViterbiStateEntry *curr_vse = vse;
1403 
1404  int i;
1405  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1406 
1407  // Re-compute the variance of the width-to-height ratios (since we now
1408  // can compute the mean over the whole word).
1409  float full_wh_ratio_mean = 0.0f;
1410  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
// NOTE(review): source line 1411 missing inside this block; confirm.
1412  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1413  static_cast<float>(vse->length));
1414  vse->associate_stats.full_wh_ratio_var = 0.0f;
1415  }
1416 
1417  // Construct a WERD_CHOICE by tracing parent pointers.
1418  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1419  word->set_length(vse->length);
1420  int total_blobs = 0;
// Walk the path backwards, filling word positions from last to first.
1421  for (i = (vse->length-1); i >= 0; --i) {
1422  if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
1423  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1424  *truth_path = false;
1425  }
1426  // The number of blobs used for this choice is row - col + 1.
1427  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1428  total_blobs += num_blobs;
1429  word->set_blob_choice(i, num_blobs, curr_b);
1430  // Update the width-to-height ratio variance. Useful non-space delimited
1431  // languages to ensure that the blobs are of uniform width.
1432  // Skip leading and trailing punctuation when computing the variance.
1433  if ((full_wh_ratio_mean != 0.0f &&
1434  ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
1435  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
// NOTE(review): source line 1436 missing — presumably
// "vse->associate_stats.full_wh_ratio_var +=" completing the statement
// below; confirm.
1437  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1438  if (language_model_debug_level > 2) {
1439  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1440  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1441  }
1442  }
1443 
1444  // Mark the word as compound if compound permuter was set for any of
1445  // the unichars on the path (usually this will happen for unichars
1446  // that are compounding operators, like "-" and "/").
1447  if (!compound && curr_vse->dawg_info &&
1448  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1449 
1450  // Update curr_* pointers.
1451  curr_vse = curr_vse->parent_vse;
1452  if (curr_vse == nullptr) break;
1453  curr_b = curr_vse->curr_b;
1454  }
1455  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1456  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1457  // Re-adjust shape cost to include the updated width-to-height variance.
1458  if (full_wh_ratio_mean != 0.0f) {
// NOTE(review): source line 1459 missing — presumably adds the variance
// into vse->associate_stats.shape_cost; confirm.
1460  }
1461 
1462  word->set_rating(vse->ratings_sum);
1463  word->set_certainty(vse->min_certainty);
1464  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
// NOTE(review): source line 1465 missing — presumably
// vse->consistency_info.BodyMaxXHeight()); confirm.
1466  if (vse->dawg_info != nullptr) {
1467  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1468  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1469  word->set_permuter(NGRAM_PERM);
1470  } else if (vse->top_choice_flags) {
1471  word->set_permuter(TOP_CHOICE_PERM);
1472  } else {
1473  word->set_permuter(NO_PERM);
1474  }
1475  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1476  word_res->ratings));
1477  return word;
1478 }
1479 
1480 } // namespace tesseract
tesseract::LanguageModel::prev_word_str_
STRING prev_word_str_
Definition: language_model.h:400
tesseract::Dawg::edge_letter
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::LanguageModel::AddViterbiStateEntry
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:561
tesseract::ViterbiStateEntry::top_choice_flags
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:192
tesseract::DawgPositionVector::clear
void clear()
Definition: dawg.h:377
tesseract::ViterbiStateEntry::dawg_info
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:166
tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0
language_model.h
BlamerBundle::correct_segmentation_length
int correct_segmentation_length() const
Definition: blamer.h:141
tesseract::ViterbiStateEntry::Print
void Print(const char *msg) const
Definition: lm_state.cpp:26
pageres.h
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Returns a pointer to the punctuation dawg.
Definition: dict.h:434
tesseract::ViterbiStateEntry::length
int length
number of characters on the path
Definition: lm_state.h:185
tesseract::LanguageModelNgramInfo::pruned
bool pruned
Definition: lm_state.h:82
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
tesseract::LanguageModel::language_model_ngram_on
bool language_model_ngram_on
Definition: language_model.h:318
tesseract::PTRAIN_RATING_PER_CHAR
Definition: params_training_featdef.h:68
tesseract::ViterbiStateEntry::outline_length
float outline_length
length of the outline so far
Definition: lm_state.h:186
unicity_table.h
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::LanguageModel::wordrec_display_segmentations
int wordrec_display_segmentations
Definition: language_model.h:362
tesseract::LanguageModel::params_model_
ParamsModel params_model_
Definition: language_model.h:421
tesseract::DawgPosition
Definition: dawg.h:348
tesseract::LanguageModel::language_model_penalty_non_dict_word
double language_model_penalty_non_dict_word
Definition: language_model.h:348
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
tesseract::PTRAIN_DIGITS_SHORT
Definition: params_training_featdef.h:41
tesseract::LanguageModel::correct_segmentation_explored_
bool correct_segmentation_explored_
Definition: language_model.h:418
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:600
tesseract::LanguageModel::kXhtConsistentFlag
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:57
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
tesseract::LanguageModelNgramInfo::context
STRING context
Definition: lm_state.h:74
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
language_specific.log
log
Definition: language_specific.py:25
tesseract::LanguageModel::max_char_wh_ratio_
float max_char_wh_ratio_
Definition: language_model.h:393
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
params.h
tesseract::ViterbiStateEntry
Definition: lm_state.h:91
tesseract::LMConsistencyInfo::NumInconsistentSpaces
int NumInconsistentSpaces() const
Definition: lm_consistency.h:99
params_training_featdef.h
tesseract::LMConsistencyInfo::inconsistent_script
bool inconsistent_script
Definition: lm_consistency.h:136
tesseract::LanguageModel::language_model_penalty_non_freq_dict_word
double language_model_penalty_non_freq_dict_word
Definition: language_model.h:346
NO_PERM
Definition: ratngs.h:231
tesseract::LanguageModel::GenerateNgramInfo
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:877
tesseract::LanguageModel::very_beginning_active_dawgs_
DawgPositionVector very_beginning_active_dawgs_
Definition: language_model.h:403
tesseract::LanguageModel::dict_
Dict * dict_
Definition: language_model.h:383
STRING
Definition: strngs.h:45
tesseract::LanguageModel::~LanguageModel
~LanguageModel()
Definition: language_model.cpp:134
WERD_RES::x_height
float x_height
Definition: pageres.h:310
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
tesseract::LanguageModel::ComputeAdjustedPathCost
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1199
tesseract::AssociateStats::bad_shape
bool bad_shape
Definition: associate.h:53
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::ViterbiStateEntry::cost
float cost
Definition: lm_state.h:178
WERD_RES
Definition: pageres.h:160
tesseract::AssociateStats::full_wh_ratio_total
float full_wh_ratio_total
Definition: associate.h:55
tesseract::LanguageModelDawgInfo
Definition: lm_state.h:61
COMPOUND_PERM
Definition: ratngs.h:243
tesseract::LanguageModelNgramInfo::ngram_cost
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:84
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:56
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
tesseract::LanguageModel::dawg_args_
DawgArgs dawg_args_
Definition: language_model.h:364
tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
tesseract::ViterbiStateEntry::competing_vse
ViterbiStateEntry * competing_vse
Definition: lm_state.h:162
tesseract::PTRAIN_SHAPE_COST_PER_CHAR
Definition: params_training_featdef.h:60
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
tesseract::LMConsistencyInfo::num_punc
int num_punc
Definition: lm_consistency.h:120
tesseract::BestChoiceBundle::best_vse
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:240
tesseract::PTRAIN_NUM_BAD_CASE
Definition: params_training_featdef.h:63
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
BlamerBundle::MatrixPositionCorrect
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:146
tesseract::LMConsistencyInfo::BodyMinXHeight
float BodyMinXHeight() const
Definition: lm_consistency.h:106
BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
tesseract::AssociateStats
Definition: associate.h:36
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
BLOB_CHOICE::matrix_cell
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:115
tesseract::LMConsistencyInfo::xht_decision
XHeightConsistencyEnum xht_decision
Definition: lm_consistency.h:123
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
tesseract::LanguageModel::UpdateState
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:253
tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition: params_training_featdef.h:70
tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
tesseract::PTRAIN_DICT_SHORT
Definition: params_training_featdef.h:53
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:552
tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:376
tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step
bool language_model_ngram_use_only_first_uft8_step
Definition: language_model.h:333
BlamerBundle::UpdateBestRating
void UpdateBestRating(float rating)
Definition: blamer.h:137
tesseract::LanguageModel::language_model_ngram_small_prob
double language_model_ngram_small_prob
Definition: language_model.h:328
tesseract::BestChoiceBundle::beam
PointerVector< LanguageModelState > beam
Definition: lm_state.h:238
tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:86
tesseract::LanguageModel::kMaxAvgNgramCost
static const float kMaxAvgNgramCost
Definition: language_model.h:61
tesseract::LanguageModel::LanguageModel
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
Definition: language_model.cpp:53
tesseract::BestChoiceBundle::updated
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:232
tesseract::LanguageModel::language_model_ngram_rating_factor
double language_model_ngram_rating_factor
Definition: language_model.h:339
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::LanguageModel::rating_cert_scale_
float rating_cert_scale_
Definition: language_model.h:374
WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:616
tesseract::BestChoiceBundle
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:222
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
dawg.h
tesseract::LanguageModelState::viterbi_state_entries_length
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:218
tesseract::LanguageModel::language_model_ngram_space_delimited_language
bool language_model_ngram_space_delimited_language
Definition: language_model.h:341
tesseract::PTRAIN_DOC_SHORT
Definition: params_training_featdef.h:49
tesseract::ViterbiStateEntry::ngram_info
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:170
tesseract::Dict::set_hyphen_word
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:59
tesseract::LMConsistencyInfo::NumInconsistentCase
int NumInconsistentCase() const
Definition: lm_consistency.h:87
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
tesseract::LMConsistencyInfo::num_lower
int num_lower
Definition: lm_consistency.h:125
tesseract::AssociateStats::full_wh_ratio_var
float full_wh_ratio_var
Definition: associate.h:57
tesseract::LanguageModelState::Print
void Print(const char *msg)
Definition: lm_state.cpp:69
ccutil.h
tesseract::ParamsTrainingHypothesis::str
STRING str
Definition: params_training_featdef.h:122
tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:85
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
tesseract::LanguageModel::GetTopLowerUpperDigit
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:383
tesseract::ViterbiStateEntry::associate_stats
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:188
BlamerBundle::GuidedSegsearchStillGoing
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:512
tesseract::LMConsistencyInfo::punc_ref
EDGE_REF punc_ref
Definition: lm_consistency.h:117
matrix.h
tesseract::ParamsTrainingHypothesis
Definition: params_training_featdef.h:106
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
tesseract::UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:880
tesseract::PTRAIN_XHEIGHT_CONSISTENCY
Definition: params_training_featdef.h:64
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
tesseract::LMConsistencyInfo::num_inconsistent_spaces
int num_inconsistent_spaces
Definition: lm_consistency.h:127
tesseract::LMPainPoints
Definition: lm_pain_points.h:56
tesseract::ViterbiStateEntry::curr_b
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:158
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
mixed
Definition: cluster.h:43
tesseract::LanguageModelFlagsType
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
NGRAM_PERM
Definition: ratngs.h:236
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
tesseract::LanguageModelState::viterbi_state_entries_prunable_max_cost
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:216
WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:357
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
tesseract
Definition: baseapi.h:65
tesseract::LanguageModelNgramInfo::ngram_and_classifier_cost
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:86
tesseract::LanguageModelState
Struct to store information maintained by various language model components.
Definition: lm_state.h:200
tesseract::LanguageModel::language_model_ngram_order
int language_model_ngram_order
Definition: language_model.h:320
tesseract::LanguageModel::ComputeDenom
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:994
tesseract::DawgPosition::dawg_index
int8_t dawg_index
Definition: dawg.h:367
tesseract::BestChoiceBundle::fixpt
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:234
BlamerBundle::set_best_choice_is_dict_and_top_choice
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:150
tesseract::ViterbiStateEntry::consistency_info
LMConsistencyInfo consistency_info
path consistency info
Definition: lm_state.h:187
tesseract::LanguageModelState::viterbi_state_entries
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:213
tesseract::AssociateStats::full_wh_ratio
float full_wh_ratio
Definition: associate.h:54
tesseract::ParamsTrainingHypothesis::features
float features[PTRAIN_NUM_FEATURE_TYPES]
Definition: params_training_featdef.h:121
tesseract::LanguageModel::GenerateDawgInfo
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:785
tesseract::ViterbiStateEntry::updated
bool updated
set to true if the entry has just been created/updated
Definition: lm_state.h:194
tesseract::LanguageModel::fixed_pitch_
bool fixed_pitch_
Definition: language_model.h:390
WERD_CHOICE::DisplaySegmentation
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:763
tesseract::LanguageModel::kUpperCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:55
tprintf.h
TOP_CHOICE_PERM
Definition: ratngs.h:233
tesseract::LanguageModel::prev_word_unichar_step_len_
int prev_word_unichar_step_len_
Definition: language_model.h:401
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::FontInfo
Definition: fontinfo.h:62
tesseract::ViterbiStateEntry::parent_vse
ViterbiStateEntry * parent_vse
Definition: lm_state.h:159
tesseract::DawgPositionVector
Definition: dawg.h:373
tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:113
tesseract::LanguageModel::acceptable_choice_found_
bool acceptable_choice_found_
Definition: language_model.h:416
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::LanguageModel::fontinfo_table_
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:379
USER_PATTERN_PERM
Definition: ratngs.h:238
tesseract::LanguageModelDawgInfo::active_dawgs
DawgPositionVector active_dawgs
Definition: lm_state.h:64
GenericVector
Definition: baseapi.h:40
UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:875
tesseract::LMConsistencyInfo::num_digits
int num_digits
Definition: lm_consistency.h:119
tesseract::LanguageModel::language_model_penalty_increment
double language_model_penalty_increment
Definition: language_model.h:361
tesseract::LanguageModel::ComputeConsistencyAdjustment
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:135
tesseract::LanguageModelDawgInfo::permuter
PermuterType permuter
Definition: lm_state.h:65
UnicityTable
Definition: fontinfo.h:30
tesseract::LanguageModel::GenerateTopChoiceInfo
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:769
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
tesseract::ViterbiStateEntry::Compare
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:132
tesseract::LanguageModelNgramInfo
Definition: lm_state.h:70
tesseract::Dict
Definition: dict.h:91
tesseract::DawgPosition::back_to_punc
bool back_to_punc
Definition: dawg.h:370
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
WERD_CHOICE::print_state
void print_state(const char *msg) const
Definition: ratngs.cpp:754
tesseract::ViterbiStateEntry::HasAlnumChoice
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:147
tesseract::ParamsModel::ComputeCost
float ComputeCost(const float features[]) const
Definition: params_model.cpp:80
WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:736
tesseract::LanguageModel::GetNextParentVSE
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:500
tesseract::LMConsistencyInfo::script_id
int script_id
Definition: lm_consistency.h:126
BLOB_CHOICE::PosAndSizeAgree
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:154
tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable
int language_model_viterbi_list_max_num_prunable
Definition: language_model.h:323
tesseract::LMConsistencyInfo
Definition: lm_consistency.h:38
tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
tesseract::LanguageModel::ExtractFeaturesFromPath
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1341
tesseract::ParamsModel::Initialized
bool Initialized()
Definition: params_model.h:44
tesseract::AssociateUtils::ComputeOutlineLength
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
STRING::length
int32_t length() const
Definition: strngs.cpp:187
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
BLOB_CHOICE::fontinfo_id2
int16_t fontinfo_id2() const
Definition: ratngs.h:87
tesseract::LanguageModel::ConstructWord
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1390
tesseract::Dawg
Definition: dawg.h:113
tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::Dict::reset_hyphen_vars
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:42
tesseract::LanguageModel::CertaintyScore
float CertaintyScore(float cert)
Definition: language_model.h:112
tesseract::LanguageModel::language_model_min_compound_length
int language_model_min_compound_length
Definition: language_model.h:343
unichar.h
tesseract::PTRAIN_NGRAM_COST_PER_CHAR
Definition: params_training_featdef.h:61
MATRIX_COORD::col
int col
Definition: matrix.h:632
tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
tesseract::LanguageModel::language_model_viterbi_list_max_size
int language_model_viterbi_list_max_size
Definition: language_model.h:325
tesseract::ViterbiStateEntry::min_certainty
float min_certainty
minimum certainty on the path
Definition: lm_state.h:183
tesseract::LMConsistencyInfo::num_other
int num_other
Definition: lm_consistency.h:121
tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:390
tesseract::LanguageModel::kLowerCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:54
tesseract::LMConsistencyInfo::num_non_first_upper
int num_non_first_upper
Definition: lm_consistency.h:124
tesseract::PTRAIN_NUM_BAD_SPACING
Definition: params_training_featdef.h:66
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
tesseract::PTRAIN_NUM_BAD_CHAR_TYPE
Definition: params_training_featdef.h:65
tesseract::LanguageModel::language_model_ngram_nonmatch_score
double language_model_ngram_nonmatch_score
Definition: language_model.h:330
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::LMConsistencyInfo::invalid_punc
bool invalid_punc
Definition: lm_consistency.h:135
tesseract::LanguageModel::FillConsistencyInfo
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1015
tesseract::LanguageModelNgramInfo::context_unichar_step_len
int context_unichar_step_len
Definition: lm_state.h:77
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:432
errcode.h
tesseract::ViterbiStateEntry::ratings_sum
float ratings_sum
sum of ratings of character on the path
Definition: lm_state.h:182
tesseract::DawgPosition::dawg_ref
EDGE_REF dawg_ref
Definition: dawg.h:365
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::LanguageModel::SetTopParentLowerUpperDigit
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:423
MATRIX_COORD::row
int row
Definition: matrix.h:633
UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:485
tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:84
tesseract::LanguageModel::kSmallestRatingFlag
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:53
tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:158
tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:124
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
BlamerBundle::AddHypothesis
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:170
tesseract::LanguageModel::InitForWord
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
Definition: language_model.cpp:136
tesseract::LanguageModel::kDigitFlag
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:56
tesseract::LanguageModel::beginning_active_dawgs_
DawgPositionVector beginning_active_dawgs_
Definition: language_model.h:404
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:673
tesseract::LanguageModel::ComputeAssociateStats
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:280
tesseract::LanguageModel::language_model_debug_level
int language_model_debug_level
Definition: language_model.h:316
tesseract::LanguageModel::ComputeNgramCost
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:934
tesseract::LMConsistencyInfo::BodyMaxXHeight
float BodyMaxXHeight() const
Definition: lm_consistency.h:111
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
DOC_DAWG_PERM
Definition: ratngs.h:240
BlamerBundle
Definition: blamer.h:103
blamer.h
tesseract::LanguageModel::PrunablePath
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:299
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::PTRAIN_NUM_SHORT
Definition: params_training_featdef.h:45
tesseract::LMConsistencyInfo::ComputeXheightConsistency
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
Definition: lm_consistency.cpp:29
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::LMConsistencyInfo::InconsistentXHeight
int InconsistentXHeight() const
Definition: lm_consistency.h:102
tesseract::LMConsistencyInfo::num_alphas
int num_alphas
Definition: lm_consistency.h:118
tesseract::ParamsTrainingHypothesis::cost
float cost
Definition: params_training_featdef.h:123
UNICHARSET::size
int size() const
Definition: unicharset.h:341
NUMBER_PERM
Definition: ratngs.h:237
tesseract::LanguageModelState::viterbi_state_entries_prunable_length
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:215
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50
USER_DAWG_PERM
Definition: ratngs.h:241
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
lm_state.h
tesseract::LanguageModel::AcceptablePath
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:309
tesseract::LMConsistencyInfo::NumInconsistentChartype
int NumInconsistentChartype() const
Definition: lm_consistency.h:90
tesseract::AssociateStats::shape_cost
float shape_cost
Definition: associate.h:52
tesseract::LanguageModel::UpdateBestChoice
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1241
tesseract::LMConsistencyInfo::inconsistent_font
bool inconsistent_font
Definition: lm_consistency.h:137
tesseract::LanguageModel::language_model_ngram_scale_factor
double language_model_ngram_scale_factor
Definition: language_model.h:336
tesseract::PTRAIN_FREQ_SHORT
Definition: params_training_featdef.h:57