tesseract  5.0.0-alpha-619-ge9db
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_ = 0.0f
 
const UnicityTable< FontInfo > * fontinfo_table_ = nullptr
 
Dict * dict_ = nullptr
 
bool fixed_pitch_ = false
 
float max_char_wh_ratio_ = 0.0f
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_ = 0
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_ = false
 
bool correct_segmentation_explored_ = false
 
ParamsModel params_model_
 

Detailed Description

Definition at line 50 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict * dict 
)

Definition at line 53 of file language_model.cpp.

55  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
56  dict->getCCUtil()->params()),
58  "Turn on/off the use of character ngram model",
59  dict->getCCUtil()->params()),
61  "Maximum order of the character ngram model",
62  dict->getCCUtil()->params()),
64  "Maximum number of prunable (those for which"
65  " PrunablePath() is true) entries in each viterbi list"
66  " recorded in BLOB_CHOICEs",
67  dict->getCCUtil()->params()),
69  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
70  dict->getCCUtil()->params()),
72  "To avoid overly small denominators use this as the "
73  "floor of the probability returned by the ngram model.",
74  dict->getCCUtil()->params()),
76  "Average classifier score of a non-matching unichar.",
77  dict->getCCUtil()->params()),
79  "Use only the first UTF8 step of the given string"
80  " when computing log probabilities.",
81  dict->getCCUtil()->params()),
83  "Strength of the character ngram model relative to the"
84  " character classifier ",
85  dict->getCCUtil()->params()),
87  "Factor to bring log-probs into the same range as ratings"
88  " when multiplied by outline length ",
89  dict->getCCUtil()->params()),
91  "Words are delimited by space", dict->getCCUtil()->params()),
93  "Minimum length of compound words",
94  dict->getCCUtil()->params()),
96  "Penalty for words not in the frequent word dictionary",
97  dict->getCCUtil()->params()),
99  "Penalty for non-dictionary words",
100  dict->getCCUtil()->params()),
102  "Penalty for inconsistent punctuation",
103  dict->getCCUtil()->params()),
105  "Penalty for inconsistent case",
106  dict->getCCUtil()->params()),
108  "Penalty for inconsistent script",
109  dict->getCCUtil()->params()),
111  "Penalty for inconsistent character type",
112  dict->getCCUtil()->params()),
113  // TODO(daria, rays): enable font consistency checking
114  // after improving font analysis.
116  "Penalty for inconsistent font",
117  dict->getCCUtil()->params()),
119  "Penalty for inconsistent spacing",
120  dict->getCCUtil()->params()),
121  double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
122  dict->getCCUtil()->params()),
123  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
124  dict->getCCUtil()->params()),
126  "Use sigmoidal score for certainty",
127  dict->getCCUtil()->params()),
128  dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
129  fontinfo_table_(fontinfo_table),
130  dict_(dict) {
131  ASSERT_HOST(dict_ != nullptr);
132 }

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 134 of file language_model.cpp.

134 { delete dawg_args_.updated_dawgs; }

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 103 of file language_model.h.

103 { return acceptable_choice_found_; }

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry & vse)
inlineprotected

Definition at line 309 of file language_model.h.

309  {
310  return (vse.dawg_info != nullptr || vse.Consistent() ||
311  (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
312  }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE * b,
LanguageModelState * curr_state,
ViterbiStateEntry * parent_vse,
LMPainPoints * pain_points,
WERD_RES * word_res,
BestChoiceBundle * best_choice_bundle,
BlamerBundle * blamer_bundle 
)
protected

Definition at line 561 of file language_model.cpp.

572  {
573  ViterbiStateEntry_IT vit;
574  if (language_model_debug_level > 1) {
575  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
576  " certainty=%.4f top_choice_flags=0x%x",
578  b->rating(), b->certainty(), top_choice_flags);
580  tprintf(" parent_vse=%p\n", parent_vse);
581  else
582  tprintf("\n");
583  }
584  ASSERT_HOST(curr_state != nullptr);
585  // Check whether the list is full.
586  if (curr_state->viterbi_state_entries_length >=
588  if (language_model_debug_level > 1) {
589  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
590  }
591  return false;
592  }
593 
594  // Invoke Dawg language model component.
595  LanguageModelDawgInfo *dawg_info =
596  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
597 
598  float outline_length =
600  // Invoke Ngram language model component.
601  LanguageModelNgramInfo *ngram_info = nullptr;
603  ngram_info = GenerateNgramInfo(
605  denom, curr_col, curr_row, outline_length, parent_vse);
606  ASSERT_HOST(ngram_info != nullptr);
607  }
608  bool liked_by_language_model = dawg_info != nullptr ||
609  (ngram_info != nullptr && !ngram_info->pruned);
610  // Quick escape if not liked by the language model, can't be consistent
611  // xheight, and not top choice.
612  if (!liked_by_language_model && top_choice_flags == 0) {
613  if (language_model_debug_level > 1) {
614  tprintf("Language model components very early pruned this entry\n");
615  }
616  delete ngram_info;
617  delete dawg_info;
618  return false;
619  }
620 
621  // Check consistency of the path and set the relevant consistency_info.
622  LMConsistencyInfo consistency_info(
623  parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
624  // Start with just the x-height consistency, as it provides significant
625  // pruning opportunity.
626  consistency_info.ComputeXheightConsistency(
628  // Turn off xheight consistent flag if not consistent.
629  if (consistency_info.InconsistentXHeight()) {
630  top_choice_flags &= ~kXhtConsistentFlag;
631  }
632 
633  // Quick escape if not liked by the language model, not consistent xheight,
634  // and not top choice.
635  if (!liked_by_language_model && top_choice_flags == 0) {
636  if (language_model_debug_level > 1) {
637  tprintf("Language model components early pruned this entry\n");
638  }
639  delete ngram_info;
640  delete dawg_info;
641  return false;
642  }
643 
644  // Compute the rest of the consistency info.
645  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
646  word_res, &consistency_info);
647  if (dawg_info != nullptr && consistency_info.invalid_punc) {
648  consistency_info.invalid_punc = false; // do not penalize dict words
649  }
650 
651  // Compute cost of associating the blobs that represent the current unichar.
652  AssociateStats associate_stats;
653  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
654  parent_vse, word_res, &associate_stats);
655  if (parent_vse != nullptr) {
656  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
657  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
658  }
659 
660  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
661  auto *new_vse = new ViterbiStateEntry(
662  parent_vse, b, 0.0, outline_length,
663  consistency_info, associate_stats, top_choice_flags, dawg_info,
664  ngram_info, (language_model_debug_level > 0) ?
665  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
666  new_vse->cost = ComputeAdjustedPathCost(new_vse);
668  tprintf("Adjusted cost = %g\n", new_vse->cost);
669 
670  // Invoke Top Choice language model component to make the final adjustments
671  // to new_vse->top_choice_flags.
672  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
673  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
674  }
675 
676  // If language model components did not like this unichar - return.
677  bool keep = new_vse->top_choice_flags || liked_by_language_model;
678  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
679  consistency_info.inconsistent_script) { // with inconsistent script
680  keep = false;
681  }
682  if (!keep) {
683  if (language_model_debug_level > 1) {
684  tprintf("Language model components did not like this entry\n");
685  }
686  delete new_vse;
687  return false;
688  }
689 
690  // Discard this entry if it represents a prunable path and
691  // language_model_viterbi_list_max_num_prunable such entries with a lower
692  // cost have already been recorded.
693  if (PrunablePath(*new_vse) &&
694  (curr_state->viterbi_state_entries_prunable_length >=
696  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
697  if (language_model_debug_level > 1) {
698  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
699  new_vse->cost,
700  curr_state->viterbi_state_entries_prunable_max_cost);
701  }
702  delete new_vse;
703  return false;
704  }
705 
706  // Update best choice if needed.
707  if (word_end) {
708  UpdateBestChoice(new_vse, pain_points, word_res,
709  best_choice_bundle, blamer_bundle);
710  // Discard the entry if UpdateBestChoice() found flaws in it.
711  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
712  new_vse != best_choice_bundle->best_vse) {
713  if (language_model_debug_level > 1) {
714  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
715  }
716  delete new_vse;
717  return false;
718  }
719  }
720 
721  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
722  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
723  false, new_vse);
724  curr_state->viterbi_state_entries_length++;
725  if (PrunablePath(*new_vse)) {
726  curr_state->viterbi_state_entries_prunable_length++;
727  }
728 
729  // Update lms->viterbi_state_entries_prunable_max_cost and clear
730  // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
731  if ((curr_state->viterbi_state_entries_prunable_length >=
733  new_vse->top_choice_flags) {
734  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
735  int prunable_counter = language_model_viterbi_list_max_num_prunable;
736  vit.set_to_list(&(curr_state->viterbi_state_entries));
737  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
738  ViterbiStateEntry *curr_vse = vit.data();
739  // Clear the appropriate top choice flags of the entries in the
740  // list that have cost higher than new_entry->cost
741  // (since they will not be top choices any more).
742  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
743  curr_vse->cost > new_vse->cost) {
744  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
745  }
746  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
747  // Update curr_state->viterbi_state_entries_prunable_max_cost.
748  if (prunable_counter == 0) {
749  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
750  if (language_model_debug_level > 1) {
751  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
752  curr_state->viterbi_state_entries_prunable_max_cost);
753  }
754  prunable_counter = -1; // stop counting
755  }
756  }
757  }
758 
759  // Print the newly created ViterbiStateEntry.
760  if (language_model_debug_level > 2) {
761  new_vse->Print("New");
763  curr_state->Print("Updated viterbi list");
764  }
765 
766  return true;
767 }

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 112 of file language_model.h.

112  {
114  // cert is assumed to be between 0 and -dict_->certainty_scale.
115  // If you enable language_model_use_sigmoidal_certainty, you
116  // need to adjust language_model_ngram_nonmatch_score as well.
117  cert = -cert / dict_->certainty_scale;
118  return 1.0f / (1.0f + exp(10.0f * cert));
119  } else {
120  return (-1.0f / cert);
121  }
122  }

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry * vse)
protected

Definition at line 1199 of file language_model.cpp.

1199  {
1200  ASSERT_HOST(vse != nullptr);
1201  if (params_model_.Initialized()) {
1202  float features[PTRAIN_NUM_FEATURE_TYPES];
1203  ExtractFeaturesFromPath(*vse, features);
1204  float cost = params_model_.ComputeCost(features);
1205  if (language_model_debug_level > 3) {
1206  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1207  if (language_model_debug_level >= 5) {
1208  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1209  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1210  }
1211  }
1212  }
1213  return cost * vse->outline_length;
1214  } else {
1215  float adjustment = 1.0f;
1216  if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1218  }
1219  if (vse->dawg_info == nullptr) {
1221  if (vse->length > language_model_min_compound_length) {
1222  adjustment += ((vse->length - language_model_min_compound_length) *
1224  }
1225  }
1226  if (vse->associate_stats.shape_cost > 0) {
1227  adjustment += vse->associate_stats.shape_cost /
1228  static_cast<float>(vse->length);
1229  }
1231  ASSERT_HOST(vse->ngram_info != nullptr);
1232  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1233  } else {
1234  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1235  vse->consistency_info);
1236  return vse->ratings_sum * adjustment;
1237  }
1238  }
1239 }

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 124 of file language_model.h.

124  {
125  if (num_problems == 0) return 0.0f;
126  if (num_problems == 1) return penalty;
127  return (penalty + (language_model_penalty_increment *
128  static_cast<float>(num_problems-1)));
129  }

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry * parent_vse,
WERD_RES * word_res,
AssociateStats * associate_stats 
)
inlineprotected

Definition at line 280 of file language_model.h.

284  {
286  col, row,
287  (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
288  (parent_vse != nullptr) ? parent_vse->length : 0,
289  fixed_pitch_, max_char_wh_ratio,
290  word_res, language_model_debug_level > 2, associate_stats);
291  }

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo * dawg_info,
const LMConsistencyInfo & consistency_info 
)
inlineprotected

Definition at line 135 of file language_model.h.

137  {
138  if (dawg_info != nullptr) {
139  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
141  (consistency_info.inconsistent_script ?
143  }
144  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
146  ComputeAdjustment(consistency_info.NumInconsistentCase(),
148  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
150  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
152  (consistency_info.inconsistent_script ?
154  (consistency_info.inconsistent_font ?
156  }

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 994 of file language_model.cpp.

994  {
995  if (curr_list->empty()) return 1.0f;
996  float denom = 0.0f;
997  int len = 0;
998  BLOB_CHOICE_IT c_it(curr_list);
999  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1000  ASSERT_HOST(c_it.data() != nullptr);
1001  ++len;
1002  denom += CertaintyScore(c_it.data()->certainty());
1003  }
1004  assert(len != 0);
1005  // The ideal situation would be to have the classifier scores for
1006  // classifying each position as each of the characters in the unicharset.
1007  // Since we can not do this because of speed, we add a very crude estimate
1008  // of what these scores for the "missing" classifications would sum up to.
1009  denom += (dict_->getUnicharset().size() - len) *
1011 
1012  return denom;
1013 }

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 934 of file language_model.cpp.

940  {
941  const char *context_ptr = context;
942  char *modified_context = nullptr;
943  char *modified_context_end = nullptr;
944  const char *unichar_ptr = unichar;
945  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
946  float prob = 0.0f;
947  int step = 0;
948  while (unichar_ptr < unichar_end &&
949  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
950  if (language_model_debug_level > 1) {
951  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
952  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
953  }
954  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
955  ++(*unichar_step_len);
957  unichar_ptr += step;
958  // If there are multiple UTF8 characters present in unichar, context is
959  // updated to include the previously examined characters from str,
960  // unless use_only_first_uft8_step is true.
961  if (unichar_ptr < unichar_end) {
962  if (modified_context == nullptr) {
963  size_t context_len = strlen(context);
964  modified_context =
965  new char[context_len + strlen(unichar_ptr) + step + 1];
966  memcpy(modified_context, context, context_len);
967  modified_context_end = modified_context + context_len;
968  context_ptr = modified_context;
969  }
970  strncpy(modified_context_end, unichar_ptr - step, step);
971  modified_context_end += step;
972  *modified_context_end = '\0';
973  }
974  }
975  prob /= static_cast<float>(*unichar_step_len); // normalize
976  if (prob < language_model_ngram_small_prob) {
977  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
978  *found_small_prob = true;
980  }
981  *ngram_cost = -1.0*log2(prob);
982  float ngram_and_classifier_cost =
983  -1.0*log2(CertaintyScore(certainty)/denom) +
984  *ngram_cost * language_model_ngram_scale_factor;
985  if (language_model_debug_level > 1) {
986  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
987  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
988  ngram_and_classifier_cost);
989  }
990  delete[] modified_context;
991  return ngram_and_classifier_cost;
992 }

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry * vse,
WERD_RES * word_res,
DANGERR * fixpt,
BlamerBundle * blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1390 of file language_model.cpp.

1395  {
1396  if (truth_path != nullptr) {
1397  *truth_path =
1398  (blamer_bundle != nullptr &&
1399  vse->length == blamer_bundle->correct_segmentation_length());
1400  }
1401  BLOB_CHOICE *curr_b = vse->curr_b;
1402  ViterbiStateEntry *curr_vse = vse;
1403 
1404  int i;
1405  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1406 
1407  // Re-compute the variance of the width-to-height ratios (since we now
1408  // can compute the mean over the whole word).
1409  float full_wh_ratio_mean = 0.0f;
1410  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1411  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1412  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1413  static_cast<float>(vse->length));
1414  vse->associate_stats.full_wh_ratio_var = 0.0f;
1415  }
1416 
1417  // Construct a WERD_CHOICE by tracing parent pointers.
1418  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1419  word->set_length(vse->length);
1420  int total_blobs = 0;
1421  for (i = (vse->length-1); i >= 0; --i) {
1422  if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
1423  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1424  *truth_path = false;
1425  }
1426  // The number of blobs used for this choice is row - col + 1.
1427  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1428  total_blobs += num_blobs;
1429  word->set_blob_choice(i, num_blobs, curr_b);
1430  // Update the width-to-height ratio variance. Useful for non-space delimited
1431  // languages to ensure that the blobs are of uniform width.
1432  // Skip leading and trailing punctuation when computing the variance.
1433  if ((full_wh_ratio_mean != 0.0f &&
1434  ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
1435  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1436  vse->associate_stats.full_wh_ratio_var +=
1437  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1438  if (language_model_debug_level > 2) {
1439  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1440  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1441  }
1442  }
1443 
1444  // Mark the word as compound if compound permuter was set for any of
1445  // the unichars on the path (usually this will happen for unichars
1446  // that are compounding operators, like "-" and "/").
1447  if (!compound && curr_vse->dawg_info &&
1448  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1449 
1450  // Update curr_* pointers.
1451  curr_vse = curr_vse->parent_vse;
1452  if (curr_vse == nullptr) break;
1453  curr_b = curr_vse->curr_b;
1454  }
1455  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1456  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1457  // Re-adjust shape cost to include the updated width-to-height variance.
1458  if (full_wh_ratio_mean != 0.0f) {
1459  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1460  }
1461 
1462  word->set_rating(vse->ratings_sum);
1463  word->set_certainty(vse->min_certainty);
1464  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1465  vse->consistency_info.BodyMaxXHeight());
1466  if (vse->dawg_info != nullptr) {
1467  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1468  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1469  word->set_permuter(NGRAM_PERM);
1470  } else if (vse->top_choice_flags) {
1471  word->set_permuter(TOP_CHOICE_PERM);
1472  } else {
1473  word->set_permuter(NO_PERM);
1474  }
1475  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1476  word_res->ratings));
1477  return word;
1478 }

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry & vse,
float  features[] 
)
static

Definition at line 1341 of file language_model.cpp.

1342  {
1343  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1344  // Record dictionary match info.
1345  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1346  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1347  if (vse.dawg_info != nullptr) {
1348  int permuter = vse.dawg_info->permuter;
1349  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1350  if (vse.consistency_info.num_digits == vse.length) {
1351  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1352  } else {
1353  features[PTRAIN_NUM_SHORT+len] = 1.0;
1354  }
1355  } else if (permuter == DOC_DAWG_PERM) {
1356  features[PTRAIN_DOC_SHORT+len] = 1.0;
1357  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1358  permuter == COMPOUND_PERM) {
1359  features[PTRAIN_DICT_SHORT+len] = 1.0;
1360  } else if (permuter == FREQ_DAWG_PERM) {
1361  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1362  }
1363  }
1364  // Record shape cost feature (normalized by path length).
1365  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1366  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1367  // Record ngram cost. (normalized by the path length).
1368  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1369  if (vse.ngram_info != nullptr) {
1370  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1371  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1372  }
1373  // Record consistency-related features.
1374  // Disabled this feature for now due to its poor performance.
1375  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1376  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1377  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1378  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
1379  vse.consistency_info.NumInconsistentChartype() : 0.0;
1380  features[PTRAIN_NUM_BAD_SPACING] =
1381  vse.consistency_info.NumInconsistentSpaces();
1382  // Disabled this feature for now due to its poor performance.
1383  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1384 
1385  // Classifier-related features.
1386  features[PTRAIN_RATING_PER_CHAR] =
1387  vse.ratings_sum / static_cast<float>(vse.outline_length);
1388 }

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
LMConsistencyInfo *  consistency_info 
)
protected

Definition at line 1015 of file language_model.cpp.

1021  {
1022  const UNICHARSET &unicharset = dict_->getUnicharset();
1023  UNICHAR_ID unichar_id = b->unichar_id();
1024  BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
1025 
1026  // Check punctuation validity.
1027  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1028  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
1029  if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
1030  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1031  unicharset.get_isdigit(parent_b->unichar_id()))) {
1032  // reset punc_ref for compound words
1033  consistency_info->punc_ref = NO_EDGE;
1034  } else {
1035  bool is_apos = dict_->is_apostrophe(unichar_id);
1036  bool prev_is_numalpha = (parent_b != nullptr &&
1037  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1038  unicharset.get_isdigit(parent_b->unichar_id())));
1039  UNICHAR_ID pattern_unichar_id =
1040  (unicharset.get_isalpha(unichar_id) ||
1041  unicharset.get_isdigit(unichar_id) ||
1042  (is_apos && prev_is_numalpha)) ?
1043  Dawg::kPatternUnicharID : unichar_id;
1044  if (consistency_info->punc_ref == NO_EDGE ||
1045  pattern_unichar_id != Dawg::kPatternUnicharID ||
1046  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1049  consistency_info->punc_ref);
1050  consistency_info->punc_ref =
1051  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1052  node, pattern_unichar_id, word_end) : NO_EDGE;
1053  if (consistency_info->punc_ref == NO_EDGE) {
1054  consistency_info->invalid_punc = true;
1055  }
1056  }
1057  }
1058  }
1059 
1060  // Update case related counters.
1061  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
1062  // Reset counters if we are dealing with a compound word.
1063  consistency_info->num_lower = 0;
1064  consistency_info->num_non_first_upper = 0;
1065  }
1066  else if (unicharset.get_islower(unichar_id)) {
1067  consistency_info->num_lower++;
1068  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
1069  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1070  consistency_info->num_lower > 0 ||
1071  consistency_info->num_non_first_upper > 0) {
1072  consistency_info->num_non_first_upper++;
1073  }
1074  }
1075 
1076  // Initialize consistency_info->script_id (use script of unichar_id
1077  // if it is not Common, use script id recorded by the parent otherwise).
1078  // Set inconsistent_script to true if the script of the current unichar
1079  // is not consistent with that of the parent.
1080  consistency_info->script_id = unicharset.get_script(unichar_id);
1081  // Hiragana and Katakana can mix with Han.
1083  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1084  consistency_info->script_id == unicharset.hiragana_sid()) ||
1085  (unicharset.katakana_sid() != unicharset.null_sid() &&
1086  consistency_info->script_id == unicharset.katakana_sid())) {
1087  consistency_info->script_id = dict_->getUnicharset().han_sid();
1088  }
1089  }
1090 
1091  if (parent_vse != nullptr &&
1092  (parent_vse->consistency_info.script_id !=
1093  dict_->getUnicharset().common_sid())) {
1094  int parent_script_id = parent_vse->consistency_info.script_id;
1095  // If script_id is Common, use script id of the parent instead.
1096  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1097  consistency_info->script_id = parent_script_id;
1098  }
1099  if (consistency_info->script_id != parent_script_id) {
1100  consistency_info->inconsistent_script = true;
1101  }
1102  }
1103 
1104  // Update chartype related counters.
1105  if (unicharset.get_isalpha(unichar_id)) {
1106  consistency_info->num_alphas++;
1107  } else if (unicharset.get_isdigit(unichar_id)) {
1108  consistency_info->num_digits++;
1109  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1110  consistency_info->num_other++;
1111  }
1112 
1113  // Check font and spacing consistency.
1114  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
1115  int fontinfo_id = -1;
1116  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1117  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1118  fontinfo_id = b->fontinfo_id();
1119  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1120  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1121  fontinfo_id = b->fontinfo_id2();
1122  }
1123  if(language_model_debug_level > 1) {
1124  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1125  (parent_b->fontinfo_id() >= 0) ?
1126  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1127  (parent_b->fontinfo_id2() >= 0) ?
1128  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1129  (b->fontinfo_id() >= 0) ?
1130  fontinfo_table_->get(b->fontinfo_id()).name : "",
1131  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1132  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1133  fontinfo_id);
1134  }
1135  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1136  bool expected_gap_found = false;
1137  float expected_gap = 0.0f;
1138  int temp_gap;
1139  if (fontinfo_id >= 0) { // found a common font
1140  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1141  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1142  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1143  expected_gap = temp_gap;
1144  expected_gap_found = true;
1145  }
1146  } else {
1147  consistency_info->inconsistent_font = true;
1148  // Get an average of the expected gaps in each font
1149  int num_addends = 0;
1150  int temp_fid;
1151  for (int i = 0; i < 4; ++i) {
1152  if (i == 0) {
1153  temp_fid = parent_b->fontinfo_id();
1154  } else if (i == 1) {
1155  temp_fid = parent_b->fontinfo_id2();
1156  } else if (i == 2) {
1157  temp_fid = b->fontinfo_id();
1158  } else {
1159  temp_fid = b->fontinfo_id2();
1160  }
1161  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1162  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1163  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1164  expected_gap += temp_gap;
1165  num_addends++;
1166  }
1167  }
1168  if (num_addends > 0) {
1169  expected_gap /= static_cast<float>(num_addends);
1170  expected_gap_found = true;
1171  }
1172  }
1173  if (expected_gap_found) {
1174  int actual_gap = word_res->GetBlobsGap(curr_col-1);
1175  if (actual_gap == 0) {
1176  consistency_info->num_inconsistent_spaces++;
1177  } else {
1178  float gap_ratio = expected_gap / actual_gap;
1179  // TODO(rays) The gaps seem to be way off most of the time, saved by
1180  // the error here that the ratio was compared to 1/2, when it should
1181  // have been 0.5f. Find the source of the gaps discrepancy and put
1182  // the 0.5f here in place of 0.0f.
1183  // Test on 2476595.sj, pages 0 to 6. (In French.)
1184  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1185  consistency_info->num_inconsistent_spaces++;
1186  }
1187  }
1188  if (language_model_debug_level > 1) {
1189  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1190  unicharset.id_to_unichar(parent_b->unichar_id()),
1191  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1192  unichar_id, curr_col, expected_gap, actual_gap);
1193  }
1194  }
1195  }
1196  }
1197 }

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 785 of file language_model.cpp.

789  {
790  // Initialize active_dawgs from parent_vse if it is not nullptr.
791  // Otherwise use very_beginning_active_dawgs_.
792  if (parent_vse == nullptr) {
795  } else {
796  if (parent_vse->dawg_info == nullptr) return nullptr; // not a dict word path
797  dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
798  dawg_args_.permuter = parent_vse->dawg_info->permuter;
799  }
800 
801  // Deal with hyphenated words.
802  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
803  b.unichar_id(), curr_col == 0)) {
804  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
805  return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
806  }
807 
808  // Deal with compound words.
809  if (dict_->compound_marker(b.unichar_id()) &&
810  (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
811  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
812  // Do not allow compound operators at the beginning and end of the word.
813  // Do not allow more than one compound operator per word.
814  // Do not allow compounding of words with lengths shorter than
815  // language_model_min_compound_length
816  if (parent_vse == nullptr || word_end ||
818  parent_vse->length < language_model_min_compound_length)
819  return nullptr;
820 
821  int i;
822  // Check that the path terminated before the current character is a word.
823  bool has_word_ending = false;
824  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
825  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
826  const Dawg *pdawg = pos.dawg_index < 0
827  ? nullptr : dict_->GetDawg(pos.dawg_index);
828  if (pdawg == nullptr || pos.back_to_punc) continue;;
829  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
830  pdawg->end_of_word(pos.dawg_ref)) {
831  has_word_ending = true;
832  break;
833  }
834  }
835  if (!has_word_ending) return nullptr;
836 
837  if (language_model_debug_level > 0) tprintf("Compound word found\n");
838  return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
839  } // done dealing with compound words
840 
841  LanguageModelDawgInfo *dawg_info = nullptr;
842 
843  // Call LetterIsOkay().
844  // Use the normalized IDs so that all shapes of ' can be allowed in words
845  // like don't.
846  const GenericVector<UNICHAR_ID>& normed_ids =
848  DawgPositionVector tmp_active_dawgs;
849  for (int i = 0; i < normed_ids.size(); ++i) {
851  tprintf("Test Letter OK for unichar %d, normed %d\n",
852  b.unichar_id(), normed_ids[i]);
853  dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
854  word_end && i == normed_ids.size() - 1);
855  if (dawg_args_.permuter == NO_PERM) {
856  break;
857  } else if (i < normed_ids.size() - 1) {
858  tmp_active_dawgs = *dawg_args_.updated_dawgs;
859  dawg_args_.active_dawgs = &tmp_active_dawgs;
860  }
862  tprintf("Letter was OK for unichar %d, normed %d\n",
863  b.unichar_id(), normed_ids[i]);
864  }
865  dawg_args_.active_dawgs = nullptr;
866  if (dawg_args_.permuter != NO_PERM) {
867  dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
869  } else if (language_model_debug_level > 3) {
870  tprintf("Letter %s not OK!\n",
872  }
873 
874  return dawg_info;
875 }

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 877 of file language_model.cpp.

880  {
881  // Initialize parent context.
882  const char *pcontext_ptr = "";
883  int pcontext_unichar_step_len = 0;
884  if (parent_vse == nullptr) {
885  pcontext_ptr = prev_word_str_.c_str();
886  pcontext_unichar_step_len = prev_word_unichar_step_len_;
887  } else {
888  pcontext_ptr = parent_vse->ngram_info->context.c_str();
889  pcontext_unichar_step_len =
890  parent_vse->ngram_info->context_unichar_step_len;
891  }
892  // Compute p(unichar | parent context).
893  int unichar_step_len = 0;
894  bool pruned = false;
895  float ngram_cost;
896  float ngram_and_classifier_cost =
897  ComputeNgramCost(unichar, certainty, denom,
898  pcontext_ptr, &unichar_step_len,
899  &pruned, &ngram_cost);
900  // Normalize just the ngram_and_classifier_cost by outline_length.
901  // The ngram_cost is used by the params_model, so it needs to be left as-is,
902  // and the params model cost will be normalized by outline_length.
903  ngram_and_classifier_cost *=
904  outline_length / language_model_ngram_rating_factor;
905  // Add the ngram_cost of the parent.
906  if (parent_vse != nullptr) {
907  ngram_and_classifier_cost +=
908  parent_vse->ngram_info->ngram_and_classifier_cost;
909  ngram_cost += parent_vse->ngram_info->ngram_cost;
910  }
911 
912  // Shorten parent context string by unichar_step_len unichars.
913  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
915  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
916  while (num_remove > 0 && *pcontext_ptr != '\0') {
917  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
918  --num_remove;
919  }
920 
921  // Decide whether to prune this ngram path and update changed accordingly.
922  if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;
923 
924  // Construct and return the new LanguageModelNgramInfo.
925  auto *ngram_info = new LanguageModelNgramInfo(
926  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
927  ngram_and_classifier_cost);
928  ngram_info->context += unichar;
929  ngram_info->context_unichar_step_len += unichar_step_len;
930  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
931  return ngram_info;
932 }

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry *  new_vse,
const ViterbiStateEntry *  parent_vse,
LanguageModelState *  lms 
)
protected

Definition at line 769 of file language_model.cpp.

771  {
772  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
773  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
774  new_vse->cost >= vit.data()->cost; vit.forward()) {
775  // Clear the appropriate flags if the list already contains
776  // a top choice entry with a lower cost.
777  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
778  }
779  if (language_model_debug_level > 2) {
780  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
781  new_vse->top_choice_flags);
782  }
783 }

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE *  bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET &  unicharset,
WERD_RES *  word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType *  top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 500 of file language_model.cpp.

504  {
505  for (; !vse_it->cycled_list(); vse_it->forward()) {
506  ViterbiStateEntry* parent_vse = vse_it->data();
507  // Only consider the parent if it has been updated or
508  // if the current ratings cell has just been classified.
509  if (!just_classified && !parent_vse->updated) continue;
511  parent_vse->Print("Considering");
512  // If the parent is non-alnum, then upper counts as lower.
513  *top_choice_flags = blob_choice_flags;
514  if ((blob_choice_flags & kUpperCaseFlag) &&
515  !parent_vse->HasAlnumChoice(unicharset)) {
516  *top_choice_flags |= kLowerCaseFlag;
517  }
518  *top_choice_flags &= parent_vse->top_choice_flags;
519  UNICHAR_ID unichar_id = bc->unichar_id();
520  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
521  UNICHAR_ID parent_id = parent_b->unichar_id();
522  // Digits do not bind to alphas if there is a mix in both parent and current
523  // or if the alpha is not the top choice.
524  if (unicharset.get_isdigit(unichar_id) &&
525  unicharset.get_isalpha(parent_id) &&
526  (mixed_alnum || *top_choice_flags == 0))
527  continue; // Digits don't bind to alphas.
528  // Likewise alphas do not bind to digits if there is a mix in both or if
529  // the digit is not the top choice.
530  if (unicharset.get_isalpha(unichar_id) &&
531  unicharset.get_isdigit(parent_id) &&
532  (mixed_alnum || *top_choice_flags == 0))
533  continue; // Alphas don't bind to digits.
534  // If there is a case mix of the same alpha in the parent list, then
535  // competing_vse is non-null and will be used to determine whether
536  // or not to bind the current blob choice.
537  if (parent_vse->competing_vse != nullptr) {
538  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
539  UNICHAR_ID other_id = competing_b->unichar_id();
540  if (language_model_debug_level >= 5) {
541  tprintf("Parent %s has competition %s\n",
542  unicharset.id_to_unichar(parent_id),
543  unicharset.id_to_unichar(other_id));
544  }
545  if (unicharset.SizesDistinct(parent_id, other_id)) {
546  // If other_id matches bc wrt position and size, and parent_id, doesn't,
547  // don't bind to the current parent.
548  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
550  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
552  continue; // Competing blobchoice has a better vertical match.
553  }
554  }
555  vse_it->forward();
556  return parent_vse; // This one is good!
557  }
558  return nullptr; // Ran out of possibilities.
559 }

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 108 of file language_model.h.

108 { return params_model_; }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 383 of file language_model.cpp.

386  {
387  BLOB_CHOICE_IT c_it(curr_list);
388  const UNICHARSET &unicharset = dict_->getUnicharset();
389  BLOB_CHOICE *first_unichar = nullptr;
390  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
391  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
392  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
393  if (first_unichar == nullptr) first_unichar = c_it.data();
394  if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
395  *first_lower = c_it.data();
396  }
397  if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
398  !unicharset.get_islower(unichar_id)) {
399  *first_upper = c_it.data();
400  }
401  if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
402  *first_digit = c_it.data();
403  }
404  }
405  ASSERT_HOST(first_unichar != nullptr);
406  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
407  *first_digit != nullptr;
408  if (*first_lower == nullptr) *first_lower = first_unichar;
409  if (*first_upper == nullptr) *first_upper = first_unichar;
410  if (*first_digit == nullptr) *first_digit = first_unichar;
411  return mixed;
412 }

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 136 of file language_model.cpp.

138  {
139  fixed_pitch_ = fixed_pitch;
140  max_char_wh_ratio_ = max_char_wh_ratio;
141  rating_cert_scale_ = rating_cert_scale;
142  acceptable_choice_found_ = false;
144 
145  // Initialize vectors with beginning DawgInfos.
150 
151  // Fill prev_word_str_ with the last language_model_ngram_order
152  // unichars from prev_word.
154  if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
155  prev_word_str_ = prev_word->unichar_string();
157  } else {
158  prev_word_str_ = " ";
159  }
160  const char *str_ptr = prev_word_str_.c_str();
161  const char *str_end = str_ptr + prev_word_str_.length();
162  int step;
164  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
165  str_ptr += step;
167  }
168  ASSERT_HOST(str_ptr == str_end);
169  }
170 }

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 299 of file language_model.h.

299  {
300  if (vse.top_choice_flags) return false;
301  if (vse.dawg_info != nullptr &&
302  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
303  vse.dawg_info->permuter == USER_DAWG_PERM ||
304  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
305  return true;
306  }

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 104 of file language_model.h.

104  {
106  }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState *  parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 423 of file language_model.cpp.

424  {
425  if (parent_node == nullptr) return -1;
426  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
427  ViterbiStateEntry* top_lower = nullptr;
428  ViterbiStateEntry* top_upper = nullptr;
429  ViterbiStateEntry* top_digit = nullptr;
430  ViterbiStateEntry* top_choice = nullptr;
431  float lower_rating = 0.0f;
432  float upper_rating = 0.0f;
433  float digit_rating = 0.0f;
434  float top_rating = 0.0f;
435  const UNICHARSET &unicharset = dict_->getUnicharset();
436  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
437  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
438  ViterbiStateEntry* vse = vit.data();
439  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
440  // back to the real character if needed.
441  ViterbiStateEntry* unichar_vse = vse;
442  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
443  float rating = unichar_vse->curr_b->rating();
444  while (unichar_id == INVALID_UNICHAR_ID &&
445  unichar_vse->parent_vse != nullptr) {
446  unichar_vse = unichar_vse->parent_vse;
447  unichar_id = unichar_vse->curr_b->unichar_id();
448  rating = unichar_vse->curr_b->rating();
449  }
450  if (unichar_id != INVALID_UNICHAR_ID) {
451  if (unicharset.get_islower(unichar_id)) {
452  if (top_lower == nullptr || lower_rating > rating) {
453  top_lower = vse;
454  lower_rating = rating;
455  }
456  } else if (unicharset.get_isalpha(unichar_id)) {
457  if (top_upper == nullptr || upper_rating > rating) {
458  top_upper = vse;
459  upper_rating = rating;
460  }
461  } else if (unicharset.get_isdigit(unichar_id)) {
462  if (top_digit == nullptr || digit_rating > rating) {
463  top_digit = vse;
464  digit_rating = rating;
465  }
466  }
467  }
468  if (top_choice == nullptr || top_rating > rating) {
469  top_choice = vse;
470  top_rating = rating;
471  top_id = unichar_id;
472  }
473  }
474  if (top_choice == nullptr) return -1;
475  bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
476  top_digit != nullptr;
477  if (top_lower == nullptr) top_lower = top_choice;
478  top_lower->top_choice_flags |= kLowerCaseFlag;
479  if (top_upper == nullptr) top_upper = top_choice;
480  top_upper->top_choice_flags |= kUpperCaseFlag;
481  if (top_digit == nullptr) top_digit = top_choice;
482  top_digit->top_choice_flags |= kDigitFlag;
483  top_choice->top_choice_flags |= kSmallestRatingFlag;
484  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
485  (top_choice->top_choice_flags &
487  // If the compound marker top choice carries any of the top alnum flags,
488  // then give it all of them, allowing words like I-295 to be chosen.
489  top_choice->top_choice_flags |=
491  }
492  return mixed ? 1 : 0;
493 }

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry *  vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 1241 of file language_model.cpp.

1246  {
1247  bool truth_path;
1248  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1249  blamer_bundle, &truth_path);
1250  ASSERT_HOST(word != nullptr);
1251  if (dict_->stopper_debug_level >= 1) {
1252  STRING word_str;
1253  word->string_and_lengths(&word_str, nullptr);
1254  vse->Print(word_str.c_str());
1255  }
1256  if (language_model_debug_level > 0) {
1257  word->print("UpdateBestChoice() constructed word");
1258  }
1259  // Record features from the current path if necessary.
1260  ParamsTrainingHypothesis curr_hyp;
1261  if (blamer_bundle != nullptr) {
1262  if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
1263  static_cast<PermuterType>(word->permuter());
1264  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1265  word->string_and_lengths(&(curr_hyp.str), nullptr);
1266  curr_hyp.cost = vse->cost; // record cost for error rate computations
1267  if (language_model_debug_level > 0) {
1268  tprintf("Raw features extracted from %s (cost=%g) [ ",
1269  curr_hyp.str.c_str(), curr_hyp.cost);
1270  for (float feature : curr_hyp.features) {
1271  tprintf("%g ", feature);
1272  }
1273  tprintf("]\n");
1274  }
1275  // Record the current hypothesis in params_training_bundle.
1276  blamer_bundle->AddHypothesis(curr_hyp);
1277  if (truth_path)
1278  blamer_bundle->UpdateBestRating(word->rating());
1279  }
1280  if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1281  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1282  // we no longer need it.
1283  delete word;
1284  return;
1285  }
1286  if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty())
1288  // Update and log new raw_choice if needed.
1289  if (word_res->raw_choice == nullptr ||
1290  word->rating() < word_res->raw_choice->rating()) {
1291  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1292  tprintf("Updated raw choice\n");
1293  }
1294  // Set the modified rating for best choice to vse->cost and log best choice.
1295  word->set_rating(vse->cost);
1296  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1297  // computes adjust_factor that is used by the adaption code (e.g. by
1298  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1299  // Note: the rating of the word is not adjusted.
1300  dict_->adjust_word(word, vse->dawg_info == nullptr,
1301  vse->consistency_info.xht_decision, 0.0,
1302  false, language_model_debug_level > 0);
1303  // Hand ownership of the word over to the word_res.
1305  dict_->stopper_debug_level >= 1, word)) {
1306  // The word was so bad that it was deleted.
1307  return;
1308  }
1309  if (word_res->best_choice == word) {
1310  // Word was the new best.
1311  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1312  AcceptablePath(*vse)) {
1313  acceptable_choice_found_ = true;
1314  }
1315  // Update best_choice_bundle.
1316  best_choice_bundle->updated = true;
1317  best_choice_bundle->best_vse = vse;
1318  if (language_model_debug_level > 0) {
1319  tprintf("Updated best choice\n");
1320  word->print_state("New state ");
1321  }
1322  // Update hyphen state if we are dealing with a dictionary word.
1323  if (vse->dawg_info != nullptr) {
1324  if (dict_->has_hyphen_end(*word)) {
1326  } else {
1327  dict_->reset_hyphen_vars(true);
1328  }
1329  }
1330 
1331  if (blamer_bundle != nullptr) {
1333  vse->dawg_info != nullptr && vse->top_choice_flags);
1334  }
1335  }
1336  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1337  word->DisplaySegmentation(word_res->chopped_word);
1338  }
1339 }

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState *  parent_node,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower- case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 253 of file language_model.cpp.

261  {
262  if (language_model_debug_level > 0) {
263  tprintf("\nUpdateState: col=%d row=%d %s",
264  curr_col, curr_row, just_classified ? "just_classified" : "");
266  tprintf("(parent=%p)\n", parent_node);
267  else
268  tprintf("\n");
269  }
270  // Initialize helper variables.
271  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
272  bool new_changed = false;
273  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
274  const UNICHARSET& unicharset = dict_->getUnicharset();
275  BLOB_CHOICE *first_lower = nullptr;
276  BLOB_CHOICE *first_upper = nullptr;
277  BLOB_CHOICE *first_digit = nullptr;
278  bool has_alnum_mix = false;
279  if (parent_node != nullptr) {
280  int result = SetTopParentLowerUpperDigit(parent_node);
281  if (result < 0) {
283  tprintf("No parents found to process\n");
284  return false;
285  }
286  if (result > 0)
287  has_alnum_mix = true;
288  }
289  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
290  &first_digit))
291  has_alnum_mix = false;;
292  ScanParentsForCaseMix(unicharset, parent_node);
293  if (language_model_debug_level > 3 && parent_node != nullptr) {
294  parent_node->Print("Parent viterbi list");
295  }
296  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
297 
298  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
299  ViterbiStateEntry_IT vit;
300  BLOB_CHOICE_IT c_it(curr_list);
301  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
302  BLOB_CHOICE* choice = c_it.data();
303  // TODO(antonova): make sure commenting this out if ok for ngram
304  // model scoring (I think this was introduced to fix ngram model quirks).
305  // Skip nullptr unichars unless it is the only choice.
306  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
307  UNICHAR_ID unichar_id = choice->unichar_id();
308  if (unicharset.get_fragment(unichar_id)) {
309  continue; // Skip fragments.
310  }
311  // Set top choice flags.
312  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
313  if (c_it.at_first() || !new_changed)
314  blob_choice_flags |= kSmallestRatingFlag;
315  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
316  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
317  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
318 
319  if (parent_node == nullptr) {
320  // Process the beginning of a word.
321  // If there is a better case variant that is not distinguished by size,
322  // skip this blob choice, as we have no choice but to accept the result
323  // of the character classifier to distinguish between them, even if
324  // followed by an upper case.
325  // With words like iPoc, and other CamelBackWords, the lower-upper
326  // transition can only be achieved if the classifier has the correct case
327  // as the top choice, and leaving an initial I lower down the list
328  // increases the chances of choosing IPoc simply because it doesn't
329  // include such a transition. iPoc will beat iPOC and ipoc because
330  // the other words are baseline/x-height inconsistent.
331  if (HasBetterCaseVariant(unicharset, choice, curr_list))
332  continue;
333  // Upper counts as lower at the beginning of a word.
334  if (blob_choice_flags & kUpperCaseFlag)
335  blob_choice_flags |= kLowerCaseFlag;
336  new_changed |= AddViterbiStateEntry(
337  blob_choice_flags, denom, word_end, curr_col, curr_row,
338  choice, curr_state, nullptr, pain_points,
339  word_res, best_choice_bundle, blamer_bundle);
340  } else {
341  // Get viterbi entries from each parent ViterbiStateEntry.
342  vit.set_to_list(&parent_node->viterbi_state_entries);
343  int vit_counter = 0;
344  vit.mark_cycle_pt();
345  ViterbiStateEntry* parent_vse = nullptr;
346  LanguageModelFlagsType top_choice_flags;
347  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
348  c_it.data(), blob_choice_flags,
349  unicharset, word_res, &vit,
350  &top_choice_flags)) != nullptr) {
351  // Skip pruned entries and do not look at prunable entries if already
352  // examined language_model_viterbi_list_max_num_prunable of those.
353  if (PrunablePath(*parent_vse) &&
355  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
356  continue;
357  }
358  // If the parent has no alnum choice, (ie choice is the first in a
359  // string of alnum), and there is a better case variant that is not
360  // distinguished by size, skip this blob choice/parent, as with the
361  // initial blob treatment above.
362  if (!parent_vse->HasAlnumChoice(unicharset) &&
363  HasBetterCaseVariant(unicharset, choice, curr_list))
364  continue;
365  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
366  // looks good according to the Dawgs or character ngram model.
367  new_changed |= AddViterbiStateEntry(
368  top_choice_flags, denom, word_end, curr_col, curr_row,
369  c_it.data(), curr_state, parent_vse, pain_points,
370  word_res, best_choice_bundle, blamer_bundle);
371  }
372  }
373  }
374  return new_changed;
375 }

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_ = false
protected

Definition at line 416 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 404 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_ = false
protected

Definition at line 418 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 364 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_ = nullptr
protected

Definition at line 383 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_ = false
protected

Definition at line 390 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_ = nullptr
protected

Definition at line 379 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 56 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 54 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 61 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 53 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 55 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 57 of file language_model.h.

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 316 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 343 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 330 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 318 of file language_model.h.

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 320 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings when multiplied by outline length"

Definition at line 339 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the character classifier"

Definition at line 336 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"

Definition at line 328 of file language_model.h.

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 341 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string when computing log probabilities"

Definition at line 333 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 352 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 356 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 358 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 361 of file language_model.h.

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 348 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 346 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 350 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 354 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 360 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 364 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 323 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 325 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_ = 0.0f
protected

Definition at line 393 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 421 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 400 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_ = 0
protected

Definition at line 401 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_ = 0.0f
protected

Definition at line 374 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 403 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 362 of file language_model.h.


The documentation for this class was generated from the following files:
tesseract::LanguageModel::prev_word_str_
STRING prev_word_str_
Definition: language_model.h:400
tesseract::Dawg::edge_letter
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::LanguageModel::AddViterbiStateEntry
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:561
tesseract::DawgPositionVector::clear
void clear()
Definition: dawg.h:377
BlamerBundle::correct_segmentation_length
int correct_segmentation_length() const
Definition: blamer.h:141
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:434
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
tesseract::LanguageModel::language_model_ngram_on
bool language_model_ngram_on
Definition: language_model.h:318
tesseract::PTRAIN_RATING_PER_CHAR
Definition: params_training_featdef.h:68
tesseract::LanguageModel::wordrec_display_segmentations
int wordrec_display_segmentations
Definition: language_model.h:362
tesseract::LanguageModel::params_model_
ParamsModel params_model_
Definition: language_model.h:421
tesseract::LanguageModel::language_model_penalty_non_dict_word
double language_model_penalty_non_dict_word
Definition: language_model.h:348
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
tesseract::PTRAIN_DIGITS_SHORT
Definition: params_training_featdef.h:41
tesseract::LanguageModel::correct_segmentation_explored_
bool correct_segmentation_explored_
Definition: language_model.h:418
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:600
tesseract::LanguageModel::language_model_penalty_script
double language_model_penalty_script
Definition: language_model.h:354
tesseract::LanguageModel::kXhtConsistentFlag
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:57
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
tesseract::LanguageModel::max_char_wh_ratio_
float max_char_wh_ratio_
Definition: language_model.h:393
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
tesseract::LanguageModel::language_model_penalty_spacing
double language_model_penalty_spacing
Definition: language_model.h:360
tesseract::LanguageModel::language_model_penalty_non_freq_dict_word
double language_model_penalty_non_freq_dict_word
Definition: language_model.h:346
NO_PERM
Definition: ratngs.h:231
tesseract::LanguageModel::GenerateNgramInfo
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:877
tesseract::LanguageModel::very_beginning_active_dawgs_
DawgPositionVector very_beginning_active_dawgs_
Definition: language_model.h:403
tesseract::LanguageModel::dict_
Dict * dict_
Definition: language_model.h:383
STRING
Definition: strngs.h:45
WERD_RES::x_height
float x_height
Definition: pageres.h:310
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
tesseract::LanguageModel::ComputeAdjustedPathCost
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1199
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
COMPOUND_PERM
Definition: ratngs.h:243
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:56
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
tesseract::LanguageModel::dawg_args_
DawgArgs dawg_args_
Definition: language_model.h:364
tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
tesseract::PTRAIN_SHAPE_COST_PER_CHAR
Definition: params_training_featdef.h:60
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
tesseract::PTRAIN_NUM_BAD_CASE
Definition: params_training_featdef.h:63
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
BlamerBundle::MatrixPositionCorrect
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:146
BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
BLOB_CHOICE::matrix_cell
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:115
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition: params_training_featdef.h:70
tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
tesseract::PTRAIN_DICT_SHORT
Definition: params_training_featdef.h:53
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:552
tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:376
tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step
bool language_model_ngram_use_only_first_uft8_step
Definition: language_model.h:333
BlamerBundle::UpdateBestRating
void UpdateBestRating(float rating)
Definition: blamer.h:137
tesseract::LanguageModel::language_model_ngram_small_prob
double language_model_ngram_small_prob
Definition: language_model.h:328
tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:86
tesseract::LanguageModel::language_model_ngram_rating_factor
double language_model_ngram_rating_factor
Definition: language_model.h:339
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::LanguageModel::rating_cert_scale_
float rating_cert_scale_
Definition: language_model.h:374
WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:616
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::LanguageModel::language_model_ngram_space_delimited_language
bool language_model_ngram_space_delimited_language
Definition: language_model.h:341
tesseract::PTRAIN_DOC_SHORT
Definition: params_training_featdef.h:49
tesseract::Dict::set_hyphen_word
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:59
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:85
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
tesseract::LanguageModel::GetTopLowerUpperDigit
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:383
BlamerBundle::GuidedSegsearchStillGoing
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:512
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
tesseract::UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:627
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:880
tesseract::PTRAIN_XHEIGHT_CONSISTENCY
Definition: params_training_featdef.h:64
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
mixed
Definition: cluster.h:43
tesseract::LanguageModel::language_model_penalty_case
double language_model_penalty_case
Definition: language_model.h:352
tesseract::LanguageModelFlagsType
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
NGRAM_PERM
Definition: ratngs.h:236
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:357
tesseract::LanguageModel::ComputeAdjustment
float ComputeAdjustment(int num_problems, float penalty)
Definition: language_model.h:124
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
tesseract::LanguageModel::language_model_ngram_order
int language_model_ngram_order
Definition: language_model.h:320
tesseract::LanguageModel::ComputeDenom
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:994
BlamerBundle::set_best_choice_is_dict_and_top_choice
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:150
tesseract::LanguageModel::GenerateDawgInfo
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:785
tesseract::LanguageModel::fixed_pitch_
bool fixed_pitch_
Definition: language_model.h:390
WERD_CHOICE::DisplaySegmentation
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:763
tesseract::LanguageModel::kUpperCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:55
TOP_CHOICE_PERM
Definition: ratngs.h:233
tesseract::LanguageModel::prev_word_unichar_step_len_
int prev_word_unichar_step_len_
Definition: language_model.h:401
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:113
tesseract::LanguageModel::acceptable_choice_found_
bool acceptable_choice_found_
Definition: language_model.h:416
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::LanguageModel::fontinfo_table_
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:379
USER_PATTERN_PERM
Definition: ratngs.h:238
GenericVector< UNICHAR_ID >
tesseract::LanguageModel::language_model_use_sigmoidal_certainty
bool language_model_use_sigmoidal_certainty
Definition: language_model.h:364
UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:875
tesseract::LanguageModel::language_model_penalty_increment
double language_model_penalty_increment
Definition: language_model.h:361
tesseract::LanguageModel::ComputeConsistencyAdjustment
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:135
tesseract::LanguageModel::GenerateTopChoiceInfo
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:769
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
tesseract::ViterbiStateEntry::Compare
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:132
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
WERD_CHOICE::print_state
void print_state(const char *msg) const
Definition: ratngs.cpp:754
WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:736
tesseract::ParamsModel::ComputeCost
float ComputeCost(const float features[]) const
Definition: params_model.cpp:80
tesseract::LanguageModel::GetNextParentVSE
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:500
BLOB_CHOICE::PosAndSizeAgree
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:154
tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable
int language_model_viterbi_list_max_num_prunable
Definition: language_model.h:323
tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
tesseract::LanguageModel::ExtractFeaturesFromPath
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1341
tesseract::ParamsModel::Initialized
bool Initialized()
Definition: params_model.h:44
tesseract::AssociateUtils::ComputeOutlineLength
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
STRING::length
int32_t length() const
Definition: strngs.cpp:187
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
BLOB_CHOICE::fontinfo_id2
int16_t fontinfo_id2() const
Definition: ratngs.h:87
tesseract::LanguageModel::ConstructWord
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1390
tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::Dict::reset_hyphen_vars
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:42
tesseract::LanguageModel::language_model_penalty_chartype
double language_model_penalty_chartype
Definition: language_model.h:356
tesseract::LanguageModel::CertaintyScore
float CertaintyScore(float cert)
Definition: language_model.h:112
tesseract::LanguageModel::language_model_min_compound_length
int language_model_min_compound_length
Definition: language_model.h:343
tesseract::PTRAIN_NGRAM_COST_PER_CHAR
Definition: params_training_featdef.h:61
MATRIX_COORD::col
int col
Definition: matrix.h:632
tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
tesseract::LanguageModel::language_model_viterbi_list_max_size
int language_model_viterbi_list_max_size
Definition: language_model.h:325
tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:390
tesseract::LanguageModel::kLowerCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:54
tesseract::PTRAIN_NUM_BAD_SPACING
Definition: params_training_featdef.h:66
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
tesseract::PTRAIN_NUM_BAD_CHAR_TYPE
Definition: params_training_featdef.h:65
tesseract::LanguageModel::language_model_ngram_nonmatch_score
double language_model_ngram_nonmatch_score
Definition: language_model.h:330
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::LanguageModel::FillConsistencyInfo
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1015
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:432
tesseract::AssociateUtils::ComputeStats
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::LanguageModel::SetTopParentLowerUpperDigit
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:423
MATRIX_COORD::row
int row
Definition: matrix.h:633
UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:485
tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:84
tesseract::LanguageModel::kSmallestRatingFlag
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:53
tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:158
tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:124
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
BlamerBundle::AddHypothesis
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:170
tesseract::LanguageModel::kDigitFlag
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:56
tesseract::LanguageModel::beginning_active_dawgs_
DawgPositionVector beginning_active_dawgs_
Definition: language_model.h:404
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::LanguageModel::ComputeAssociateStats
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:280
tesseract::LanguageModel::language_model_penalty_font
double language_model_penalty_font
Definition: language_model.h:358
tesseract::LanguageModel::language_model_debug_level
int language_model_debug_level
Definition: language_model.h:316
tesseract::LanguageModel::ComputeNgramCost
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:934
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
DOC_DAWG_PERM
Definition: ratngs.h:240
tesseract::LanguageModel::PrunablePath
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:299
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::PTRAIN_NUM_SHORT
Definition: params_training_featdef.h:45
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::LanguageModel::language_model_penalty_punc
double language_model_penalty_punc
Definition: language_model.h:350
UNICHARSET::size
int size() const
Definition: unicharset.h:341
NUMBER_PERM
Definition: ratngs.h:237
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50
USER_DAWG_PERM
Definition: ratngs.h:241
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
tesseract::LanguageModel::AcceptablePath
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:309
tesseract::LanguageModel::UpdateBestChoice
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1241
tesseract::LanguageModel::language_model_ngram_scale_factor
double language_model_ngram_scale_factor
Definition: language_model.h:336
tesseract::PTRAIN_FREQ_SHORT
Definition: params_training_featdef.h:57