tesseract  4.0.0-1-g2a2b
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Detailed Description

Definition at line 51 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 54 of file language_model.cpp.

56  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
57  dict->getCCUtil()->params()),
59  "Turn on/off the use of character ngram model",
60  dict->getCCUtil()->params()),
62  "Maximum order of the character ngram model",
63  dict->getCCUtil()->params()),
65  "Maximum number of prunable (those for which"
66  " PrunablePath() is true) entries in each viterbi list"
67  " recorded in BLOB_CHOICEs",
68  dict->getCCUtil()->params()),
70  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
71  dict->getCCUtil()->params()),
73  "To avoid overly small denominators use this as the "
74  "floor of the probability returned by the ngram model.",
75  dict->getCCUtil()->params()),
77  "Average classifier score of a non-matching unichar.",
78  dict->getCCUtil()->params()),
80  "Use only the first UTF8 step of the given string"
81  " when computing log probabilities.",
82  dict->getCCUtil()->params()),
84  "Strength of the character ngram model relative to the"
85  " character classifier ",
86  dict->getCCUtil()->params()),
88  "Factor to bring log-probs into the same range as ratings"
89  " when multiplied by outline length ",
90  dict->getCCUtil()->params()),
92  "Words are delimited by space", dict->getCCUtil()->params()),
94  "Minimum length of compound words",
95  dict->getCCUtil()->params()),
97  "Penalty for words not in the frequent word dictionary",
98  dict->getCCUtil()->params()),
100  "Penalty for non-dictionary words",
101  dict->getCCUtil()->params()),
103  "Penalty for inconsistent punctuation",
104  dict->getCCUtil()->params()),
106  "Penalty for inconsistent case",
107  dict->getCCUtil()->params()),
109  "Penalty for inconsistent script",
110  dict->getCCUtil()->params()),
112  "Penalty for inconsistent character type",
113  dict->getCCUtil()->params()),
114  // TODO(daria, rays): enable font consistency checking
115  // after improving font analysis.
117  "Penalty for inconsistent font",
118  dict->getCCUtil()->params()),
120  "Penalty for inconsistent spacing",
121  dict->getCCUtil()->params()),
122  double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
123  dict->getCCUtil()->params()),
124  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
125  dict->getCCUtil()->params()),
127  "Use sigmoidal score for certainty",
128  dict->getCCUtil()->params()),
129  dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
130  fontinfo_table_(fontinfo_table),
131  dict_(dict),
132  fixed_pitch_(false),
133  max_char_wh_ratio_(0.0),
134  acceptable_choice_found_(false) {
135  ASSERT_HOST(dict_ != nullptr);
136 }
int language_model_viterbi_list_max_num_prunable
bool language_model_ngram_space_delimited_language
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:303
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
double language_model_penalty_non_freq_dict_word
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
bool language_model_ngram_use_only_first_uft8_step
const UnicityTable< FontInfo > * fontinfo_table_
double language_model_penalty_non_dict_word
double language_model_ngram_nonmatch_score
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 138 of file language_model.cpp.

138 { delete dawg_args_.updated_dawgs; }
DawgPositionVector * updated_dawgs
Definition: dict.h:82

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 104 of file language_model.h.

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 310 of file language_model.h.

310  {
311  return (vse.dawg_info != nullptr || vse.Consistent() ||
312  (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
313  }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 565 of file language_model.cpp.

576  {
577  ViterbiStateEntry_IT vit;
578  if (language_model_debug_level > 1) {
579  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
580  " certainty=%.4f top_choice_flags=0x%x",
582  b->rating(), b->certainty(), top_choice_flags);
584  tprintf(" parent_vse=%p\n", parent_vse);
585  else
586  tprintf("\n");
587  }
588  ASSERT_HOST(curr_state != nullptr);
589  // Check whether the list is full.
590  if (curr_state->viterbi_state_entries_length >=
592  if (language_model_debug_level > 1) {
593  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
594  }
595  return false;
596  }
597 
598  // Invoke Dawg language model component.
599  LanguageModelDawgInfo *dawg_info =
600  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
601 
602  float outline_length =
604  // Invoke Ngram language model component.
605  LanguageModelNgramInfo *ngram_info = nullptr;
607  ngram_info = GenerateNgramInfo(
609  denom, curr_col, curr_row, outline_length, parent_vse);
610  ASSERT_HOST(ngram_info != nullptr);
611  }
612  bool liked_by_language_model = dawg_info != nullptr ||
613  (ngram_info != nullptr && !ngram_info->pruned);
614  // Quick escape if not liked by the language model, can't be consistent
615  // xheight, and not top choice.
616  if (!liked_by_language_model && top_choice_flags == 0) {
617  if (language_model_debug_level > 1) {
618  tprintf("Language model components very early pruned this entry\n");
619  }
620  delete ngram_info;
621  delete dawg_info;
622  return false;
623  }
624 
625  // Check consistency of the path and set the relevant consistency_info.
626  LMConsistencyInfo consistency_info(
627  parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
628  // Start with just the x-height consistency, as it provides significant
629  // pruning opportunity.
630  consistency_info.ComputeXheightConsistency(
632  // Turn off xheight consistent flag if not consistent.
633  if (consistency_info.InconsistentXHeight()) {
634  top_choice_flags &= ~kXhtConsistentFlag;
635  }
636 
637  // Quick escape if not liked by the language model, not consistent xheight,
638  // and not top choice.
639  if (!liked_by_language_model && top_choice_flags == 0) {
640  if (language_model_debug_level > 1) {
641  tprintf("Language model components early pruned this entry\n");
642  }
643  delete ngram_info;
644  delete dawg_info;
645  return false;
646  }
647 
648  // Compute the rest of the consistency info.
649  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
650  word_res, &consistency_info);
651  if (dawg_info != nullptr && consistency_info.invalid_punc) {
652  consistency_info.invalid_punc = false; // do not penalize dict words
653  }
654 
655  // Compute cost of associating the blobs that represent the current unichar.
656  AssociateStats associate_stats;
657  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
658  parent_vse, word_res, &associate_stats);
659  if (parent_vse != nullptr) {
660  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
661  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
662  }
663 
664  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
665  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
666  parent_vse, b, 0.0, outline_length,
667  consistency_info, associate_stats, top_choice_flags, dawg_info,
668  ngram_info, (language_model_debug_level > 0) ?
669  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
670  new_vse->cost = ComputeAdjustedPathCost(new_vse);
672  tprintf("Adjusted cost = %g\n", new_vse->cost);
673 
674  // Invoke Top Choice language model component to make the final adjustments
675  // to new_vse->top_choice_flags.
676  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
677  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
678  }
679 
680  // If language model components did not like this unichar - return.
681  bool keep = new_vse->top_choice_flags || liked_by_language_model;
682  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
683  consistency_info.inconsistent_script) { // with inconsistent script
684  keep = false;
685  }
686  if (!keep) {
687  if (language_model_debug_level > 1) {
688  tprintf("Language model components did not like this entry\n");
689  }
690  delete new_vse;
691  return false;
692  }
693 
694  // Discard this entry if it represents a prunable path and
695  // language_model_viterbi_list_max_num_prunable such entries with a lower
696  // cost have already been recorded.
697  if (PrunablePath(*new_vse) &&
698  (curr_state->viterbi_state_entries_prunable_length >=
700  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
701  if (language_model_debug_level > 1) {
702  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
703  new_vse->cost,
704  curr_state->viterbi_state_entries_prunable_max_cost);
705  }
706  delete new_vse;
707  return false;
708  }
709 
710  // Update best choice if needed.
711  if (word_end) {
712  UpdateBestChoice(new_vse, pain_points, word_res,
713  best_choice_bundle, blamer_bundle);
714  // Discard the entry if UpdateBestChoice() found flaws in it.
715  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
716  new_vse != best_choice_bundle->best_vse) {
717  if (language_model_debug_level > 1) {
718  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
719  }
720  delete new_vse;
721  return false;
722  }
723  }
724 
725  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
726  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
727  false, new_vse);
728  curr_state->viterbi_state_entries_length++;
729  if (PrunablePath(*new_vse)) {
730  curr_state->viterbi_state_entries_prunable_length++;
731  }
732 
733  // Update lms->viterbi_state_entries_prunable_max_cost and clear
734  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
735  if ((curr_state->viterbi_state_entries_prunable_length >=
737  new_vse->top_choice_flags) {
738  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
739  int prunable_counter = language_model_viterbi_list_max_num_prunable;
740  vit.set_to_list(&(curr_state->viterbi_state_entries));
741  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
742  ViterbiStateEntry *curr_vse = vit.data();
743  // Clear the appropriate top choice flags of the entries in the
744  // list that have cost higher thank new_entry->cost
745  // (since they will not be top choices any more).
746  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
747  curr_vse->cost > new_vse->cost) {
748  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
749  }
750  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
751  // Update curr_state->viterbi_state_entries_prunable_max_cost.
752  if (prunable_counter == 0) {
753  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
754  if (language_model_debug_level > 1) {
755  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
756  curr_state->viterbi_state_entries_prunable_max_cost);
757  }
758  prunable_counter = -1; // stop counting
759  }
760  }
761  }
762 
763  // Print the newly created ViterbiStateEntry.
764  if (language_model_debug_level > 2) {
765  new_vse->Print("New");
767  curr_state->Print("Updated viterbi list");
768  }
769 
770  return true;
771 }
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
static const float kBadRating
Definition: ratngs.h:275
float certainty() const
Definition: ratngs.h:83
int language_model_viterbi_list_max_num_prunable
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:128
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
static const LanguageModelFlagsType kXhtConsistentFlag
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
static const LanguageModelFlagsType kSmallestRatingFlag
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool PrunablePath(const ViterbiStateEntry &vse)
float rating() const
Definition: ratngs.h:80
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 113 of file language_model.h.

113  {
115  // cert is assumed to be between 0 and -dict_->certainty_scale.
116  // If you enable language_model_use_sigmoidal_certainty, you
117  // need to adjust language_model_ngram_nonmatch_score as well.
118  cert = -cert / dict_->certainty_scale;
119  return 1.0f / (1.0f + exp(10.0f * cert));
120  } else {
121  return (-1.0f / cert);
122  }
123  }
double certainty_scale
Definition: dict.h:611

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1199 of file language_model.cpp.

1199  {
1200  ASSERT_HOST(vse != nullptr);
1201  if (params_model_.Initialized()) {
1202  float features[PTRAIN_NUM_FEATURE_TYPES];
1203  ExtractFeaturesFromPath(*vse, features);
1204  float cost = params_model_.ComputeCost(features);
1205  if (language_model_debug_level > 3) {
1206  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1207  if (language_model_debug_level >= 5) {
1208  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1209  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1210  }
1211  }
1212  }
1213  return cost * vse->outline_length;
1214  } else {
1215  float adjustment = 1.0f;
1216  if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1218  }
1219  if (vse->dawg_info == nullptr) {
1221  if (vse->length > language_model_min_compound_length) {
1222  adjustment += ((vse->length - language_model_min_compound_length) *
1224  }
1225  }
1226  if (vse->associate_stats.shape_cost > 0) {
1227  adjustment += vse->associate_stats.shape_cost /
1228  static_cast<float>(vse->length);
1229  }
1231  ASSERT_HOST(vse->ngram_info != nullptr);
1232  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1233  } else {
1234  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1235  vse->consistency_info);
1236  return vse->ratings_sum * adjustment;
1237  }
1238  }
1239 }
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
double language_model_penalty_non_freq_dict_word
float ComputeCost(const float features[]) const
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
double language_model_penalty_non_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 125 of file language_model.h.

125  {
126  if (num_problems == 0) return 0.0f;
127  if (num_problems == 1) return penalty;
128  return (penalty + (language_model_penalty_increment *
129  static_cast<float>(num_problems-1)));
130  }

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 281 of file language_model.h.

285  {
287  col, row,
288  (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
289  (parent_vse != nullptr) ? parent_vse->length : 0,
290  fixed_pitch_, max_char_wh_ratio,
291  word_res, language_model_debug_level > 2, associate_stats);
292  }
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 136 of file language_model.h.

138  {
139  if (dawg_info != nullptr) {
140  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
142  (consistency_info.inconsistent_script ?
144  }
145  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
147  ComputeAdjustment(consistency_info.NumInconsistentCase(),
149  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
151  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
153  (consistency_info.inconsistent_script ?
155  (consistency_info.inconsistent_font ?
157  }
float ComputeAdjustment(int num_problems, float penalty)

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 997 of file language_model.cpp.

997  {
998  if (curr_list->empty()) return 1.0f;
999  float denom = 0.0f;
1000  int len = 0;
1001  BLOB_CHOICE_IT c_it(curr_list);
1002  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1003  ASSERT_HOST(c_it.data() != nullptr);
1004  ++len;
1005  denom += CertaintyScore(c_it.data()->certainty());
1006  }
1007  assert(len != 0);
1008  // The ideal situation would be to have the classifier scores for
1009  // classifying each position as each of the characters in the unicharset.
1010  // Since we can not do this because of speed, we add a very crude estimate
1011  // of what these scores for the "missing" classifications would sum up to.
1012  denom += (dict_->getUnicharset().size() - len) *
1014 
1015  return denom;
1016 }
float CertaintyScore(float cert)
int size() const
Definition: unicharset.h:336
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
double language_model_ngram_nonmatch_score
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 937 of file language_model.cpp.

943  {
944  const char *context_ptr = context;
945  char *modified_context = nullptr;
946  char *modified_context_end = nullptr;
947  const char *unichar_ptr = unichar;
948  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
949  float prob = 0.0f;
950  int step = 0;
951  while (unichar_ptr < unichar_end &&
952  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
953  if (language_model_debug_level > 1) {
954  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
955  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
956  }
957  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
958  ++(*unichar_step_len);
960  unichar_ptr += step;
961  // If there are multiple UTF8 characters present in unichar, context is
962  // updated to include the previously examined characters from str,
963  // unless use_only_first_uft8_step is true.
964  if (unichar_ptr < unichar_end) {
965  if (modified_context == nullptr) {
966  size_t context_len = strlen(context);
967  modified_context =
968  new char[context_len + strlen(unichar_ptr) + step + 1];
969  memcpy(modified_context, context, context_len);
970  modified_context_end = modified_context + context_len;
971  context_ptr = modified_context;
972  }
973  strncpy(modified_context_end, unichar_ptr - step, step);
974  modified_context_end += step;
975  *modified_context_end = '\0';
976  }
977  }
978  prob /= static_cast<float>(*unichar_step_len); // normalize
979  if (prob < language_model_ngram_small_prob) {
980  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
981  *found_small_prob = true;
983  }
984  *ngram_cost = -1.0*log2(prob);
985  float ngram_and_classifier_cost =
986  -1.0*log2(CertaintyScore(certainty)/denom) +
987  *ngram_cost * language_model_ngram_scale_factor;
988  if (language_model_debug_level > 1) {
989  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
990  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
991  ngram_and_classifier_cost);
992  }
993  delete[] modified_context;
994  return ngram_and_classifier_cost;
995 }
float CertaintyScore(float cert)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:375
bool language_model_ngram_use_only_first_uft8_step

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1390 of file language_model.cpp.

1395  {
1396  if (truth_path != nullptr) {
1397  *truth_path =
1398  (blamer_bundle != nullptr &&
1399  vse->length == blamer_bundle->correct_segmentation_length());
1400  }
1401  BLOB_CHOICE *curr_b = vse->curr_b;
1402  ViterbiStateEntry *curr_vse = vse;
1403 
1404  int i;
1405  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1406 
1407  // Re-compute the variance of the width-to-height ratios (since we now
1408  // can compute the mean over the whole word).
1409  float full_wh_ratio_mean = 0.0f;
1410  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1411  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1412  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1413  static_cast<float>(vse->length));
1414  vse->associate_stats.full_wh_ratio_var = 0.0f;
1415  }
1416 
1417  // Construct a WERD_CHOICE by tracing parent pointers.
1418  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1419  word->set_length(vse->length);
1420  int total_blobs = 0;
1421  for (i = (vse->length-1); i >= 0; --i) {
1422  if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
1423  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1424  *truth_path = false;
1425  }
1426  // The number of blobs used for this choice is row - col + 1.
1427  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1428  total_blobs += num_blobs;
1429  word->set_blob_choice(i, num_blobs, curr_b);
1430  // Update the width-to-height ratio variance. Useful non-space delimited
1431  // languages to ensure that the blobs are of uniform width.
1432  // Skip leading and trailing punctuation when computing the variance.
1433  if ((full_wh_ratio_mean != 0.0f &&
1434  ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
1435  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1436  vse->associate_stats.full_wh_ratio_var +=
1437  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1438  if (language_model_debug_level > 2) {
1439  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1440  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1441  }
1442  }
1443 
1444  // Mark the word as compound if compound permuter was set for any of
1445  // the unichars on the path (usually this will happen for unichars
1446  // that are compounding operators, like "-" and "/").
1447  if (!compound && curr_vse->dawg_info &&
1448  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1449 
1450  // Update curr_* pointers.
1451  curr_vse = curr_vse->parent_vse;
1452  if (curr_vse == nullptr) break;
1453  curr_b = curr_vse->curr_b;
1454  }
1455  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1456  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1457  // Re-adjust shape cost to include the updated width-to-height variance.
1458  if (full_wh_ratio_mean != 0.0f) {
1459  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1460  }
1461 
1462  word->set_rating(vse->ratings_sum);
1463  word->set_certainty(vse->min_certainty);
1464  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1465  vse->consistency_info.BodyMaxXHeight());
1466  if (vse->dawg_info != nullptr) {
1467  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1468  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1469  word->set_permuter(NGRAM_PERM);
1470  } else if (vse->top_choice_flags) {
1472  } else {
1473  word->set_permuter(NO_PERM);
1474  }
1475  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1476  word_res->ratings));
1477  return word;
1478 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:312
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void set_length(int len)
Definition: ratngs.h:381
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:115
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET * uch_set
Definition: pageres.h:206
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:342
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:366
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:143
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
int correct_segmentation_length() const
Definition: blamer.h:138
void set_certainty(float new_val)
Definition: ratngs.h:372
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:142
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_rating(float new_val)
Definition: ratngs.h:369
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry vse,
float  features[] 
)
static

Definition at line 1341 of file language_model.cpp.

1342  {
1343  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1344  // Record dictionary match info.
1345  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1346  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1347  if (vse.dawg_info != nullptr) {
1348  int permuter = vse.dawg_info->permuter;
1349  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1350  if (vse.consistency_info.num_digits == vse.length) {
1351  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1352  } else {
1353  features[PTRAIN_NUM_SHORT+len] = 1.0;
1354  }
1355  } else if (permuter == DOC_DAWG_PERM) {
1356  features[PTRAIN_DOC_SHORT+len] = 1.0;
1357  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1358  permuter == COMPOUND_PERM) {
1359  features[PTRAIN_DICT_SHORT+len] = 1.0;
1360  } else if (permuter == FREQ_DAWG_PERM) {
1361  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1362  }
1363  }
1364  // Record shape cost feature (normalized by path length).
1365  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1366  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1367  // Record ngram cost. (normalized by the path length).
1368  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1369  if (vse.ngram_info != nullptr) {
1370  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1371  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1372  }
1373  // Record consistency-related features.
1374  // Disabled this feature for now due to its poor performance.
1375  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1376  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1377  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1378  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
1379  vse.consistency_info.NumInconsistentChartype() : 0.0;
1380  features[PTRAIN_NUM_BAD_SPACING] =
1381  vse.consistency_info.NumInconsistentSpaces();
1382  // Disabled this feature for now due to its poor performance.
1383  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1384 
1385  // Classifier-related features.
1386  features[PTRAIN_RATING_PER_CHAR] =
1387  vse.ratings_sum / static_cast<float>(vse.outline_length);
1388 }

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
WERD_RES word_res,
LMConsistencyInfo consistency_info 
)
protected

Definition at line 1018 of file language_model.cpp.

1024  {
1025  const UNICHARSET &unicharset = dict_->getUnicharset();
1026  UNICHAR_ID unichar_id = b->unichar_id();
1027  BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
1028 
1029  // Check punctuation validity.
1030  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1031  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
1032  if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
1033  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1034  unicharset.get_isdigit(parent_b->unichar_id()))) {
1035  // reset punc_ref for compound words
1036  consistency_info->punc_ref = NO_EDGE;
1037  } else {
1038  bool is_apos = dict_->is_apostrophe(unichar_id);
1039  bool prev_is_numalpha = (parent_b != nullptr &&
1040  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1041  unicharset.get_isdigit(parent_b->unichar_id())));
1042  UNICHAR_ID pattern_unichar_id =
1043  (unicharset.get_isalpha(unichar_id) ||
1044  unicharset.get_isdigit(unichar_id) ||
1045  (is_apos && prev_is_numalpha)) ?
1046  Dawg::kPatternUnicharID : unichar_id;
1047  if (consistency_info->punc_ref == NO_EDGE ||
1048  pattern_unichar_id != Dawg::kPatternUnicharID ||
1049  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1052  consistency_info->punc_ref);
1053  consistency_info->punc_ref =
1054  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1055  node, pattern_unichar_id, word_end) : NO_EDGE;
1056  if (consistency_info->punc_ref == NO_EDGE) {
1057  consistency_info->invalid_punc = true;
1058  }
1059  }
1060  }
1061  }
1062 
1063  // Update case related counters.
1064  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
1065  // Reset counters if we are dealing with a compound word.
1066  consistency_info->num_lower = 0;
1067  consistency_info->num_non_first_upper = 0;
1068  }
1069  else if (unicharset.get_islower(unichar_id)) {
1070  consistency_info->num_lower++;
1071  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
1072  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1073  consistency_info->num_lower > 0 ||
1074  consistency_info->num_non_first_upper > 0) {
1075  consistency_info->num_non_first_upper++;
1076  }
1077  }
1078 
1079  // Initialize consistency_info->script_id (use script of unichar_id
1080  // if it is not Common, use script id recorded by the parent otherwise).
1081  // Set inconsistent_script to true if the script of the current unichar
1082  // is not consistent with that of the parent.
1083  consistency_info->script_id = unicharset.get_script(unichar_id);
1084  // Hiragana and Katakana can mix with Han.
1086  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1087  consistency_info->script_id == unicharset.hiragana_sid()) ||
1088  (unicharset.katakana_sid() != unicharset.null_sid() &&
1089  consistency_info->script_id == unicharset.katakana_sid())) {
1090  consistency_info->script_id = dict_->getUnicharset().han_sid();
1091  }
1092  }
1093 
1094  if (parent_vse != nullptr &&
1095  (parent_vse->consistency_info.script_id !=
1096  dict_->getUnicharset().common_sid())) {
1097  int parent_script_id = parent_vse->consistency_info.script_id;
1098  // If script_id is Common, use script id of the parent instead.
1099  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1100  consistency_info->script_id = parent_script_id;
1101  }
1102  if (consistency_info->script_id != parent_script_id) {
1103  consistency_info->inconsistent_script = true;
1104  }
1105  }
1106 
1107  // Update chartype related counters.
1108  if (unicharset.get_isalpha(unichar_id)) {
1109  consistency_info->num_alphas++;
1110  } else if (unicharset.get_isdigit(unichar_id)) {
1111  consistency_info->num_digits++;
1112  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1113  consistency_info->num_other++;
1114  }
1115 
1116  // Check font and spacing consistency.
1117  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
1118  int fontinfo_id = -1;
1119  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1120  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1121  fontinfo_id = b->fontinfo_id();
1122  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1123  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1124  fontinfo_id = b->fontinfo_id2();
1125  }
1126  if(language_model_debug_level > 1) {
1127  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1128  (parent_b->fontinfo_id() >= 0) ?
1129  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1130  (parent_b->fontinfo_id2() >= 0) ?
1131  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1132  (b->fontinfo_id() >= 0) ?
1133  fontinfo_table_->get(b->fontinfo_id()).name : "",
1134  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1135  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1136  fontinfo_id);
1137  }
1138  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1139  bool expected_gap_found = false;
1140  float expected_gap = 0.0f;
1141  int temp_gap;
1142  if (fontinfo_id >= 0) { // found a common font
1143  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1144  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1145  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1146  expected_gap = temp_gap;
1147  expected_gap_found = true;
1148  }
1149  } else {
1150  consistency_info->inconsistent_font = true;
1151  // Get an average of the expected gaps in each font
1152  int num_addends = 0;
1153  int temp_fid;
1154  for (int i = 0; i < 4; ++i) {
1155  if (i == 0) {
1156  temp_fid = parent_b->fontinfo_id();
1157  } else if (i == 1) {
1158  temp_fid = parent_b->fontinfo_id2();
1159  } else if (i == 2) {
1160  temp_fid = b->fontinfo_id();
1161  } else {
1162  temp_fid = b->fontinfo_id2();
1163  }
1164  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1165  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1166  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1167  expected_gap += temp_gap;
1168  num_addends++;
1169  }
1170  }
1171  if (num_addends > 0) {
1172  expected_gap /= static_cast<float>(num_addends);
1173  expected_gap_found = true;
1174  }
1175  }
1176  if (expected_gap_found) {
1177  float actual_gap =
1178  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1179  float gap_ratio = expected_gap / actual_gap;
1180  // TODO(rays) The gaps seem to be way off most of the time, saved by
1181  // the error here that the ratio was compared to 1/2, when it should
1182  // have been 0.5f. Find the source of the gaps discrepancy and put
1183  // the 0.5f here in place of 0.0f.
1184  // Test on 2476595.sj, pages 0 to 6. (In French.)
1185  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1186  consistency_info->num_inconsistent_spaces++;
1187  }
1188  if (language_model_debug_level > 1) {
1189  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1190  unicharset.id_to_unichar(parent_b->unichar_id()),
1191  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1192  unichar_id, curr_col, expected_gap, actual_gap);
1193  }
1194  }
1195  }
1196  }
1197 }
int common_sid() const
Definition: unicharset.h:879
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:419
GenericVector< int > blob_widths
Definition: pageres.h:219
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:746
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
int hiragana_sid() const
Definition: unicharset.h:884
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
int16_t fontinfo_id() const
Definition: ratngs.h:86
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
int64_t NODE_REF
Definition: dawg.h:56
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
int16_t fontinfo_id2() const
Definition: ratngs.h:89
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int null_sid() const
Definition: unicharset.h:878
int han_sid() const
Definition: unicharset.h:883
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
const UnicityTable< FontInfo > * fontinfo_table_
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:118
int katakana_sid() const
Definition: unicharset.h:885
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 789 of file language_model.cpp.

793  {
794  // Initialize active_dawgs from parent_vse if it is not nullptr.
795  // Otherwise use very_beginning_active_dawgs_.
796  if (parent_vse == nullptr) {
799  } else {
800  if (parent_vse->dawg_info == nullptr) return nullptr; // not a dict word path
801  dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
802  dawg_args_.permuter = parent_vse->dawg_info->permuter;
803  }
804 
805  // Deal with hyphenated words.
806  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
807  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
808  return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
809  }
810 
811  // Deal with compound words.
812  if (dict_->compound_marker(b.unichar_id()) &&
813  (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
814  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
815  // Do not allow compound operators at the beginning and end of the word.
816  // Do not allow more than one compound operator per word.
817  // Do not allow compounding of words with lengths shorter than
818  // language_model_min_compound_length
819  if (parent_vse == nullptr || word_end ||
821  parent_vse->length < language_model_min_compound_length)
822  return nullptr;
823 
824  int i;
825  // Check that the path terminated before the current character is a word.
826  bool has_word_ending = false;
827  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
828  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
829  const Dawg *pdawg = pos.dawg_index < 0
830  ? nullptr : dict_->GetDawg(pos.dawg_index);
831  if (pdawg == nullptr || pos.back_to_punc) continue;;
832  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
833  pdawg->end_of_word(pos.dawg_ref)) {
834  has_word_ending = true;
835  break;
836  }
837  }
838  if (!has_word_ending) return nullptr;
839 
840  if (language_model_debug_level > 0) tprintf("Compound word found\n");
841  return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
842  } // done dealing with compound words
843 
844  LanguageModelDawgInfo *dawg_info = nullptr;
845 
846  // Call LetterIsOkay().
847  // Use the normalized IDs so that all shapes of ' can be allowed in words
848  // like don't.
849  const GenericVector<UNICHAR_ID>& normed_ids =
851  DawgPositionVector tmp_active_dawgs;
852  for (int i = 0; i < normed_ids.size(); ++i) {
854  tprintf("Test Letter OK for unichar %d, normed %d\n",
855  b.unichar_id(), normed_ids[i]);
856  dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
857  word_end && i == normed_ids.size() - 1);
858  if (dawg_args_.permuter == NO_PERM) {
859  break;
860  } else if (i < normed_ids.size() - 1) {
861  tmp_active_dawgs = *dawg_args_.updated_dawgs;
862  dawg_args_.active_dawgs = &tmp_active_dawgs;
863  }
865  tprintf("Letter was OK for unichar %d, normed %d\n",
866  b.unichar_id(), normed_ids[i]);
867  }
868  dawg_args_.active_dawgs = nullptr;
869  if (dawg_args_.permuter != NO_PERM) {
870  dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
872  } else if (language_model_debug_level > 3) {
873  tprintf("Letter %s not OK!\n",
875  }
876 
877  return dawg_info;
878 }
PermuterType permuter
Definition: dict.h:83
int size() const
Definition: genericvector.h:71
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:417
DawgPositionVector beginning_active_dawgs_
DawgPositionVector * updated_dawgs
Definition: dict.h:82
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
DawgPositionVector * active_dawgs
Definition: dict.h:81
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
DawgPositionVector very_beginning_active_dawgs_
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:361

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 880 of file language_model.cpp.

883  {
884  // Initialize parent context.
885  const char *pcontext_ptr = "";
886  int pcontext_unichar_step_len = 0;
887  if (parent_vse == nullptr) {
888  pcontext_ptr = prev_word_str_.string();
889  pcontext_unichar_step_len = prev_word_unichar_step_len_;
890  } else {
891  pcontext_ptr = parent_vse->ngram_info->context.string();
892  pcontext_unichar_step_len =
893  parent_vse->ngram_info->context_unichar_step_len;
894  }
895  // Compute p(unichar | parent context).
896  int unichar_step_len = 0;
897  bool pruned = false;
898  float ngram_cost;
899  float ngram_and_classifier_cost =
900  ComputeNgramCost(unichar, certainty, denom,
901  pcontext_ptr, &unichar_step_len,
902  &pruned, &ngram_cost);
903  // Normalize just the ngram_and_classifier_cost by outline_length.
904  // The ngram_cost is used by the params_model, so it needs to be left as-is,
905  // and the params model cost will be normalized by outline_length.
906  ngram_and_classifier_cost *=
907  outline_length / language_model_ngram_rating_factor;
908  // Add the ngram_cost of the parent.
909  if (parent_vse != nullptr) {
910  ngram_and_classifier_cost +=
911  parent_vse->ngram_info->ngram_and_classifier_cost;
912  ngram_cost += parent_vse->ngram_info->ngram_cost;
913  }
914 
915  // Shorten parent context string by unichar_step_len unichars.
916  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
918  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
919  while (num_remove > 0 && *pcontext_ptr != '\0') {
920  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
921  --num_remove;
922  }
923 
924  // Decide whether to prune this ngram path and update changed accordingly.
925  if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;
926 
927  // Construct and return the new LanguageModelNgramInfo.
928  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
929  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
930  ngram_and_classifier_cost);
931  ngram_info->context += unichar;
932  ngram_info->context_unichar_step_len += unichar_step_len;
933  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
934  return ngram_info;
935 }
const char * string() const
Definition: strngs.cpp:196
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry new_vse,
const ViterbiStateEntry parent_vse,
LanguageModelState lms 
)
protected

Definition at line 773 of file language_model.cpp.

775  {
776  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
777  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
778  new_vse->cost >= vit.data()->cost; vit.forward()) {
779  // Clear the appropriate flags if the list already contains
780  // a top choice entry with a lower cost.
781  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
782  }
783  if (language_model_debug_level > 2) {
784  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
785  new_vse->top_choice_flags);
786  }
787 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET unicharset,
WERD_RES word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 504 of file language_model.cpp.

508  {
509  for (; !vse_it->cycled_list(); vse_it->forward()) {
510  ViterbiStateEntry* parent_vse = vse_it->data();
511  // Only consider the parent if it has been updated or
512  // if the current ratings cell has just been classified.
513  if (!just_classified && !parent_vse->updated) continue;
515  parent_vse->Print("Considering");
516  // If the parent is non-alnum, then upper counts as lower.
517  *top_choice_flags = blob_choice_flags;
518  if ((blob_choice_flags & kUpperCaseFlag) &&
519  !parent_vse->HasAlnumChoice(unicharset)) {
520  *top_choice_flags |= kLowerCaseFlag;
521  }
522  *top_choice_flags &= parent_vse->top_choice_flags;
523  UNICHAR_ID unichar_id = bc->unichar_id();
524  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
525  UNICHAR_ID parent_id = parent_b->unichar_id();
526  // Digits do not bind to alphas if there is a mix in both parent and current
527  // or if the alpha is not the top choice.
528  if (unicharset.get_isdigit(unichar_id) &&
529  unicharset.get_isalpha(parent_id) &&
530  (mixed_alnum || *top_choice_flags == 0))
531  continue; // Digits don't bind to alphas.
532  // Likewise alphas do not bind to digits if there is a mix in both or if
533  // the digit is not the top choice.
534  if (unicharset.get_isalpha(unichar_id) &&
535  unicharset.get_isdigit(parent_id) &&
536  (mixed_alnum || *top_choice_flags == 0))
537  continue; // Alphas don't bind to digits.
538  // If there is a case mix of the same alpha in the parent list, then
539  // competing_vse is non-null and will be used to determine whether
540  // or not to bind the current blob choice.
541  if (parent_vse->competing_vse != nullptr) {
542  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
543  UNICHAR_ID other_id = competing_b->unichar_id();
544  if (language_model_debug_level >= 5) {
545  tprintf("Parent %s has competition %s\n",
546  unicharset.id_to_unichar(parent_id),
547  unicharset.id_to_unichar(other_id));
548  }
549  if (unicharset.SizesDistinct(parent_id, other_id)) {
550  // If other_id matches bc wrt position and size, and parent_id, doesn't,
551  // don't bind to the current parent.
552  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
554  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
556  continue; // Competing blobchoice has a better vertical match.
557  }
558  }
559  vse_it->forward();
560  return parent_vse; // This one is good!
561  }
562  return nullptr; // Ran out of possibilities.
563 }
int UNICHAR_ID
Definition: unichar.h:35
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
static const LanguageModelFlagsType kUpperCaseFlag
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:152
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:485
float x_height
Definition: pageres.h:311
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
static const LanguageModelFlagsType kLowerCaseFlag
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 109 of file language_model.h.

109 { return params_model_; }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 387 of file language_model.cpp.

390  {
391  BLOB_CHOICE_IT c_it(curr_list);
392  const UNICHARSET &unicharset = dict_->getUnicharset();
393  BLOB_CHOICE *first_unichar = nullptr;
394  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
395  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
396  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
397  if (first_unichar == nullptr) first_unichar = c_it.data();
398  if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
399  *first_lower = c_it.data();
400  }
401  if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
402  !unicharset.get_islower(unichar_id)) {
403  *first_upper = c_it.data();
404  }
405  if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
406  *first_digit = c_it.data();
407  }
408  }
409  ASSERT_HOST(first_unichar != nullptr);
410  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
411  *first_digit != nullptr;
412  if (*first_lower == nullptr) *first_lower = first_unichar;
413  if (*first_upper == nullptr) *first_upper = first_unichar;
414  if (*first_digit == nullptr) *first_digit = first_unichar;
415  return mixed;
416 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:45

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 140 of file language_model.cpp.

142  {
143  fixed_pitch_ = fixed_pitch;
144  max_char_wh_ratio_ = max_char_wh_ratio;
145  rating_cert_scale_ = rating_cert_scale;
146  acceptable_choice_found_ = false;
148 
149  // Initialize vectors with beginning DawgInfos.
154 
155  // Fill prev_word_str_ with the last language_model_ngram_order
156  // unichars from prev_word.
158  if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
159  prev_word_str_ = prev_word->unichar_string();
161  } else {
162  prev_word_str_ = " ";
163  }
164  const char *str_ptr = prev_word_str_.string();
165  const char *str_end = str_ptr + prev_word_str_.length();
166  int step;
168  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
169  str_ptr += step;
171  }
172  ASSERT_HOST(str_ptr == str_end);
173  }
174 }
bool language_model_ngram_space_delimited_language
const char * string() const
Definition: strngs.cpp:196
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
DawgPositionVector beginning_active_dawgs_
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
const STRING & unichar_string() const
Definition: ratngs.h:541
DawgPositionVector very_beginning_active_dawgs_
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:586
int32_t length() const
Definition: strngs.cpp:191
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry vse)
inlineprotected

Definition at line 300 of file language_model.h.

300  {
301  if (vse.top_choice_flags) return false;
302  if (vse.dawg_info != nullptr &&
303  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
304  vse.dawg_info->permuter == USER_DAWG_PERM ||
305  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
306  return true;
307  }

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 105 of file language_model.h.

105  {
107  }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 427 of file language_model.cpp.

428  {
429  if (parent_node == nullptr) return -1;
430  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
431  ViterbiStateEntry* top_lower = nullptr;
432  ViterbiStateEntry* top_upper = nullptr;
433  ViterbiStateEntry* top_digit = nullptr;
434  ViterbiStateEntry* top_choice = nullptr;
435  float lower_rating = 0.0f;
436  float upper_rating = 0.0f;
437  float digit_rating = 0.0f;
438  float top_rating = 0.0f;
439  const UNICHARSET &unicharset = dict_->getUnicharset();
440  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
441  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
442  ViterbiStateEntry* vse = vit.data();
443  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
444  // back to the real character if needed.
445  ViterbiStateEntry* unichar_vse = vse;
446  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
447  float rating = unichar_vse->curr_b->rating();
448  while (unichar_id == INVALID_UNICHAR_ID &&
449  unichar_vse->parent_vse != nullptr) {
450  unichar_vse = unichar_vse->parent_vse;
451  unichar_id = unichar_vse->curr_b->unichar_id();
452  rating = unichar_vse->curr_b->rating();
453  }
454  if (unichar_id != INVALID_UNICHAR_ID) {
455  if (unicharset.get_islower(unichar_id)) {
456  if (top_lower == nullptr || lower_rating > rating) {
457  top_lower = vse;
458  lower_rating = rating;
459  }
460  } else if (unicharset.get_isalpha(unichar_id)) {
461  if (top_upper == nullptr || upper_rating > rating) {
462  top_upper = vse;
463  upper_rating = rating;
464  }
465  } else if (unicharset.get_isdigit(unichar_id)) {
466  if (top_digit == nullptr || digit_rating > rating) {
467  top_digit = vse;
468  digit_rating = rating;
469  }
470  }
471  }
472  if (top_choice == nullptr || top_rating > rating) {
473  top_choice = vse;
474  top_rating = rating;
475  top_id = unichar_id;
476  }
477  }
478  if (top_choice == nullptr) return -1;
479  bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
480  top_digit != nullptr;
481  if (top_lower == nullptr) top_lower = top_choice;
482  top_lower->top_choice_flags |= kLowerCaseFlag;
483  if (top_upper == nullptr) top_upper = top_choice;
484  top_upper->top_choice_flags |= kUpperCaseFlag;
485  if (top_digit == nullptr) top_digit = top_choice;
486  top_digit->top_choice_flags |= kDigitFlag;
487  top_choice->top_choice_flags |= kSmallestRatingFlag;
488  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
489  (top_choice->top_choice_flags &
491  // If the compound marker top choice carries any of the top alnum flags,
492  // then give it all of them, allowing words like I-295 to be chosen.
493  top_choice->top_choice_flags |=
495  }
496  return mixed ? 1 : 0;
497 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kDigitFlag
static const LanguageModelFlagsType kUpperCaseFlag
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
static const LanguageModelFlagsType kLowerCaseFlag
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109
Definition: cluster.h:45

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry vse,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1241 of file language_model.cpp.

1246  {
1247  bool truth_path;
1248  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1249  blamer_bundle, &truth_path);
1250  ASSERT_HOST(word != nullptr);
1251  if (dict_->stopper_debug_level >= 1) {
1252  STRING word_str;
1253  word->string_and_lengths(&word_str, nullptr);
1254  vse->Print(word_str.string());
1255  }
1256  if (language_model_debug_level > 0) {
1257  word->print("UpdateBestChoice() constructed word");
1258  }
1259  // Record features from the current path if necessary.
1260  ParamsTrainingHypothesis curr_hyp;
1261  if (blamer_bundle != nullptr) {
1262  if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
1263  static_cast<PermuterType>(word->permuter());
1264  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1265  word->string_and_lengths(&(curr_hyp.str), nullptr);
1266  curr_hyp.cost = vse->cost; // record cost for error rate computations
1267  if (language_model_debug_level > 0) {
1268  tprintf("Raw features extracted from %s (cost=%g) [ ",
1269  curr_hyp.str.string(), curr_hyp.cost);
1270  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1271  tprintf("%g ", curr_hyp.features[deb_i]);
1272  }
1273  tprintf("]\n");
1274  }
1275  // Record the current hypothesis in params_training_bundle.
1276  blamer_bundle->AddHypothesis(curr_hyp);
1277  if (truth_path)
1278  blamer_bundle->UpdateBestRating(word->rating());
1279  }
1280  if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1281  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1282  // we no longer need it.
1283  delete word;
1284  return;
1285  }
1286  if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty())
1288  // Update and log new raw_choice if needed.
1289  if (word_res->raw_choice == nullptr ||
1290  word->rating() < word_res->raw_choice->rating()) {
1291  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1292  tprintf("Updated raw choice\n");
1293  }
1294  // Set the modified rating for best choice to vse->cost and log best choice.
1295  word->set_rating(vse->cost);
1296  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1297  // computes adjust_factor that is used by the adaption code (e.g. by
1298  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1299  // Note: the rating of the word is not adjusted.
1300  dict_->adjust_word(word, vse->dawg_info == nullptr,
1301  vse->consistency_info.xht_decision, 0.0,
1302  false, language_model_debug_level > 0);
1303  // Hand ownership of the word over to the word_res.
1305  dict_->stopper_debug_level >= 1, word)) {
1306  // The word was so bad that it was deleted.
1307  return;
1308  }
1309  if (word_res->best_choice == word) {
1310  // Word was the new best.
1311  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1312  AcceptablePath(*vse)) {
1313  acceptable_choice_found_ = true;
1314  }
1315  // Update best_choice_bundle.
1316  best_choice_bundle->updated = true;
1317  best_choice_bundle->best_vse = vse;
1318  if (language_model_debug_level > 0) {
1319  tprintf("Updated best choice\n");
1320  word->print_state("New state ");
1321  }
1322  // Update hyphen state if we are dealing with a dictionary word.
1323  if (vse->dawg_info != nullptr) {
1324  if (dict_->has_hyphen_end(*word)) {
1326  } else {
1327  dict_->reset_hyphen_vars(true);
1328  }
1329  }
1330 
1331  if (blamer_bundle != nullptr) {
1333  vse->dawg_info != nullptr && vse->top_choice_flags);
1334  }
1335  }
1336  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1337  word->DisplaySegmentation(word_res->chopped_word);
1338  }
1339 }
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:506
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:147
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:673
const char * string() const
Definition: strngs.cpp:196
int tessedit_truncate_wordchoice_log
Definition: dict.h:626
void print() const
Definition: ratngs.h:580
uint8_t permuter() const
Definition: ratngs.h:346
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:41
float rating() const
Definition: ratngs.h:327
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
int stopper_debug_level
Definition: dict.h:622
void print_state(const char *msg) const
Definition: ratngs.cpp:755
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:45
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:764
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:610
void UpdateBestRating(float rating)
Definition: blamer.h:134
bool AcceptablePath(const ViterbiStateEntry &vse)
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
Definition: strngs.h:45
DawgPositionVector * active_dawgs
Definition: dict.h:81
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:550
WERD_CHOICE * raw_choice
Definition: pageres.h:240
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
WERD_CHOICE * best_choice
Definition: pageres.h:235
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:626
PermuterType
Definition: ratngs.h:242
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_rating(float new_val)
Definition: ratngs.h:369
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:166

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState parent_node,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 257 of file language_model.cpp.

265  {
266  if (language_model_debug_level > 0) {
267  tprintf("\nUpdateState: col=%d row=%d %s",
268  curr_col, curr_row, just_classified ? "just_classified" : "");
270  tprintf("(parent=%p)\n", parent_node);
271  else
272  tprintf("\n");
273  }
274  // Initialize helper variables.
275  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
276  bool new_changed = false;
277  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
278  const UNICHARSET& unicharset = dict_->getUnicharset();
279  BLOB_CHOICE *first_lower = nullptr;
280  BLOB_CHOICE *first_upper = nullptr;
281  BLOB_CHOICE *first_digit = nullptr;
282  bool has_alnum_mix = false;
283  if (parent_node != nullptr) {
284  int result = SetTopParentLowerUpperDigit(parent_node);
285  if (result < 0) {
287  tprintf("No parents found to process\n");
288  return false;
289  }
290  if (result > 0)
291  has_alnum_mix = true;
292  }
293  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
294  &first_digit))
295  has_alnum_mix = false;;
296  ScanParentsForCaseMix(unicharset, parent_node);
297  if (language_model_debug_level > 3 && parent_node != nullptr) {
298  parent_node->Print("Parent viterbi list");
299  }
300  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
301 
302  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
303  ViterbiStateEntry_IT vit;
304  BLOB_CHOICE_IT c_it(curr_list);
305  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
306  BLOB_CHOICE* choice = c_it.data();
307  // TODO(antonova): make sure commenting this out if ok for ngram
308  // model scoring (I think this was introduced to fix ngram model quirks).
309  // Skip nullptr unichars unless it is the only choice.
310  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
311  UNICHAR_ID unichar_id = choice->unichar_id();
312  if (unicharset.get_fragment(unichar_id)) {
313  continue; // Skip fragments.
314  }
315  // Set top choice flags.
316  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
317  if (c_it.at_first() || !new_changed)
318  blob_choice_flags |= kSmallestRatingFlag;
319  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
320  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
321  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
322 
323  if (parent_node == nullptr) {
324  // Process the beginning of a word.
325  // If there is a better case variant that is not distinguished by size,
326  // skip this blob choice, as we have no choice but to accept the result
327  // of the character classifier to distinguish between them, even if
328  // followed by an upper case.
329  // With words like iPoc, and other CamelBackWords, the lower-upper
330  // transition can only be achieved if the classifier has the correct case
331  // as the top choice, and leaving an initial I lower down the list
332  // increases the chances of choosing IPoc simply because it doesn't
333  // include such a transition. iPoc will beat iPOC and ipoc because
334  // the other words are baseline/x-height inconsistent.
335  if (HasBetterCaseVariant(unicharset, choice, curr_list))
336  continue;
337  // Upper counts as lower at the beginning of a word.
338  if (blob_choice_flags & kUpperCaseFlag)
339  blob_choice_flags |= kLowerCaseFlag;
340  new_changed |= AddViterbiStateEntry(
341  blob_choice_flags, denom, word_end, curr_col, curr_row,
342  choice, curr_state, nullptr, pain_points,
343  word_res, best_choice_bundle, blamer_bundle);
344  } else {
345  // Get viterbi entries from each parent ViterbiStateEntry.
346  vit.set_to_list(&parent_node->viterbi_state_entries);
347  int vit_counter = 0;
348  vit.mark_cycle_pt();
349  ViterbiStateEntry* parent_vse = nullptr;
350  LanguageModelFlagsType top_choice_flags;
351  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
352  c_it.data(), blob_choice_flags,
353  unicharset, word_res, &vit,
354  &top_choice_flags)) != nullptr) {
355  // Skip pruned entries and do not look at prunable entries if already
356  // examined language_model_viterbi_list_max_num_prunable of those.
357  if (PrunablePath(*parent_vse) &&
359  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
360  continue;
361  }
362  // If the parent has no alnum choice, (ie choice is the first in a
363  // string of alnum), and there is a better case variant that is not
364  // distinguished by size, skip this blob choice/parent, as with the
365  // initial blob treatment above.
366  if (!parent_vse->HasAlnumChoice(unicharset) &&
367  HasBetterCaseVariant(unicharset, choice, curr_list))
368  continue;
369  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
370  // looks good according to the Dawgs or character ngram model.
371  new_changed |= AddViterbiStateEntry(
372  top_choice_flags, denom, word_end, curr_col, curr_row,
373  c_it.data(), curr_state, parent_vse, pain_points,
374  word_res, best_choice_bundle, blamer_bundle);
375  }
376  }
377  }
378  return new_changed;
379 }
int UNICHAR_ID
Definition: unichar.h:35
int language_model_viterbi_list_max_num_prunable
static const LanguageModelFlagsType kXhtConsistentFlag
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kDigitFlag
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:39
static const LanguageModelFlagsType kUpperCaseFlag
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool PrunablePath(const ViterbiStateEntry &vse)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
static const LanguageModelFlagsType kLowerCaseFlag
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 417 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 405 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 419 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 365 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 384 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 391 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 380 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 57 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 55 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 62 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 54 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 56 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 58 of file language_model.h.

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 317 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 344 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 331 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 319 of file language_model.h.

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 321 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings when multiplied by outline length"

Definition at line 340 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the character classifier"

Definition at line 337 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"

Definition at line 329 of file language_model.h.

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 342 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string when computing log probabilities"

Definition at line 334 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 353 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 357 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 359 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 362 of file language_model.h.

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 349 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 347 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 351 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 355 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 361 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 365 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 324 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 326 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 394 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 422 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 401 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 402 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 375 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 404 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 363 of file language_model.h.


The documentation for this class was generated from the following files: