#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
bool | AcceptableChoiceFound () |
void | SetAcceptableChoiceFound (bool val) |
ParamsModel & | getParamsModel () |
Static Public Member Functions | |
static void | ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[]) |
Static Public Attributes | |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kDigitFlag = 0x8 |
static const LanguageModelFlagsType | kXhtConsistentFlag = 0x10 |
static const float | kMaxAvgNgramCost = 25.0f |
Protected Member Functions | |
float | CertaintyScore (float cert) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
bool | PrunablePath (const ViterbiStateEntry &vse) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs * | dawg_args_ |
float | rating_cert_scale_ |
const UnicityTable< FontInfo > * | fontinfo_table_ |
Dict * | dict_ |
bool | fixed_pitch_ |
float | max_char_wh_ratio_ |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ |
DawgPositionVector * | very_beginning_active_dawgs_ |
DawgPositionVector * | beginning_active_dawgs_ |
bool | acceptable_choice_found_ |
bool | correct_segmentation_explored_ |
ParamsModel | params_model_ |
Definition at line 42 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 45 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 131 of file language_model.cpp.
|
inline |
Definition at line 95 of file language_model.h.
|
inlineprotected |
Definition at line 301 of file language_model.h.
|
protected |
Definition at line 563 of file language_model.cpp.
|
inlineprotected |
Definition at line 104 of file language_model.h.
|
protected |
Definition at line 1198 of file language_model.cpp.
|
inlineprotected |
Definition at line 116 of file language_model.h.
|
inlineprotected |
Definition at line 272 of file language_model.h.
|
inlineprotected |
Definition at line 127 of file language_model.h.
|
protected |
Definition at line 995 of file language_model.cpp.
|
protected |
Definition at line 935 of file language_model.cpp.
|
protected |
Definition at line 1389 of file language_model.cpp.
|
static |
Definition at line 1340 of file language_model.cpp.
|
protected |
Definition at line 1016 of file language_model.cpp.
|
protected |
Definition at line 787 of file language_model.cpp.
|
protected |
Definition at line 878 of file language_model.cpp.
|
protected |
Definition at line 771 of file language_model.cpp.
|
protected |
Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.
Definition at line 502 of file language_model.cpp.
|
inline |
Definition at line 100 of file language_model.h.
|
protected |
Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.
Definition at line 385 of file language_model.cpp.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale | ||
) |
Definition at line 138 of file language_model.cpp.
|
inlineprotected |
Definition at line 291 of file language_model.h.
|
inline |
Definition at line 96 of file language_model.h.
|
protected |
Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
Definition at line 425 of file language_model.cpp.
|
protected |
Definition at line 1240 of file language_model.cpp.
bool tesseract::LanguageModel::UpdateState | ( | bool | just_classified, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
LanguageModelState * | parent_node, | ||
LMPainPoints * | pain_points, | ||
WERD_RES * | word_res, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_node with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.
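This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:
- paths that are liked by the language model: either a DAWG or the n-gram model, where active.
- paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.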
GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
Of course it isn't quite that simple, as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.
Definition at line 255 of file language_model.cpp.
|
protected |
Definition at line 408 of file language_model.h.
|
protected |
Definition at line 396 of file language_model.h.
|
protected |
Definition at line 410 of file language_model.h.
|
protected |
Definition at line 356 of file language_model.h.
|
protected |
Definition at line 375 of file language_model.h.
|
protected |
Definition at line 382 of file language_model.h.
|
protected |
Definition at line 371 of file language_model.h.
|
static |
Definition at line 48 of file language_model.h.
|
static |
Definition at line 46 of file language_model.h.
|
static |
Definition at line 53 of file language_model.h.
|
static |
Definition at line 45 of file language_model.h.
|
static |
Definition at line 47 of file language_model.h.
|
static |
Definition at line 49 of file language_model.h.
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
Definition at line 308 of file language_model.h.
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
Definition at line 335 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 322 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 310 of file language_model.h.
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
Definition at line 312 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0 |
"Factor to bring log-probs into the same range as ratings when multiplied by outline length"
Definition at line 331 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"
Definition at line 328 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"
Definition at line 320 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
"Words are delimited by space"
Definition at line 333 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
Definition at line 325 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
Definition at line 344 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"
Definition at line 348 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
Definition at line 350 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
Definition at line 353 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
Definition at line 340 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 338 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
Definition at line 342 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"
Definition at line 346 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
Definition at line 352 of file language_model.h.
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
"Use sigmoidal score for certainty"
Definition at line 356 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 315 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 317 of file language_model.h.
|
protected |
Definition at line 385 of file language_model.h.
|
protected |
Definition at line 413 of file language_model.h.
|
protected |
Definition at line 392 of file language_model.h.
|
protected |
Definition at line 393 of file language_model.h.
|
protected |
Definition at line 366 of file language_model.h.
|
protected |
Definition at line 395 of file language_model.h.
int tesseract::LanguageModel::wordrec_display_segmentations = 0 |
"Display Segmentations"
Definition at line 354 of file language_model.h.