tesseract
4.0.0-1-g2a2b
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
bool | AcceptableChoiceFound () |
void | SetAcceptableChoiceFound (bool val) |
ParamsModel & | getParamsModel () |
Static Public Member Functions | |
static void | ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[]) |
Static Public Attributes | |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kDigitFlag = 0x8 |
static const LanguageModelFlagsType | kXhtConsistentFlag = 0x10 |
static const float | kMaxAvgNgramCost = 25.0f |
Protected Member Functions | |
float | CertaintyScore (float cert) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
bool | PrunablePath (const ViterbiStateEntry &vse) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs | dawg_args_ |
float | rating_cert_scale_ |
const UnicityTable< FontInfo > * | fontinfo_table_ |
Dict * | dict_ |
bool | fixed_pitch_ |
float | max_char_wh_ratio_ |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ |
DawgPositionVector | very_beginning_active_dawgs_ |
DawgPositionVector | beginning_active_dawgs_ |
bool | acceptable_choice_found_ |
bool | correct_segmentation_explored_ |
ParamsModel | params_model_ |
Definition at line 51 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 54 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 138 of file language_model.cpp.
|
inline |
Definition at line 104 of file language_model.h.
|
inline protected |
Definition at line 310 of file language_model.h.
|
protected |
Definition at line 565 of file language_model.cpp.
|
inline protected |
Definition at line 113 of file language_model.h.
|
protected |
Definition at line 1199 of file language_model.cpp.
|
inline protected |
Definition at line 125 of file language_model.h.
|
inline protected |
Definition at line 281 of file language_model.h.
|
inline protected |
Definition at line 136 of file language_model.h.
|
protected |
Definition at line 997 of file language_model.cpp.
|
protected |
Definition at line 937 of file language_model.cpp.
|
protected |
Definition at line 1390 of file language_model.cpp.
|
static |
Definition at line 1341 of file language_model.cpp.
|
protected |
Definition at line 1018 of file language_model.cpp.
|
protected |
Definition at line 789 of file language_model.cpp.
|
protected |
Definition at line 880 of file language_model.cpp.
|
protected |
Definition at line 773 of file language_model.cpp.
|
protected |
Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.
Definition at line 504 of file language_model.cpp.
|
inline |
Definition at line 109 of file language_model.h.
|
protected |
Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.
Definition at line 387 of file language_model.cpp.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale | ||
) |
Definition at line 140 of file language_model.cpp.
|
inline protected |
Definition at line 300 of file language_model.h.
|
inline |
Definition at line 105 of file language_model.h.
|
protected |
Ensures that, across the viterbi_state_entries of every element of parent_node, at least one entry has the top_choice_flag set for each of lower, upper and digit, using the same rules as GetTopLowerUpperDigit. The flag is set on the first suitable candidate found, whether or not it is already set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
Definition at line 427 of file language_model.cpp.
|
protected |
Definition at line 1241 of file language_model.cpp.
bool tesseract::LanguageModel::UpdateState | ( | bool | just_classified, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
LanguageModelState * | parent_node, | ||
LMPainPoints * | pain_points, | ||
WERD_RES * | word_res, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_node with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.
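This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:
- paths that are liked by the language model: either a DAWG or the n-gram model, where active.
- paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.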
GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
Of course, it isn't quite that simple: Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.
Definition at line 257 of file language_model.cpp.
|
protected |
Definition at line 417 of file language_model.h.
|
protected |
Definition at line 405 of file language_model.h.
|
protected |
Definition at line 419 of file language_model.h.
|
protected |
Definition at line 365 of file language_model.h.
|
protected |
Definition at line 384 of file language_model.h.
|
protected |
Definition at line 391 of file language_model.h.
|
protected |
Definition at line 380 of file language_model.h.
|
static |
Definition at line 57 of file language_model.h.
|
static |
Definition at line 55 of file language_model.h.
|
static |
Definition at line 62 of file language_model.h.
|
static |
Definition at line 54 of file language_model.h.
|
static |
Definition at line 56 of file language_model.h.
|
static |
Definition at line 58 of file language_model.h.
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
Definition at line 317 of file language_model.h.
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
Definition at line 344 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 331 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 319 of file language_model.h.
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
Definition at line 321 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0 |
"Factor to bring log-probs into the same range as ratings when multiplied by outline length"
Definition at line 340 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"
Definition at line 337 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"
Definition at line 329 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
"Words are delimited by space"
Definition at line 342 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
Definition at line 334 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
Definition at line 353 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"
Definition at line 357 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
Definition at line 359 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
Definition at line 362 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
Definition at line 349 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 347 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
Definition at line 351 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"
Definition at line 355 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
Definition at line 361 of file language_model.h.
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
"Use sigmoidal score for certainty"
Definition at line 365 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 324 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 326 of file language_model.h.
|
protected |
Definition at line 394 of file language_model.h.
|
protected |
Definition at line 422 of file language_model.h.
|
protected |
Definition at line 401 of file language_model.h.
|
protected |
Definition at line 402 of file language_model.h.
|
protected |
Definition at line 375 of file language_model.h.
|
protected |
Definition at line 404 of file language_model.h.
int tesseract::LanguageModel::wordrec_display_segmentations = 0 |
"Display Segmentations"
Definition at line 363 of file language_model.h.