tesseract
5.0.0-alpha-619-ge9db
|
#include <language_model.h>
|
| LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) |
|
| ~LanguageModel () |
|
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
|
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
bool | AcceptableChoiceFound () |
|
void | SetAcceptableChoiceFound (bool val) |
|
ParamsModel & | getParamsModel () |
|
|
float | CertaintyScore (float cert) |
|
float | ComputeAdjustment (int num_problems, float penalty) |
|
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
|
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
|
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
|
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
|
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
|
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
|
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
|
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
|
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
|
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
|
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
|
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
|
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
|
bool | PrunablePath (const ViterbiStateEntry &vse) |
|
bool | AcceptablePath (const ViterbiStateEntry &vse) |
|
Definition at line 50 of file language_model.h.
◆ LanguageModel()
Definition at line 53 of file language_model.cpp.
56 dict->getCCUtil()->params()),
58 "Turn on/off the use of character ngram model",
59 dict->getCCUtil()->params()),
61 "Maximum order of the character ngram model",
62 dict->getCCUtil()->params()),
64 "Maximum number of prunable (those for which"
65 " PrunablePath() is true) entries in each viterbi list"
66 " recorded in BLOB_CHOICEs",
67 dict->getCCUtil()->params()),
69 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
70 dict->getCCUtil()->params()),
72 "To avoid overly small denominators use this as the "
73 "floor of the probability returned by the ngram model.",
74 dict->getCCUtil()->params()),
76 "Average classifier score of a non-matching unichar.",
77 dict->getCCUtil()->params()),
79 "Use only the first UTF8 step of the given string"
80 " when computing log probabilities.",
81 dict->getCCUtil()->params()),
83 "Strength of the character ngram model relative to the"
84 " character classifier ",
85 dict->getCCUtil()->params()),
87 "Factor to bring log-probs into the same range as ratings"
88 " when multiplied by outline length ",
89 dict->getCCUtil()->params()),
91 "Words are delimited by space", dict->getCCUtil()->params()),
93 "Minimum length of compound words",
94 dict->getCCUtil()->params()),
96 "Penalty for words not in the frequent word dictionary",
97 dict->getCCUtil()->params()),
99 "Penalty for non-dictionary words",
100 dict->getCCUtil()->params()),
102 "Penalty for inconsistent punctuation",
103 dict->getCCUtil()->params()),
105 "Penalty for inconsistent case",
106 dict->getCCUtil()->params()),
108 "Penalty for inconsistent script",
109 dict->getCCUtil()->params()),
111 "Penalty for inconsistent character type",
112 dict->getCCUtil()->params()),
116 "Penalty for inconsistent font",
117 dict->getCCUtil()->params()),
119 "Penalty for inconsistent spacing",
120 dict->getCCUtil()->params()),
122 dict->getCCUtil()->params()),
124 dict->getCCUtil()->params()),
126 "Use sigmoidal score for certainty",
127 dict->getCCUtil()->params()),
◆ ~LanguageModel()
tesseract::LanguageModel::~LanguageModel |
( |
| ) |
|
◆ AcceptableChoiceFound()
bool tesseract::LanguageModel::AcceptableChoiceFound |
( |
| ) |
|
|
inline |
◆ AcceptablePath()
Definition at line 309 of file language_model.h.
310 return (vse.dawg_info !=
nullptr || vse.Consistent() ||
311 (vse.ngram_info !=
nullptr && !vse.ngram_info->pruned));
◆ AddViterbiStateEntry()
bool tesseract::LanguageModel::AddViterbiStateEntry |
( |
LanguageModelFlagsType |
top_choice_flags, |
|
|
float |
denom, |
|
|
bool |
word_end, |
|
|
int |
curr_col, |
|
|
int |
curr_row, |
|
|
BLOB_CHOICE * |
b, |
|
|
LanguageModelState * |
curr_state, |
|
|
ViterbiStateEntry * |
parent_vse, |
|
|
LMPainPoints * |
pain_points, |
|
|
WERD_RES * |
word_res, |
|
|
BestChoiceBundle * |
best_choice_bundle, |
|
|
BlamerBundle * |
blamer_bundle |
|
) |
| |
|
protected |
Definition at line 561 of file language_model.cpp.
573 ViterbiStateEntry_IT vit;
575 tprintf(
"AddViterbiStateEntry for unichar %s rating=%.4f"
576 " certainty=%.4f top_choice_flags=0x%x",
580 tprintf(
" parent_vse=%p\n", parent_vse);
586 if (curr_state->viterbi_state_entries_length >=
589 tprintf(
"AddViterbiStateEntry: viterbi list is full!\n");
595 LanguageModelDawgInfo *dawg_info =
598 float outline_length =
601 LanguageModelNgramInfo *ngram_info =
nullptr;
605 denom, curr_col, curr_row, outline_length, parent_vse);
608 bool liked_by_language_model = dawg_info !=
nullptr ||
609 (ngram_info !=
nullptr && !ngram_info->pruned);
612 if (!liked_by_language_model && top_choice_flags == 0) {
614 tprintf(
"Language model components very early pruned this entry\n");
622 LMConsistencyInfo consistency_info(
623 parent_vse !=
nullptr ? &parent_vse->consistency_info :
nullptr);
626 consistency_info.ComputeXheightConsistency(
629 if (consistency_info.InconsistentXHeight()) {
635 if (!liked_by_language_model && top_choice_flags == 0) {
637 tprintf(
"Language model components early pruned this entry\n");
646 word_res, &consistency_info);
647 if (dawg_info !=
nullptr && consistency_info.invalid_punc) {
648 consistency_info.invalid_punc =
false;
652 AssociateStats associate_stats;
654 parent_vse, word_res, &associate_stats);
655 if (parent_vse !=
nullptr) {
656 associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
657 associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
661 auto *new_vse =
new ViterbiStateEntry(
662 parent_vse, b, 0.0, outline_length,
663 consistency_info, associate_stats, top_choice_flags, dawg_info,
668 tprintf(
"Adjusted cost = %g\n", new_vse->cost);
672 if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
677 bool keep = new_vse->top_choice_flags || liked_by_language_model;
679 consistency_info.inconsistent_script) {
684 tprintf(
"Language model components did not like this entry\n");
694 (curr_state->viterbi_state_entries_prunable_length >=
696 new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
698 tprintf(
"Discarded ViterbiEntry with high cost %g max cost %g\n",
700 curr_state->viterbi_state_entries_prunable_max_cost);
709 best_choice_bundle, blamer_bundle);
712 new_vse != best_choice_bundle->best_vse) {
714 tprintf(
"Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
724 curr_state->viterbi_state_entries_length++;
726 curr_state->viterbi_state_entries_prunable_length++;
731 if ((curr_state->viterbi_state_entries_prunable_length >=
733 new_vse->top_choice_flags) {
734 ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
736 vit.set_to_list(&(curr_state->viterbi_state_entries));
737 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
738 ViterbiStateEntry *curr_vse = vit.data();
742 if (curr_vse->top_choice_flags && curr_vse != new_vse &&
743 curr_vse->cost > new_vse->cost) {
744 curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
746 if (prunable_counter > 0 &&
PrunablePath(*curr_vse)) --prunable_counter;
748 if (prunable_counter == 0) {
749 curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
751 tprintf(
"Set viterbi_state_entries_prunable_max_cost to %g\n",
752 curr_state->viterbi_state_entries_prunable_max_cost);
754 prunable_counter = -1;
761 new_vse->Print(
"New");
763 curr_state->Print(
"Updated viterbi list");
◆ CertaintyScore()
float tesseract::LanguageModel::CertaintyScore |
( |
float |
cert | ) |
|
|
inlineprotected |
Definition at line 112 of file language_model.h.
118 return 1.0f / (1.0f + exp(10.0f * cert));
120 return (-1.0f / cert);
◆ ComputeAdjustedPathCost()
Definition at line 1199 of file language_model.cpp.
1206 tprintf(
"ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1209 tprintf(
"%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1213 return cost * vse->outline_length;
1215 float adjustment = 1.0f;
1216 if (vse->dawg_info ==
nullptr || vse->dawg_info->permuter !=
FREQ_DAWG_PERM) {
1219 if (vse->dawg_info ==
nullptr) {
1226 if (vse->associate_stats.shape_cost > 0) {
1227 adjustment += vse->associate_stats.shape_cost /
1228 static_cast<float>(vse->length);
1232 return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1235 vse->consistency_info);
1236 return vse->ratings_sum * adjustment;
◆ ComputeAdjustment()
float tesseract::LanguageModel::ComputeAdjustment |
( |
int |
num_problems, |
|
|
float |
penalty |
|
) |
| |
|
inlineprotected |
Definition at line 124 of file language_model.h.
125 if (num_problems == 0)
return 0.0f;
126 if (num_problems == 1)
return penalty;
128 static_cast<float>(num_problems-1)));
◆ ComputeAssociateStats()
Definition at line 280 of file language_model.h.
287 (parent_vse !=
nullptr) ? &(parent_vse->associate_stats) :
nullptr,
288 (parent_vse !=
nullptr) ? parent_vse->length : 0,
◆ ComputeConsistencyAdjustment()
Definition at line 135 of file language_model.h.
138 if (dawg_info !=
nullptr) {
141 (consistency_info.inconsistent_script ?
152 (consistency_info.inconsistent_script ?
154 (consistency_info.inconsistent_font ?
◆ ComputeDenom()
float tesseract::LanguageModel::ComputeDenom |
( |
BLOB_CHOICE_LIST * |
curr_list | ) |
|
|
protected |
Definition at line 994 of file language_model.cpp.
995 if (curr_list->empty())
return 1.0f;
998 BLOB_CHOICE_IT c_it(curr_list);
999 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
◆ ComputeNgramCost()
float tesseract::LanguageModel::ComputeNgramCost |
( |
const char * |
unichar, |
|
|
float |
certainty, |
|
|
float |
denom, |
|
|
const char * |
context, |
|
|
int * |
unichar_step_len, |
|
|
bool * |
found_small_prob, |
|
|
float * |
ngram_prob |
|
) |
| |
|
protected |
Definition at line 934 of file language_model.cpp.
941 const char *context_ptr = context;
942 char *modified_context =
nullptr;
943 char *modified_context_end =
nullptr;
944 const char *unichar_ptr = unichar;
945 const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
948 while (unichar_ptr < unichar_end &&
951 tprintf(
"prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
955 ++(*unichar_step_len);
961 if (unichar_ptr < unichar_end) {
962 if (modified_context ==
nullptr) {
963 size_t context_len = strlen(context);
965 new char[context_len + strlen(unichar_ptr) + step + 1];
966 memcpy(modified_context, context, context_len);
967 modified_context_end = modified_context + context_len;
968 context_ptr = modified_context;
970 strncpy(modified_context_end, unichar_ptr - step, step);
971 modified_context_end += step;
972 *modified_context_end =
'\0';
975 prob /= static_cast<float>(*unichar_step_len);
978 *found_small_prob =
true;
981 *ngram_cost = -1.0*log2(prob);
982 float ngram_and_classifier_cost =
986 tprintf(
"-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
988 ngram_and_classifier_cost);
990 delete[] modified_context;
991 return ngram_and_classifier_cost;
◆ ConstructWord()
Definition at line 1390 of file language_model.cpp.
1396 if (truth_path !=
nullptr) {
1398 (blamer_bundle !=
nullptr &&
1402 ViterbiStateEntry *curr_vse = vse;
1409 float full_wh_ratio_mean = 0.0f;
1410 if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1411 vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1412 full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1413 static_cast<float>(vse->length));
1414 vse->associate_stats.full_wh_ratio_var = 0.0f;
1419 word->set_length(vse->length);
1420 int total_blobs = 0;
1421 for (i = (vse->length-1); i >= 0; --i) {
1422 if (blamer_bundle !=
nullptr && truth_path !=
nullptr && *truth_path &&
1424 *truth_path =
false;
1428 total_blobs += num_blobs;
1429 word->set_blob_choice(i, num_blobs, curr_b);
1433 if ((full_wh_ratio_mean != 0.0f &&
1434 ((curr_vse != vse && curr_vse->parent_vse !=
nullptr) ||
1436 vse->associate_stats.full_wh_ratio_var +=
1437 pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1439 tprintf(
"full_wh_ratio_var += (%g-%g)^2\n",
1440 full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1447 if (!compound && curr_vse->dawg_info &&
1448 curr_vse->dawg_info->permuter ==
COMPOUND_PERM) compound =
true;
1451 curr_vse = curr_vse->parent_vse;
1452 if (curr_vse ==
nullptr)
break;
1453 curr_b = curr_vse->curr_b;
1458 if (full_wh_ratio_mean != 0.0f) {
1459 vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1462 word->set_rating(vse->ratings_sum);
1463 word->set_certainty(vse->min_certainty);
1464 word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1465 vse->consistency_info.BodyMaxXHeight());
1466 if (vse->dawg_info !=
nullptr) {
1467 word->set_permuter(compound ?
COMPOUND_PERM : vse->dawg_info->permuter);
1470 }
else if (vse->top_choice_flags) {
◆ ExtractFeaturesFromPath()
void tesseract::LanguageModel::ExtractFeaturesFromPath |
( |
const ViterbiStateEntry & |
vse, |
|
|
float |
features[] |
|
) |
| |
|
static |
Definition at line 1341 of file language_model.cpp.
1345 int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1346 vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1347 if (vse.dawg_info !=
nullptr) {
1348 int permuter = vse.dawg_info->permuter;
1350 if (vse.consistency_info.num_digits == vse.length) {
1366 vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1369 if (vse.ngram_info !=
nullptr) {
1371 vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1379 vse.consistency_info.NumInconsistentChartype() : 0.0;
1381 vse.consistency_info.NumInconsistentSpaces();
1387 vse.ratings_sum / static_cast<float>(vse.outline_length);
◆ FillConsistencyInfo()
Definition at line 1015 of file language_model.cpp.
1024 BLOB_CHOICE* parent_b = parent_vse !=
nullptr ? parent_vse->curr_b :
nullptr;
1033 consistency_info->punc_ref = NO_EDGE;
1036 bool prev_is_numalpha = (parent_b !=
nullptr &&
1042 (is_apos && prev_is_numalpha)) ?
1044 if (consistency_info->punc_ref == NO_EDGE ||
1049 consistency_info->punc_ref);
1050 consistency_info->punc_ref =
1052 node, pattern_unichar_id, word_end) : NO_EDGE;
1053 if (consistency_info->punc_ref == NO_EDGE) {
1054 consistency_info->invalid_punc =
true;
1063 consistency_info->num_lower = 0;
1064 consistency_info->num_non_first_upper = 0;
1067 consistency_info->num_lower++;
1068 }
else if ((parent_b !=
nullptr) && unicharset.
get_isupper(unichar_id)) {
1070 consistency_info->num_lower > 0 ||
1071 consistency_info->num_non_first_upper > 0) {
1072 consistency_info->num_non_first_upper++;
1080 consistency_info->script_id = unicharset.
get_script(unichar_id);
1084 consistency_info->script_id == unicharset.
hiragana_sid()) ||
1086 consistency_info->script_id == unicharset.
katakana_sid())) {
1091 if (parent_vse !=
nullptr &&
1092 (parent_vse->consistency_info.script_id !=
1094 int parent_script_id = parent_vse->consistency_info.script_id;
1097 consistency_info->script_id = parent_script_id;
1099 if (consistency_info->script_id != parent_script_id) {
1100 consistency_info->inconsistent_script =
true;
1106 consistency_info->num_alphas++;
1108 consistency_info->num_digits++;
1110 consistency_info->num_other++;
1115 int fontinfo_id = -1;
1124 tprintf(
"pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1136 bool expected_gap_found =
false;
1137 float expected_gap = 0.0f;
1139 if (fontinfo_id >= 0) {
1140 ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1142 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1143 expected_gap = temp_gap;
1144 expected_gap_found =
true;
1147 consistency_info->inconsistent_font =
true;
1149 int num_addends = 0;
1151 for (
int i = 0; i < 4; ++i) {
1154 }
else if (i == 1) {
1156 }
else if (i == 2) {
1161 ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1163 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1164 expected_gap += temp_gap;
1168 if (num_addends > 0) {
1169 expected_gap /= static_cast<float>(num_addends);
1170 expected_gap_found =
true;
1173 if (expected_gap_found) {
1174 int actual_gap = word_res->
GetBlobsGap(curr_col-1);
1175 if (actual_gap == 0) {
1176 consistency_info->num_inconsistent_spaces++;
1178 float gap_ratio = expected_gap / actual_gap;
1184 if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1185 consistency_info->num_inconsistent_spaces++;
1189 tprintf(
"spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1192 unichar_id, curr_col, expected_gap, actual_gap);
◆ GenerateDawgInfo()
Definition at line 785 of file language_model.cpp.
792 if (parent_vse ==
nullptr) {
796 if (parent_vse->dawg_info ==
nullptr)
return nullptr;
810 (parent_vse ==
nullptr || parent_vse->dawg_info->permuter !=
NUMBER_PERM)) {
816 if (parent_vse ==
nullptr || word_end ||
823 bool has_word_ending =
false;
824 for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
825 const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
826 const Dawg *pdawg = pos.dawg_index < 0
828 if (pdawg ==
nullptr || pos.back_to_punc)
continue;;
830 pdawg->end_of_word(pos.dawg_ref)) {
831 has_word_ending =
true;
835 if (!has_word_ending)
return nullptr;
841 LanguageModelDawgInfo *dawg_info =
nullptr;
848 DawgPositionVector tmp_active_dawgs;
849 for (
int i = 0; i < normed_ids.
size(); ++i) {
851 tprintf(
"Test Letter OK for unichar %d, normed %d\n",
854 word_end && i == normed_ids.
size() - 1);
857 }
else if (i < normed_ids.
size() - 1) {
862 tprintf(
"Letter was OK for unichar %d, normed %d\n",
◆ GenerateNgramInfo()
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo |
( |
const char * |
unichar, |
|
|
float |
certainty, |
|
|
float |
denom, |
|
|
int |
curr_col, |
|
|
int |
curr_row, |
|
|
float |
outline_length, |
|
|
const ViterbiStateEntry * |
parent_vse |
|
) |
| |
|
protected |
Definition at line 877 of file language_model.cpp.
882 const char *pcontext_ptr =
"";
883 int pcontext_unichar_step_len = 0;
884 if (parent_vse ==
nullptr) {
888 pcontext_ptr = parent_vse->ngram_info->context.c_str();
889 pcontext_unichar_step_len =
890 parent_vse->ngram_info->context_unichar_step_len;
893 int unichar_step_len = 0;
896 float ngram_and_classifier_cost =
898 pcontext_ptr, &unichar_step_len,
899 &pruned, &ngram_cost);
903 ngram_and_classifier_cost *=
906 if (parent_vse !=
nullptr) {
907 ngram_and_classifier_cost +=
908 parent_vse->ngram_info->ngram_and_classifier_cost;
909 ngram_cost += parent_vse->ngram_info->ngram_cost;
913 int num_remove = (unichar_step_len + pcontext_unichar_step_len -
915 if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
916 while (num_remove > 0 && *pcontext_ptr !=
'\0') {
922 if (parent_vse !=
nullptr && parent_vse->ngram_info->pruned) pruned =
true;
925 auto *ngram_info =
new LanguageModelNgramInfo(
926 pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
927 ngram_and_classifier_cost);
928 ngram_info->context += unichar;
929 ngram_info->context_unichar_step_len += unichar_step_len;
◆ GenerateTopChoiceInfo()
Definition at line 769 of file language_model.cpp.
772 ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
773 for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
774 new_vse->cost >= vit.data()->cost; vit.forward()) {
777 new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
780 tprintf(
"GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
781 new_vse->top_choice_flags);
◆ GetNextParentVSE()
Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.
Definition at line 500 of file language_model.cpp.
505 for (; !vse_it->cycled_list(); vse_it->forward()) {
506 ViterbiStateEntry* parent_vse = vse_it->data();
509 if (!just_classified && !parent_vse->updated)
continue;
511 parent_vse->Print(
"Considering");
513 *top_choice_flags = blob_choice_flags;
515 !parent_vse->HasAlnumChoice(unicharset)) {
518 *top_choice_flags &= parent_vse->top_choice_flags;
526 (mixed_alnum || *top_choice_flags == 0))
532 (mixed_alnum || *top_choice_flags == 0))
537 if (parent_vse->competing_vse !=
nullptr) {
538 const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
541 tprintf(
"Parent %s has competition %s\n",
◆ getParamsModel()
ParamsModel& tesseract::LanguageModel::getParamsModel |
( |
| ) |
|
|
inline |
◆ GetTopLowerUpperDigit()
bool tesseract::LanguageModel::GetTopLowerUpperDigit |
( |
BLOB_CHOICE_LIST * |
curr_list, |
|
|
BLOB_CHOICE ** |
first_lower, |
|
|
BLOB_CHOICE ** |
first_upper, |
|
|
BLOB_CHOICE ** |
first_digit |
|
) |
| const |
|
protected |
Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.
Definition at line 383 of file language_model.cpp.
387 BLOB_CHOICE_IT c_it(curr_list);
390 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
393 if (first_unichar ==
nullptr) first_unichar = c_it.data();
394 if (*first_lower ==
nullptr && unicharset.
get_islower(unichar_id)) {
395 *first_lower = c_it.data();
397 if (*first_upper ==
nullptr && unicharset.
get_isalpha(unichar_id) &&
399 *first_upper = c_it.data();
401 if (*first_digit ==
nullptr && unicharset.
get_isdigit(unichar_id)) {
402 *first_digit = c_it.data();
406 bool mixed = (*first_lower !=
nullptr || *first_upper !=
nullptr) &&
407 *first_digit !=
nullptr;
408 if (*first_lower ==
nullptr) *first_lower = first_unichar;
409 if (*first_upper ==
nullptr) *first_upper = first_unichar;
410 if (*first_digit ==
nullptr) *first_digit = first_unichar;
◆ InitForWord()
void tesseract::LanguageModel::InitForWord |
( |
const WERD_CHOICE * |
prev_word, |
|
|
bool |
fixed_pitch, |
|
|
float |
max_char_wh_ratio, |
|
|
float |
rating_cert_scale |
|
) |
| |
◆ PrunablePath()
Definition at line 299 of file language_model.h.
300 if (vse.top_choice_flags)
return false;
301 if (vse.dawg_info !=
nullptr &&
◆ SetAcceptableChoiceFound()
void tesseract::LanguageModel::SetAcceptableChoiceFound |
( |
bool |
val | ) |
|
|
inline |
◆ SetTopParentLowerUpperDigit()
int tesseract::LanguageModel::SetTopParentLowerUpperDigit |
( |
LanguageModelState * |
parent_node | ) |
const |
|
protected |
Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
Definition at line 423 of file language_model.cpp.
425 if (parent_node ==
nullptr)
return -1;
427 ViterbiStateEntry* top_lower =
nullptr;
428 ViterbiStateEntry* top_upper =
nullptr;
429 ViterbiStateEntry* top_digit =
nullptr;
430 ViterbiStateEntry* top_choice =
nullptr;
431 float lower_rating = 0.0f;
432 float upper_rating = 0.0f;
433 float digit_rating = 0.0f;
434 float top_rating = 0.0f;
436 ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
437 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
438 ViterbiStateEntry* vse = vit.data();
441 ViterbiStateEntry* unichar_vse = vse;
442 UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
443 float rating = unichar_vse->curr_b->rating();
444 while (unichar_id == INVALID_UNICHAR_ID &&
445 unichar_vse->parent_vse !=
nullptr) {
446 unichar_vse = unichar_vse->parent_vse;
447 unichar_id = unichar_vse->curr_b->unichar_id();
448 rating = unichar_vse->curr_b->rating();
450 if (unichar_id != INVALID_UNICHAR_ID) {
452 if (top_lower ==
nullptr || lower_rating > rating) {
454 lower_rating = rating;
457 if (top_upper ==
nullptr || upper_rating > rating) {
459 upper_rating = rating;
462 if (top_digit ==
nullptr || digit_rating > rating) {
464 digit_rating = rating;
468 if (top_choice ==
nullptr || top_rating > rating) {
474 if (top_choice ==
nullptr)
return -1;
475 bool mixed = (top_lower !=
nullptr || top_upper !=
nullptr) &&
476 top_digit !=
nullptr;
477 if (top_lower ==
nullptr) top_lower = top_choice;
479 if (top_upper ==
nullptr) top_upper = top_choice;
481 if (top_digit ==
nullptr) top_digit = top_choice;
485 (top_choice->top_choice_flags &
489 top_choice->top_choice_flags |=
492 return mixed ? 1 : 0;
◆ UpdateBestChoice()
Definition at line 1241 of file language_model.cpp.
1249 blamer_bundle, &truth_path);
1254 vse->Print(word_str.
c_str());
1257 word->
print(
"UpdateBestChoice() constructed word");
1260 ParamsTrainingHypothesis curr_hyp;
1261 if (blamer_bundle !=
nullptr) {
1262 if (vse->dawg_info !=
nullptr) vse->dawg_info->permuter =
1263 static_cast<PermuterType>(word->
permuter());
1266 curr_hyp.cost = vse->cost;
1268 tprintf(
"Raw features extracted from %s (cost=%g) [ ",
1269 curr_hyp.str.c_str(), curr_hyp.cost);
1270 for (
float feature : curr_hyp.features) {
1292 tprintf(
"Updated raw choice\n");
1301 vse->consistency_info.xht_decision, 0.0,
1316 best_choice_bundle->updated =
true;
1317 best_choice_bundle->best_vse = vse;
1319 tprintf(
"Updated best choice\n");
1323 if (vse->dawg_info !=
nullptr) {
1331 if (blamer_bundle !=
nullptr) {
1333 vse->dawg_info !=
nullptr && vse->top_choice_flags);
◆ UpdateState()
UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.
This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:
- paths that are liked by the language model: either a DAWG or the n-gram model, where active.
- paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper-case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.
GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.
Definition at line 253 of file language_model.cpp.
263 tprintf(
"\nUpdateState: col=%d row=%d %s",
264 curr_col, curr_row, just_classified ?
"just_classified" :
"");
266 tprintf(
"(parent=%p)\n", parent_node);
272 bool new_changed =
false;
278 bool has_alnum_mix =
false;
279 if (parent_node !=
nullptr) {
283 tprintf(
"No parents found to process\n");
287 has_alnum_mix =
true;
291 has_alnum_mix =
false;;
292 ScanParentsForCaseMix(unicharset, parent_node);
294 parent_node->Print(
"Parent viterbi list");
296 LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
299 ViterbiStateEntry_IT vit;
300 BLOB_CHOICE_IT c_it(curr_list);
301 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
313 if (c_it.at_first() || !new_changed)
317 if (first_digit == choice) blob_choice_flags |=
kDigitFlag;
319 if (parent_node ==
nullptr) {
331 if (HasBetterCaseVariant(unicharset, choice, curr_list))
337 blob_choice_flags, denom, word_end, curr_col, curr_row,
338 choice, curr_state,
nullptr, pain_points,
339 word_res, best_choice_bundle, blamer_bundle);
342 vit.set_to_list(&parent_node->viterbi_state_entries);
345 ViterbiStateEntry* parent_vse =
nullptr;
348 c_it.data(), blob_choice_flags,
349 unicharset, word_res, &vit,
350 &top_choice_flags)) !=
nullptr) {
362 if (!parent_vse->HasAlnumChoice(unicharset) &&
363 HasBetterCaseVariant(unicharset, choice, curr_list))
368 top_choice_flags, denom, word_end, curr_col, curr_row,
369 c_it.data(), curr_state, parent_vse, pain_points,
370 word_res, best_choice_bundle, blamer_bundle);
◆ acceptable_choice_found_
bool tesseract::LanguageModel::acceptable_choice_found_ = false |
|
protected |
◆ beginning_active_dawgs_
◆ correct_segmentation_explored_
bool tesseract::LanguageModel::correct_segmentation_explored_ = false |
|
protected |
◆ dawg_args_
DawgArgs tesseract::LanguageModel::dawg_args_ |
|
protected |
◆ dict_
Dict* tesseract::LanguageModel::dict_ = nullptr |
|
protected |
◆ fixed_pitch_
bool tesseract::LanguageModel::fixed_pitch_ = false |
|
protected |
◆ fontinfo_table_
◆ kDigitFlag
◆ kLowerCaseFlag
◆ kMaxAvgNgramCost
const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f |
|
static |
◆ kSmallestRatingFlag
◆ kUpperCaseFlag
◆ kXhtConsistentFlag
◆ language_model_debug_level
int tesseract::LanguageModel::language_model_debug_level = 0 |
◆ language_model_min_compound_length
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
◆ language_model_ngram_nonmatch_score
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 330 of file language_model.h.
◆ language_model_ngram_on
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 318 of file language_model.h.
◆ language_model_ngram_order
int tesseract::LanguageModel::language_model_ngram_order = 8 |
◆ language_model_ngram_rating_factor
double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0 |
"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "
Definition at line 339 of file language_model.h.
◆ language_model_ngram_scale_factor
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the" " character classifier "
Definition at line 336 of file language_model.h.
◆ language_model_ngram_small_prob
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"
Definition at line 328 of file language_model.h.
◆ language_model_ngram_space_delimited_language
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
◆ language_model_ngram_use_only_first_uft8_step
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string" " when computing log probabilities"
Definition at line 333 of file language_model.h.
◆ language_model_penalty_case
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
◆ language_model_penalty_chartype
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
◆ language_model_penalty_font
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
◆ language_model_penalty_increment
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
◆ language_model_penalty_non_dict_word
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
◆ language_model_penalty_non_freq_dict_word
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 346 of file language_model.h.
◆ language_model_penalty_punc
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
◆ language_model_penalty_script
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
◆ language_model_penalty_spacing
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
◆ language_model_use_sigmoidal_certainty
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
◆ language_model_viterbi_list_max_num_prunable
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 323 of file language_model.h.
◆ language_model_viterbi_list_max_size
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 325 of file language_model.h.
◆ max_char_wh_ratio_
float tesseract::LanguageModel::max_char_wh_ratio_ = 0.0f |
|
protected |
◆ params_model_
◆ prev_word_str_
STRING tesseract::LanguageModel::prev_word_str_ |
|
protected |
◆ prev_word_unichar_step_len_
int tesseract::LanguageModel::prev_word_unichar_step_len_ = 0 |
|
protected |
◆ rating_cert_scale_
float tesseract::LanguageModel::rating_cert_scale_ = 0.0f |
|
protected |
◆ very_beginning_active_dawgs_
◆ wordrec_display_segmentations
int tesseract::LanguageModel::wordrec_display_segmentations = 0 |
The documentation for this class was generated from the following files:
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
const STRING & unichar_string() const
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int correct_segmentation_length() const
bool get_islower(UNICHAR_ID unichar_id) const
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
GenericVector< int > blob_widths
bool language_model_ngram_on
int wordrec_display_segmentations
ParamsModel params_model_
double language_model_penalty_non_dict_word
bool get_isdigit(UNICHAR_ID unichar_id) const
bool get_isalpha(UNICHAR_ID unichar_id) const
bool correct_segmentation_explored_
bool LogNewRawChoice(WERD_CHOICE *word_choice)
double language_model_penalty_script
static const LanguageModelFlagsType kXhtConsistentFlag
int tessedit_truncate_wordchoice_log
#define INT_MEMBER(name, val, comment, vec)
double language_model_penalty_spacing
double language_model_penalty_non_freq_dict_word
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
DawgPositionVector very_beginning_active_dawgs_
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
static const float kBadRating
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
UNICHAR_ID unichar_id() const
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
int get_script(UNICHAR_ID unichar_id) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
const MATRIX_COORD & matrix_cell()
const UNICHARSET * uch_set
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
bool language_model_ngram_use_only_first_uft8_step
void UpdateBestRating(float rating)
double language_model_ngram_small_prob
double language_model_ngram_rating_factor
WERD_CHOICE * best_choice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
const char * c_str() const
bool language_model_ngram_space_delimited_language
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
DawgPositionVector * updated_dawgs
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
bool GuidedSegsearchStillGoing() const
GenericVector< TBLOB * > blobs
static int utf8_step(const char *utf8_str)
#define double_MEMBER(name, val, comment, vec)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
double language_model_penalty_case
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
void set_rating(float new_val)
float ComputeAdjustment(int num_problems, float penalty)
int language_model_ngram_order
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
void set_best_choice_is_dict_and_top_choice(bool value)
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
void DisplaySegmentation(TWERD *word)
static const LanguageModelFlagsType kUpperCaseFlag
int prev_word_unichar_step_len_
bool compound_marker(UNICHAR_ID unichar_id)
bool acceptable_choice_found_
const UnicityTable< FontInfo > * fontinfo_table_
bool language_model_use_sigmoidal_certainty
double language_model_penalty_increment
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
int16_t fontinfo_id() const
static int Compare(const void *e1, const void *e2)
bool get_isupper(UNICHAR_ID unichar_id) const
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
void print_state(const char *msg) const
int GetBlobsGap(int blob_index)
float ComputeCost(const float features[]) const
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
int language_model_viterbi_list_max_num_prunable
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
int16_t fontinfo_id2() const
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
void reset_hyphen_vars(bool last_word_on_line)
double language_model_penalty_chartype
float CertaintyScore(float cert)
int language_model_min_compound_length
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
int language_model_viterbi_list_max_size
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
static const LanguageModelFlagsType kLowerCaseFlag
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
double language_model_ngram_nonmatch_score
DLLSYM void tprintf(const char *format,...)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
const UNICHARSET & getUnicharset() const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
static const UNICHAR_ID kPatternUnicharID
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
DawgPositionVector * active_dawgs
static const LanguageModelFlagsType kSmallestRatingFlag
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
bool is_apostrophe(UNICHAR_ID unichar_id)
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
static const LanguageModelFlagsType kDigitFlag
DawgPositionVector beginning_active_dawgs_
const char * id_to_unichar(UNICHAR_ID id) const
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
double language_model_penalty_font
int language_model_debug_level
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
#define BOOL_MEMBER(name, val, comment, vec)
bool PrunablePath(const ViterbiStateEntry &vse)
double language_model_penalty_punc
bool AcceptablePath(const ViterbiStateEntry &vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double language_model_ngram_scale_factor