static inline double log2(double n) {
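// LanguageModel constructor: the initializer list below registers the tunable
// language-model parameters (debug level, ngram settings, Viterbi list limits,
// and the various consistency penalties) with the Dict's parameter vector
// obtained via dict->getCCUtil()->params().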
    : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
                 dict->getCCUtil()->params()),
      "Turn on/off the use of character ngram model",
      dict->getCCUtil()->params()),
      "Maximum order of the character ngram model",
      dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
                 "Maximum number of prunable (those for which"
                 " PrunablePath() is true) entries in each viterbi list"
                 " recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_size, 500,
                 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      "To avoid overly small denominators use this as the "
      "floor of the probability returned by the ngram model.",
      dict->getCCUtil()->params()),
      "Average classifier score of a non-matching unichar.",
      dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  dict->getCCUtil()->params()),
      "Strength of the character ngram model relative to the"
      " character classifier ",
      dict->getCCUtil()->params()),
      "Factor to bring log-probs into the same range as ratings"
      " when multiplied by outline length ",
      dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                  "Words are delimited by space", dict->getCCUtil()->params()),
      INT_MEMBER(language_model_min_compound_length, 3,
                 "Minimum length of compound words",
                 dict->getCCUtil()->params()),
      "Penalty for words not in the frequent word dictionary",
      dict->getCCUtil()->params()),
      "Penalty for non-dictionary words",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent punctuation",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent case",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent script",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent character type",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent font",
      dict->getCCUtil()->params()),
      "Penalty for inconsistent spacing",
      dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
                    dict->getCCUtil()->params()),
      INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
                 dict->getCCUtil()->params()),
      "Use sigmoidal score for certainty",
      dict->getCCUtil()->params()),
      fontinfo_table_(fontinfo_table),
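// Per-word setup (likely LanguageModel::InitForWord): a non-null previous
// word supplies the unichar context that seeds the character ngram model.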
                             bool fixed_pitch, float max_char_wh_ratio,
                             float rating_cert_scale) {
  if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
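// ScanParentsForCaseMix: for each entry in the parent Viterbi list, checks
// whether the opposite-case variant of its unichar is also present among the
// parents, so that case competition can be taken into account later.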
static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
  if (parent_node == nullptr) return;
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      if (other_case == unichar_id) continue;
      for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
           vit2.data()->curr_b->unichar_id() != other_case;
      if (!vit2.cycled_list()) {
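// HasBetterCaseVariant: returns true if the opposite-case version of the
// given choice appears before the choice itself (i.e. with a better rating)
// in the BLOB_CHOICE_LIST.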
static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
                                 BLOB_CHOICE_LIST* choices) {
  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
  BLOB_CHOICE_IT bc_it(choices);
  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
    if (better_choice->unichar_id() == other_case)
    else if (better_choice == choice)
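// UpdateState: main entry point of the language model. Processes the freshly
// classified BLOB_CHOICE_LIST for cell (curr_col, curr_row), extending the
// Viterbi paths recorded in parent_node with each candidate choice via
// AddViterbiStateEntry().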
                               bool just_classified,
                               int curr_col, int curr_row,
                               BLOB_CHOICE_LIST *curr_list,
    tprintf("\nUpdateState: col=%d row=%d %s", curr_col, curr_row,
            just_classified ? "just_classified" : "");
    tprintf("(parent=%p)\n", parent_node);
  bool new_changed = false;
  bool has_alnum_mix = false;
  if (parent_node != nullptr) {
      tprintf("No parents found to process\n");
      has_alnum_mix = true;
      has_alnum_mix = false;
    ScanParentsForCaseMix(unicharset, parent_node);
    parent_node->Print("Parent viterbi list");
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    if (c_it.at_first() || !new_changed)
    if (first_digit == choice) blob_choice_flags |= kDigitFlag;
    if (parent_node == nullptr) {
        if (HasBetterCaseVariant(unicharset, choice, curr_list))
          blob_choice_flags, denom, word_end, curr_col, curr_row,
          choice, curr_state, nullptr, pain_points,
          word_res, best_choice_bundle, blamer_bundle);
               c_it.data(), blob_choice_flags,
               unicharset, word_res, &vit,
               &top_choice_flags)) != nullptr) {
            HasBetterCaseVariant(unicharset, choice, curr_list))
            top_choice_flags, denom, word_end, curr_col, curr_row,
            c_it.data(), curr_state, parent_vse, pain_points,
            word_res, best_choice_bundle, blamer_bundle);
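// Helper that finds the first lower-case, upper-case and digit choices in a
// BLOB_CHOICE_LIST, substituting the first choice in the list when a category
// is absent, and reporting whether alphabetic and digit candidates are mixed.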
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    if (first_unichar == nullptr) first_unichar = c_it.data();
    if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
      *first_upper = c_it.data();
    if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
               *first_digit != nullptr;
  if (*first_lower == nullptr) *first_lower = first_unichar;
  if (*first_upper == nullptr) *first_upper = first_unichar;
  if (*first_digit == nullptr) *first_digit = first_unichar;
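// Parent-node counterpart of the helper above: picks the best-rated
// lower-case, upper-case and digit entries among the parent Viterbi entries,
// falling back to the overall top choice; returns -1 with no parent, 1 for an
// alpha/digit mix, 0 otherwise.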
  if (parent_node == nullptr) return -1;
  float lower_rating = 0.0f;
  float upper_rating = 0.0f;
  float digit_rating = 0.0f;
  float top_rating = 0.0f;
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    while (unichar_id == INVALID_UNICHAR_ID &&
    if (unichar_id != INVALID_UNICHAR_ID) {
      if (top_lower == nullptr || lower_rating > rating) {
        lower_rating = rating;
      if (top_upper == nullptr || upper_rating > rating) {
        upper_rating = rating;
      if (top_digit == nullptr || digit_rating > rating) {
        digit_rating = rating;
    if (top_choice == nullptr || top_rating > rating) {
  if (top_choice == nullptr) return -1;
  bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
               top_digit != nullptr;
  if (top_lower == nullptr) top_lower = top_choice;
  if (top_upper == nullptr) top_upper = top_choice;
  if (top_digit == nullptr) top_digit = top_choice;
  return mixed ? 1 : 0;
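// Iterates over the parent ViterbiStateEntries for the current blob choice:
// skips entries that were not updated since the last classification and
// resets/updates top_choice_flags based on competition from the parent list.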
    bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
    WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    if (!just_classified && !parent_vse->updated) continue;
      parent_vse->Print("Considering");
    *top_choice_flags = blob_choice_flags;
         (mixed_alnum || *top_choice_flags == 0))
         (mixed_alnum || *top_choice_flags == 0))
        tprintf("Parent %s has competition %s\n",
    int curr_col, int curr_row,
  ViterbiStateEntry_IT vit;
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
    tprintf(" parent_vse=%p\n", parent_vse);
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
  float outline_length =
      denom, curr_col, curr_row, outline_length, parent_vse);
  bool liked_by_language_model = dawg_info != nullptr ||
      (ngram_info != nullptr && !ngram_info->pruned);
  if (!liked_by_language_model && top_choice_flags == 0) {
      tprintf("Language model components very early pruned this entry\n");
  if (!liked_by_language_model && top_choice_flags == 0) {
      tprintf("Language model components early pruned this entry\n");
                       word_res, &consistency_info);
  if (dawg_info != nullptr && consistency_info.invalid_punc) {
                         parent_vse, word_res, &associate_stats);
  if (parent_vse != nullptr) {
      parent_vse, b, 0.0, outline_length,
      consistency_info, associate_stats, top_choice_flags, dawg_info,
    tprintf("Adjusted cost = %g\n", new_vse->cost);
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
      tprintf("Language model components did not like this entry\n");
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
                   best_choice_bundle, blamer_bundle);
      new_vse != best_choice_bundle->best_vse) {
      tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      new_vse->top_choice_flags) {
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
          curr_vse->cost > new_vse->cost) {
      if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
      if (prunable_counter == 0) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
        prunable_counter = -1;
    new_vse->Print("New");
    curr_state->Print("Updated viterbi list");
  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
       new_vse->cost >= vit.data()->cost; vit.forward()) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
    int curr_col, int curr_row,
  if (parent_vse == nullptr) {
    if (parent_vse->dawg_info == nullptr) return nullptr;
  if (parent_vse == nullptr || word_end ||
    bool has_word_ending = false;
        has_word_ending = true;
    if (!has_word_ending) return nullptr;
  for (int i = 0; i < normed_ids.size(); ++i) {
      tprintf("Test Letter OK for unichar %d, normed %d\n",
                            word_end && i == normed_ids.size() - 1);
    } else if (i < normed_ids.size() - 1) {
      tprintf("Letter was OK for unichar %d, normed %d\n",
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row, float outline_length,
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == nullptr) {
    pcontext_unichar_step_len =
  int unichar_step_len = 0;
  float ngram_and_classifier_cost =
                       pcontext_ptr, &unichar_step_len,
                       &pruned, &ngram_cost);
  ngram_and_classifier_cost *=
  if (parent_vse != nullptr) {
    ngram_and_classifier_cost +=
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
      ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
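// Ngram cost computation: steps through the UTF-8 components of unichar,
// querying the ngram model for the probability of each component given the
// context; probabilities below the configured floor flag a small probability,
// and the result combines -log2 of the ngram probability with a
// classifier-based term.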
                                         int *unichar_step_len,
                                         bool *found_small_prob,
  const char *context_ptr = context;
  char *modified_context = nullptr;
  char *modified_context_end = nullptr;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  while (unichar_ptr < unichar_end &&
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
    ++(*unichar_step_len);
    if (unichar_ptr < unichar_end) {
      if (modified_context == nullptr) {
        size_t context_len = strlen(context);
            new char[context_len + strlen(unichar_ptr) + step + 1];
        memcpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
  prob /= static_cast<float>(*unichar_step_len);
    *found_small_prob = true;
  *ngram_cost = -1.0 * log2(prob);
  float ngram_and_classifier_cost =
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
            ngram_and_classifier_cost);
  delete[] modified_context;
  return ngram_and_classifier_cost;
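// Denominator computation for the ngram model: an empty choice list yields
// 1.0; otherwise the loop below appears to accumulate a normalizing factor
// over the choices in curr_list.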
  if (curr_list->empty()) return 1.0f;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
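// Consistency checks for the candidate letter: punctuation is validated
// against the punctuation dawg (punc_ref), case and script are compared with
// the parent entry, and font/spacing agreement is measured via the expected
// inter-blob gap for the chosen fonts.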
    consistency_info->punc_ref = NO_EDGE;
    bool prev_is_numalpha = (parent_b != nullptr &&
                              (is_apos && prev_is_numalpha)) ?
    if (consistency_info->punc_ref == NO_EDGE ||
          node, pattern_unichar_id, word_end) : NO_EDGE;
      if (consistency_info->punc_ref == NO_EDGE) {
  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
  if (parent_vse != nullptr &&
    consistency_info->script_id = parent_script_id;
    if (consistency_info->script_id != parent_script_id) {
  int fontinfo_id = -1;
    tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
  bool expected_gap_found = false;
  float expected_gap = 0.0f;
  if (fontinfo_id >= 0) {
    ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
      expected_gap = temp_gap;
      expected_gap_found = true;
    int num_addends = 0;
    for (int i = 0; i < 4; ++i) {
      } else if (i == 1) {
      } else if (i == 2) {
      ASSERT_HOST(temp_fid < 0 || temp_fid < fontinfo_table_->size());
              parent_b->unichar_id(), unichar_id, &temp_gap)) {
        expected_gap += temp_gap;
    if (num_addends > 0) {
      expected_gap /= static_cast<float>(num_addends);
      expected_gap_found = true;
  if (expected_gap_found) {
    int actual_gap = word_res->GetBlobsGap(curr_col - 1);
    if (actual_gap == 0) {
      float gap_ratio = expected_gap / actual_gap;
      if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
        tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
                unichar_id, curr_col, expected_gap, actual_gap);
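// ComputeAdjustedPathCost: combines the raw path cost with ParamsModel
// features (or a hand-tuned adjustment factor) to produce the cost that is
// actually used to rank ViterbiStateEntries.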
    tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
  float adjustment = 1.0f;
                     static_cast<float>(vse->length);
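// UpdateBestChoice: builds a WERD_CHOICE for the path ending at this entry,
// hands raw features and hypotheses to the blamer bundle when present, and
// records the word in best_choice_bundle when it improves on the current
// raw/best choice.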
                              blamer_bundle, &truth_path);
    word->print("UpdateBestChoice() constructed word");
  if (blamer_bundle != nullptr) {
        static_cast<PermuterType>(word->permuter());
      tprintf("Raw features extracted from %s (cost=%g) [ ",
      for (float feature : curr_hyp.features) {
      tprintf("Updated raw choice\n");
    best_choice_bundle->updated = true;
    best_choice_bundle->best_vse = vse;
      tprintf("Updated best choice\n");
    if (blamer_bundle != nullptr) {
  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
            vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  if (truth_path != nullptr) {
        (blamer_bundle != nullptr &&
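// Word construction (likely ConstructWord): walks the ViterbiStateEntry chain
// from the last unichar back to the first, filling in blob choices and blob
// counts, accumulating width/height-ratio statistics, and clearing
// *truth_path when the assembled segmentation diverges from the truth.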
  float full_wh_ratio_mean = 0.0f;
                             static_cast<float>(vse->length));
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length - 1); i >= 0; --i) {
    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
      *truth_path = false;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
    if (curr_vse == nullptr) break;
    curr_b = curr_vse->curr_b;
  if (full_wh_ratio_mean != 0.0f) {