23 #include "config_auto.h" 33 #ifndef DISABLED_LEGACY_ENGINE 50 #define MIN_FONT_ROW_COUNT 8 51 #define MAX_XHEIGHT_DIFF 3 68 TBOX &selection_box) {
84 int16_t good_char_qual;
89 if (lstm_recognizer_ ==
nullptr) {
90 #ifndef DISABLED_LEGACY_ENGINE 92 #endif // ndef DISABLED_LEGACY_ENGINE 96 #ifndef DISABLED_LEGACY_ENGINE 100 tprintf(
"\n%d chars; word_blob_quality: %d; outline_errs: %d; " 101 "char_quality: %d; good_char_quality: %d\n",
106 #endif // ndef DISABLED_LEGACY_ENGINE 126 const TBOX& target_word_box,
127 const char* word_config,
129 if (word_config !=
nullptr) {
131 if (backup_config_file_ ==
nullptr) {
133 FILE* config_fp = fopen(backup_config_file_,
"wb");
134 if (config_fp ==
nullptr) {
135 tprintf(
"Error, failed to open file \"%s\"\n", backup_config_file_);
145 if (backup_config_file_ !=
nullptr) {
149 backup_config_file_ =
nullptr;
152 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
160 const TBOX* target_word_box,
161 const char* word_config,
168 if (target_word_box ==
nullptr ||
170 *target_word_box, word_config, 1)) {
175 for (
int w = 0; w < words->
size(); ++w) {
177 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
183 if (pass_n == 1 || !word->
word->
done) {
191 }
else if (pass_n == 2) {
198 for (
int s = 0; s <= sub_langs_.size(); ++s) {
200 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] :
this;
227 for (
int w = 0; w < words->
size(); ++w) {
229 if (w > 0) word->
prev_word = &(*words)[w - 1];
230 if (monitor !=
nullptr) {
249 for (; w < words->
size(); ++w) {
263 while (pr_it->
word() !=
nullptr && pr_it->
word() != word->
word)
266 bool make_next_word_fuzzy =
false;
275 tprintf(
"Pass%d: %s [%s]\n", pass_n,
280 if (make_next_word_fuzzy && pr_it->
word() !=
nullptr) {
310 const TBOX* target_word_box,
311 const char* word_config,
320 if (dopasses==0 || dopasses==1) {
324 #ifndef DISABLED_LEGACY_ENGINE 335 for (
int i = 0; i < sub_langs_.size(); ++i) {
337 sub_langs_[i]->SwitchAdaptiveClassifier();
339 sub_langs_[i]->StartBackupAdaptiveClassifier();
343 #endif // ndef DISABLED_LEGACY_ENGINE 349 #ifndef DISABLED_LEGACY_ENGINE 353 #endif // ndef DISABLED_LEGACY_ENGINE 364 most_recently_used_ =
this;
389 if (dopasses == 1)
return true;
391 #ifndef DISABLED_LEGACY_ENGINE 402 most_recently_used_ =
this;
433 #endif // ndef DISABLED_LEGACY_ENGINE 440 #ifndef DISABLED_LEGACY_ENGINE 446 #endif //ndef DISABLED_LEGACY_ENGINE 465 if (monitor !=
nullptr) {
471 #ifndef DISABLED_LEGACY_ENGINE 480 while (word_it.
forward() !=
nullptr &&
484 if (!word_it.
word())
break;
491 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
516 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n",
522 tprintf(
"Examining alt choices for \"%s %s\".\n",
533 float best_rating = 0.0;
536 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
545 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
556 if (overrides_word1.
size() == 1 ||
559 best_idx = overrides_word1.
size() - 1;
564 if (!overrides_word1.
empty()) {
567 *overrides_word1[best_idx]) &&
569 *overrides_word2[best_idx])) {
571 tprintf(
"Top choice \"%s %s\" verified (sans case) by bigram " 572 "model.\n", orig_w1_str.
string(), orig_w2_str.
string());
576 const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
577 const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
578 if (new_w1_str != orig_w1_str) {
581 if (new_w2_str != orig_w2_str) {
585 STRING choices_description;
586 int num_bigram_choices
587 = overrides_word1.
size() * overrides_word2.
size();
588 if (num_bigram_choices == 1) {
589 choices_description =
"This was the unique bigram choice.";
593 const int kMaxChoicesToPrint = 20;
594 for (
int i = 0; i < overrides_word1.
size() &&
595 i < kMaxChoicesToPrint; i++) {
596 if (i > 0) { bigrams_list +=
", "; }
601 choices_description =
"There were many choices: {";
602 choices_description += bigrams_list;
603 choices_description +=
"}";
605 choices_description.
add_str_int(
"There were ", num_bigram_choices);
606 choices_description +=
" compatible bigrams.";
609 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
612 choices_description.
string());
620 const TBOX* target_word_box,
621 const char* word_config) {
630 if (monitor !=
nullptr) {
643 if (target_word_box &&
645 *target_word_box, word_config, 4)) {
659 int16_t all_char_quality;
660 int16_t accepted_all_char_quality;
662 &all_char_quality, &accepted_all_char_quality);
673 (blob_quality == 0) && (outline_errs >= chars_in_word))
681 (
"QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" 682 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
696 bool good_quality_doc =
714 #endif // ndef DISABLED_LEGACY_ENGINE 728 static_cast<IncorrectResultReason>(bl)),
750 float word_x_height = word->
x_height;
751 if (word_x_height < word->best_choice->min_x_height() ||
759 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
761 small_cap_xheight - small_cap_delta <= word_x_height &&
762 word_x_height <= small_cap_xheight + small_cap_delta) {
772 if (num_upper > 0 && num_lower == 0)
783 *next_left = INT32_MAX;
784 if (index < words.
size()) {
785 *right = words[index]->word->bounding_box().right();
786 if (index + 1 < words.
size())
787 *next_left = words[index + 1]->word->bounding_box().left();
793 static void EvaluateWordSpan(
const PointerVector<WERD_RES>& words,
794 int first_index,
int end_index,
float* rating,
795 float* certainty,
bool* bad,
796 bool* valid_permuter) {
797 if (end_index <= first_index) {
799 *valid_permuter =
false;
801 for (
int index = first_index; index < end_index && index < words.size();
804 if (choice ==
nullptr) {
807 *rating += choice->
rating();
808 *certainty = std::min(*certainty, choice->
certainty());
810 *valid_permuter =
false;
822 static int SelectBestWords(
double rating_ratio,
823 double certainty_margin,
825 PointerVector<WERD_RES>* new_words,
826 PointerVector<WERD_RES>* best_words) {
832 int num_best = 0, num_new = 0;
833 while (b < best_words->size() || n < new_words->size()) {
835 int start_b = b, start_n = n;
836 while (b < best_words->size() || n < new_words->size()) {
837 int b_right = -INT32_MAX;
838 int next_b_left = INT32_MAX;
839 WordGap(*best_words, b, &b_right, &next_b_left);
840 int n_right = -INT32_MAX;
841 int next_n_left = INT32_MAX;
842 WordGap(*new_words, n, &n_right, &next_n_left);
843 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
848 if ((b_right < n_right && b < best_words->size()) ||
849 n == new_words->size())
855 float b_rating = 0.0f, n_rating = 0.0f;
857 float b_certainty = 0.0f, n_certainty = 0.0f;
859 bool b_bad =
false, n_bad =
false;
861 bool b_valid_permuter =
true, n_valid_permuter =
true;
862 const int end_b = b < best_words->size() ? b + 1 : b;
863 const int end_n = n < new_words->size() ? n + 1 : n;
864 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
865 &b_bad, &b_valid_permuter);
866 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
867 &n_bad, &n_valid_permuter);
868 bool new_better =
false;
869 if (!n_bad && (b_bad || (n_certainty > b_certainty &&
870 n_rating < b_rating) ||
871 (!b_valid_permuter && n_valid_permuter &&
872 n_rating < b_rating * rating_ratio &&
873 n_certainty > b_certainty - certainty_margin))) {
875 for (
int i = start_n; i < end_n; ++i) {
877 (*new_words)[i] =
nullptr;
883 for (
int i = start_b; i < end_b; ++i) {
885 (*best_words)[i] =
nullptr;
890 tprintf(
"%d new words %s than %d old words: r: %g v %g c: %g v %g" 891 " valid dict: %d v %d\n",
892 end_n - start_n, new_better ?
"better" :
"worse",
893 end_b - start_b, n_rating, b_rating,
894 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
902 for (
int i = 0; i < out_words.
size(); ++i)
903 best_words->push_back(out_words[i]);
904 return num_new - num_best;
915 tprintf(
"Trying word using lang %s, oem %d\n",
920 (this->*recognizer)(word_data, in_word, &new_words);
921 if (new_words.
empty()) {
928 for (
int i = 0; i < new_words.
size(); ++i)
929 new_words[i]->DebugTopChoice(
"Lang result");
935 debug, &new_words, best_words);
940 for (
int w = 0; w < words.
size(); ++w) {
941 if (words[w]->tess_failed || !words[w]->tess_accepted)
return false;
950 bool* make_next_word_fuzzy) {
951 #ifdef DISABLED_LEGACY_ENGINE 954 *make_next_word_fuzzy =
false;
968 &word_wanted, &overlapped_any_blob,
976 int num_overlapped = 0;
977 int num_overlapped_used = 0;
978 for (
int i = 0; i < overlapped_any_blob.
size(); ++i) {
979 if (overlapped_any_blob[i]) {
981 if (word_wanted[i]) ++num_overlapped_used;
985 outlines[i] =
nullptr;
991 int non_overlapped = 0;
992 int non_overlapped_used = 0;
993 for (
int i = 0; i < word_wanted.
size(); ++i) {
994 if (word_wanted[i]) ++non_overlapped_used;
995 if (outlines[i] !=
nullptr) ++non_overlapped_used;
998 tprintf(
"Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
999 num_overlapped_used, num_overlapped, non_overlapped_used,
1005 make_next_word_fuzzy)) {
1010 return num_overlapped_used != 0 || non_overlapped_used != 0;
1011 #endif // ndef DISABLED_LEGACY_ENGINE 1024 #ifndef DISABLED_LEGACY_ENGINE 1034 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1035 C_BLOB* blob = blob_it.data();
1038 int num_blob_outlines = 0;
1039 for (
int i = 0; i < outlines.
size(); ++i) {
1041 !(*word_wanted)[i]) {
1042 blob_wanted[i] =
true;
1043 (*overlapped_any_blob)[i] =
true;
1044 ++num_blob_outlines;
1048 tprintf(
"%d noise outlines overlap blob at:", num_blob_outlines);
1057 outlines, num_blob_outlines,
1059 for (
int i = 0; i < blob_wanted.
size(); ++i) {
1060 if (blob_wanted[i]) {
1062 (*word_wanted)[i] =
true;
1063 (*target_blobs)[i] = blob;
1069 #endif // ndef DISABLED_LEGACY_ENGINE 1078 #ifndef DISABLED_LEGACY_ENGINE 1083 for (
int i = 0; i < outlines.
size(); ++i) {
1084 if (outlines[i] ==
nullptr)
continue;
1087 int num_blob_outlines = 0;
1088 TBOX total_ol_box(outlines[i]->bounding_box());
1089 while (i < outlines.
size() && outlines[i] !=
nullptr) {
1090 blob_wanted[i] =
true;
1091 total_ol_box += outlines[i]->bounding_box();
1093 ++num_blob_outlines;
1097 while (!blob_it.at_last() &&
1098 blob_it.data_relative(1)->bounding_box().left() <=
1099 total_ol_box.
left()) {
1105 tprintf(
"Num blobless outlines = %d\n", num_blob_outlines);
1106 C_BLOB* left_blob = blob_it.data();
1108 C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1109 if ((left_box.
x_overlap(total_ol_box) || right_blob ==
nullptr ||
1112 outlines, num_blob_outlines,
1115 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1116 if (blob_wanted[j]) {
1117 (*word_wanted)[j] =
true;
1118 (*target_blobs)[j] = left_blob;
1121 }
else if (right_blob !=
nullptr &&
1125 right_blob, outlines,
1126 num_blob_outlines, &blob_wanted)) {
1128 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1129 if (blob_wanted[j]) {
1130 (*word_wanted)[j] =
true;
1131 (*target_blobs)[j] = right_blob;
1135 outlines, num_blob_outlines,
1138 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1139 if (blob_wanted[j]) {
1140 (*word_wanted)[j] =
true;
1141 (*target_blobs)[j] =
nullptr;
1146 #endif // ndef DISABLED_LEGACY_ENGINE 1156 #ifndef DISABLED_LEGACY_ENGINE 1158 float target_cert = certainty_threshold;
1159 if (blob !=
nullptr) {
1163 tprintf(
"No Noise blob classified as %s=%g(%g) at:", best_str.
string(),
1164 target_cert, target_c2);
1174 pr_it, blob, &all_str);
1177 for (
int i = 0; i < test_outlines.
size(); ++i) {
1178 if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1180 tprintf(
"All Noise blob classified as %s=%g, delta=%g at:",
1181 all_str.
string(), best_cert, best_cert - target_cert);
1187 while (num_outlines > 1 && best_index >= 0 &&
1188 (blob ==
nullptr || best_cert < target_cert || blob !=
nullptr)) {
1191 for (
int i = 0; i < outlines.
size(); ++i) {
1192 if (test_outlines[i]) {
1193 test_outlines[i] =
false;
1199 for (
int j = 0; j < outlines.
size(); ++j) {
1200 if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1201 tprintf(
"%d", test_outlines[j]);
1203 tprintf(
" blob classified as %s=%g, delta=%g) at:", str.
string(),
1204 cert, cert - target_cert);
1207 if (cert > best_cert) {
1210 best_outlines = test_outlines;
1212 test_outlines[i] =
true;
1215 if (best_index >= 0) {
1216 test_outlines[best_index] =
false;
1220 if (best_cert >= target_cert) {
1222 *ok_outlines = best_outlines;
1224 tprintf(
"%s noise combination ", blob ?
"Adding" :
"New");
1225 for (
int i = 0; i < best_outlines.
size(); ++i) {
1226 tprintf(
"%d", best_outlines[i]);
1228 tprintf(
" yields certainty %g, beating target of %g\n", best_cert,
1233 #endif // ndef DISABLED_LEGACY_ENGINE 1243 #ifndef DISABLED_LEGACY_ENGINE 1246 C_BLOB* local_blob =
nullptr;
1247 if (blob !=
nullptr) {
1249 ol_it.set_to_list(blob->
out_list());
1250 first_to_keep = ol_it.data();
1252 for (
int i = 0; i < ok_outlines.
size(); ++i) {
1253 if (ok_outlines[i]) {
1255 if (blob ==
nullptr) {
1256 local_blob =
new C_BLOB(outlines[i]);
1258 ol_it.set_to_list(blob->
out_list());
1260 ol_it.add_before_stay_put(outlines[i]);
1266 ol_it.move_to_first();
1267 if (first_to_keep ==
nullptr) {
1269 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1274 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1281 #endif // ndef DISABLED_LEGACY_ENGINE 1289 #ifndef DISABLED_LEGACY_ENGINE 1303 if (wd.word->raw_choice != NULL) {
1304 tprintf(
"word xheight=%g, row=%g, range=[%g,%g]\n", word_res->
x_height,
1305 wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1306 wd.word->raw_choice->max_x_height());
1308 tprintf(
"Got word with null raw choice xheight=%g, row=%g\n", word_res->
x_height,
1309 wd.row->x_height());
1313 if (wd.word->raw_choice != NULL) {
1314 cert = wd.word->raw_choice->certainty();
1315 float rat = wd.word->raw_choice->rating();
1316 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1317 *best_str = wd.word->raw_choice->unichar_string();
1327 #endif // ndef DISABLED_LEGACY_ENGINE 1340 #ifdef DISABLED_LEGACY_ENGINE 1345 #endif // def DISABLED_LEGACY_ENGINE 1351 clock_t start_t = clock();
1354 tprintf(
"%s word with lang %s at:",
1355 word->
done ?
"Already done" :
"Processing",
1365 int sub = sub_langs_.size();
1366 if (most_recently_used_ !=
this) {
1368 for (sub = 0; sub < sub_langs_.size() &&
1369 most_recently_used_ != sub_langs_[sub]; ++sub) {}
1372 *word_data, recognizer, debug, &word_data->
lang_words[sub], &best_words);
1373 Tesseract* best_lang_tess = most_recently_used_;
1374 if (!WordsAcceptable(best_words)) {
1376 if (most_recently_used_ !=
this &&
1380 best_lang_tess =
this;
1382 for (
int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1384 if (most_recently_used_ != sub_langs_[i] &&
1388 best_lang_tess = sub_langs_[i];
1392 most_recently_used_ = best_lang_tess;
1393 if (!best_words.
empty()) {
1394 if (best_words.
size() == 1 && !best_words[0]->combination) {
1399 word_data->
word = best_words.
back();
1406 clock_t ocr_t = clock();
1408 tprintf(
"%s (ocr took %.2f sec)\n",
1410 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1423 ROW* row = word_data.
row;
1427 #ifndef ANDROID_BUILD 1428 #ifdef DISABLED_LEGACY_ENGINE 1433 #endif // def DISABLED_LEGACY_ENGINE 1436 if (!out_words->
empty())
1445 #ifndef DISABLED_LEGACY_ENGINE 1452 #endif // ndef DISABLED_LEGACY_ENGINE 1454 #endif // ndef ANDROID_BUILD 1456 #ifndef DISABLED_LEGACY_ENGINE 1477 #endif // ndef DISABLED_LEGACY_ENGINE 1483 tprintf(
"New XHT Match:%s = %s ",
1494 new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
1495 accept_new_word ?
"ACCEPTED" :
"");
1498 #ifndef DISABLED_LEGACY_ENGINE 1506 if (original_misfits == 0)
1508 float baseline_shift = 0.0f;
1510 if (baseline_shift != 0.0f) {
1516 if (original_misfits > 0) {
1517 float new_baseline_shift;
1539 float baseline_shift,
float new_x_ht,
1541 bool accept_new_x_ht =
false;
1558 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1560 new_misfits, new_x_ht);
1561 tprintf(
"Old rating= %f, certainty=%f, new=%f, %f\n",
1567 accept_new_x_ht = new_misfits < original_misfits &&
1576 if (accept_new_x_ht) {
1583 #endif // ndef DISABLED_LEGACY_ENGINE 1598 #ifndef DISABLED_LEGACY_ENGINE 1599 ROW* row = word_data.
row;
1626 #ifndef GRAPHICS_DISABLED 1640 #endif // ndef DISABLED_LEGACY_ENGINE 1643 #ifndef DISABLED_LEGACY_ENGINE 1661 tprintf(
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" 1678 #endif // ndef DISABLED_LEGACY_ENGINE 1689 if (choice !=
nullptr) {
1690 if (best_choice ==
nullptr || choice->
rating() < best_choice->
rating())
1691 best_choice = choice;
1700 static void CorrectRepcharChoices(
BLOB_CHOICE* blob_choice,
1706 if (choice ==
nullptr) {
1708 choice_it.add_before_stay_put(
new BLOB_CHOICE(*blob_choice));
1712 for (
int i = 0; i < word->
length(); ++i) {
1731 for (
int i = 0; i < word.
length(); ++i) {
1737 int max_count = rep_ch.MaxCount(&maxch_id);
1739 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1740 if (best_choice ==
nullptr) {
1741 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1751 C_BLOB* prev_blob = blob_it.data();
1752 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1753 C_BLOB* blob = blob_it.data();
1755 gap -= prev_blob->bounding_box().right();
1760 CorrectRepcharChoices(best_choice, word_res);
1765 const UNICHARSET& char_set,
const char *s,
const char *lengths) {
1768 int leading_punct_count;
1769 int upper_count = 0;
1770 int hyphen_pos = -1;
1773 if (strlen (lengths) > 20)
1779 offset += lengths[i++];
1780 leading_punct_count = i;
1783 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[i])) {
1784 offset += lengths[i++];
1787 if (upper_count > 1) {
1791 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[i])) {
1792 offset += lengths[i++];
1800 if (lengths[i] == 1 && s[offset] ==
'-') {
1802 offset += lengths[i++];
1803 if (s[offset] !=
'\0') {
1804 while ((s[offset] !=
'\0') &&
1806 offset += lengths[i++];
1808 if (i < hyphen_pos + 3)
1813 if (lengths[i] == 1 && (s[offset] ==
'\'') &&
1814 lengths[i + 1] == 1 && (s[offset + lengths[i]] ==
's')) {
1815 offset += lengths[i++];
1816 offset += lengths[i++];
1819 if (upper_count > 0)
1826 if (lengths[i] == 1 && s[offset] !=
'\0' &&
1828 offset += lengths[i++];
1829 if (lengths[i] == 1 && s[offset] !=
'\0' && i > 0 &&
1830 s[offset - lengths[i - 1]] != s[offset] &&
1832 offset += lengths[i++];
1834 if (s[offset] !=
'\0')
1843 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1845 while (s[offset] !=
'\0' &&
1847 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1848 offset += lengths[i++];
1849 offset += lengths[i++];
1852 else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1854 while (s[offset] !=
'\0' &&
1856 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1857 offset += lengths[i++];
1858 offset += lengths[i++];
1861 if (s[offset] !=
'\0')
1869 bool show_map_detail =
false;
1886 tprintf (
"classify_word_pass1 start\n");
1890 tprintf (
"make_reject_map: initial map");
1893 tprintf (
"make_reject_map: after NN");
1896 tprintf (
"classify_word_pass2 - START");
1899 tprintf (
"classify_word_pass2 - Pre Xht");
1902 tprintf (
"classify_word_pass2 - END");
1903 show_map_detail =
true;
1915 tprintf (
"After Poor quality rejection");
1918 tprintf (
"unrej_good_quality_words - START");
1921 tprintf (
"unrej_good_quality_words - END");
1924 tprintf (
"Write results pass");
1925 show_map_detail =
true;
1932 if (show_map_detail) {
1940 tprintf(
"null best choice\n");
1943 tprintf (
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
1955 static void find_modal_font(
1964 font = (int16_t) fonts->
mode ();
1967 *font_count =
count < INT8_MAX ?
count : INT8_MAX;
1968 fonts->
add (font, -*font_count);
1987 #ifndef DISABLED_LEGACY_ENGINE 1989 if (fontinfo_size == 0)
return;
1997 tprintf(
"Examining fonts in %s\n",
2002 if (choice ==
nullptr)
continue;
2004 for (
int f = 0; f < fonts.
size(); ++f) {
2005 const int fontinfo_id = fonts[f].fontinfo_id;
2006 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
2007 font_total_score[fontinfo_id] += fonts[f].score;
2012 int score1 = 0, score2 = 0;
2013 int16_t font_id1 = -1, font_id2 = -1;
2014 for (
int f = 0; f < fontinfo_size; ++f) {
2016 tprintf(
"Font %s, total score = %d\n",
2019 if (font_total_score[f] > score1) {
2021 font_id2 = font_id1;
2022 score1 = font_total_score[f];
2024 }
else if (font_total_score[f] > score2) {
2025 score2 = font_total_score[f];
2039 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2044 tprintf(
"Word modal font=%s, score=%d. No 2nd choice\n",
2051 #endif // ndef DISABLED_LEGACY_ENGINE 2063 STATS doc_fonts(0, font_table_size_);
2068 word = page_res_it.
word();
2077 int8_t doc_font_count;
2078 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2079 if (doc_font_count == 0)
2082 const FontInfo* modal_font =
nullptr;
2085 word = page_res_it.
word();
2100 word = page_res_it.
word();
2104 if (!(
count == length || (length > 3 &&
count >= length * 3 / 4))) {
2121 if (word->best_choices.singleton())
2125 if (word->tesseract->getDict().valid_word(*best) != 0)
2128 WERD_CHOICE_IT choice_it(&word->best_choices);
2129 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2130 choice_it.forward()) {
2132 if (word->tesseract->getDict().valid_word(*alternate)) {
2135 tprintf(
"Dictionary correction replaces best choice '%s' with '%s'\n",
2140 word->ReplaceBestChoice(alternate);
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
const UNICHARSET & GetUnicharset() const
BLOCK_RES * block() const
void set_unichar_id(UNICHAR_ID unichar_id, int index)
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
WERD_CHOICE_LIST best_choices
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
void ConsumeWordResults(WERD_RES *word)
double classify_max_certainty_margin
void font_recognition_pass(PAGE_RES *page_res)
bool get_islower(UNICHAR_ID unichar_id) const
int32_t pile_count(int32_t value) const
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
bool tessedit_minimal_rejection
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
GenericVector< STRING > misadaption_log
WERD_CHOICE * prev_word_best_choice_
void set_global_subloc_code(int loc_code)
Dict & getDict() override
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
bool tessedit_fix_hyphens
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
bool tessedit_rejection_debug
void Add(T value, int count)
UnicityTable< FontInfo > & get_fontinfo_table()
void * cancel_this
monitor-aware progress callback
void tess_add_doc_word(WERD_CHOICE *word_choice)
void ZoomToRectangle(int x1, int y1, int x2, int y2)
int tessedit_ocr_engine_mode
const char * string() const
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
double noise_cert_disjoint
void full_print(FILE *fp)
TBOX bounding_box() const
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
bool classify_bln_numeric_mode
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool x_overlap(const TBOX &box) const
TBOX bounding_box() const
bool tessedit_word_for_word
bool tessedit_debug_quality_metrics
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
bool tessedit_timing_debug
bool tessedit_dump_choices
bool AdaptableWord(WERD_RES *word)
bool tessedit_enable_bigram_correction
const FontInfo * fontinfo
void set_word_fonts(WERD_RES *word)
bool AdaptiveClassifierIsEmpty() const
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
bool script_has_xheight() const
bool AdaptiveClassifierIsFull() const
void CopyTruth(const BlamerBundle &other)
C_BLOB_LIST * rej_cblob_list()
bool tessedit_minimal_rej_pass1
volatile int8_t ocr_alive
true if not last
bool SubAndSuperscriptFix(WERD_RES *word_res)
static const char * IncorrectReasonName(IncorrectResultReason irr)
WERD_CHOICE shallow_copy(int start, int end) const
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
char * chs_trailing_punct2
void SetScriptPositions()
int8_t fontinfo_id2_count
PointerVector< WERD_RES > lang_words
int quality_min_initial_alphas_reqd
float max_x_height() const
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
int16_t word_blob_quality(WERD_RES *word, ROW *row)
WERD_RES * restart_page()
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
bool tessedit_enable_doc_dict
bool flag(WERD_FLAGS mask) const
bool tessedit_test_adaption
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
void GetNonSuperscriptSpan(int *start, int *end) const
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
#define LOC_WRITE_RESULTS
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool word_adaptable(WERD_RES *word, uint16_t mode)
void script_pos_pass(PAGE_RES *page_res)
void PrintBestChoices() const
void init_to_size(int size, const T &t)
bool textord_use_cjk_fp_model
bool major_overlap(const TBOX &box) const
int16_t doc_good_char_quality
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
bool major_x_overlap(const TBOX &box) const
bool deadline_exceeded() const
IncorrectResultReason incorrect_result_reason() const
STRING debug_str(UNICHAR_ID id) const
bool right_to_left() const
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
const double kMinRefitXHeightFraction
bool tessedit_fix_fuzzy_spaces
const GenericVector< tesseract::ScoredFont > & fonts() const
double classify_max_rating_ratio
FCOORD classify_rotation() const
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
void plot(ScrollView *window)
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
POLY_BLOCK * poly_block() const
void BestChoiceToCorrectText()
int tessedit_bigram_debug
bool tessedit_reject_bad_qual_wds
UNICHAR_ID unichar_id(int index) const
DLLSYM void tprintf(const char *format,...)
int16_t word_outline_errs(WERD_RES *word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
C_BLOB_LIST * cblob_list()
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
void add(int32_t value, int32_t count)
float min_x_height() const
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
bool tessedit_debug_fonts
void add_str_int(const char *str, int number)
void MakeCurrentWordFuzzy()
void fix_rep_char(PAGE_RES_IT *page_res_it)
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
bool tess_acceptable_word(WERD_RES *word)
tesseract::Tesseract * tesseract
EXTERN ScrollView * fx_win
void bigram_correction_pass(PAGE_RES *page_res)
void SwitchAdaptiveClassifier()
UnicityTable< FontInfo > fontinfo_table_
const STRING & misadaption_debug() const
static int SortByXMiddle(const void *v1, const void *v2)
char * chs_trailing_punct1
bool contains(const char c) const
void SetupWordPassN(int pass_n, WordData *word)
bool tessedit_enable_dict_correction
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
void set_global_loc_code(int loc_code)
const STRING debug_string() const
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
TBOX bounding_box() const
int CountMisfitTops(WERD_RES *word_res)
GenericVector< int > blame_reasons
const char *const kBackUpConfigFile
bool contains(const FCOORD pt) const
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
int tessedit_pageseg_mode
const UNICHARSET * uch_set
C_OUTLINE_LIST * out_list()
static const double kXHeightCapRatio
BlamerBundle * blamer_bundle
CANCEL_FUNC cancel
for errcode use
void InitForRetryRecognition(const WERD_RES &source)
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
const STRING & unichar_string() const
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
double noise_cert_basechar
bool check_debug_pt(WERD_RES *word, int location)
void StartBackupAdaptiveClassifier()
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
double quality_outline_pc
bool get_isupper(UNICHAR_ID unichar_id) const
static void LastChanceBlame(bool debug, WERD_RES *word)
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
bool wordrec_debug_blamer
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
void ReplaceBestChoice(WERD_CHOICE *choice)
void blamer_pass(PAGE_RES *page_res)
int tessedit_tess_adaption_mode
bool recog_interactive(PAGE_RES_IT *pr_it)
bool tessedit_display_outwords
int multilang_debug_level
void dictionary_correction_pass(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
bool top_bottom_useful() const
int16_t progress
chars in this buffer(0)
UNICHAR_ID unichar_id() const
void PrerecAllWordsPar(const GenericVector< WordData > &words)
WERD_CHOICE * best_choice
tesseract::BoxWord * box_word
void LearnWord(const char *fontname, WERD_RES *word)
int32_t get_total() const
static C_BLOB * deep_copy(const C_BLOB *src)
bool poly_allow_detailed_fx
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
void rej_word_bad_quality()
BLOB_CHOICE * GetBlobChoice(int index) const
void initialise(int16_t length)
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
int32_t x_height() const
return xheight
const FontInfo * fontinfo2
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)