22 #include "allheaders.h"
73 #ifndef DISABLED_LEGACY_ENGINE
// Resets the text of every word reachable from block_list to the empty
// string.
//
// Walks three nested lists: each block in block_list, each row in that
// block's row_list(), and each word in that row's word_list(). For every
// word visited, set_text("") is called.
//
// block_list: the list of blocks to walk; it is only traversed here, the
//             only mutation visible is the per-word set_text call.
74 static void clear_any_old_text(BLOCK_LIST *block_list) {
75 BLOCK_IT block_it(block_list);
// Outer loop: one pass over all blocks.
76 for (block_it.mark_cycle_pt();
77 !block_it.cycled_list(); block_it.forward()) {
78 ROW_IT row_it(block_it.data()->row_list());
// Middle loop: every row of the current block.
79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80 WERD_IT word_it(row_it.data()->word_list());
// Inner loop: every word of the current row gets its text cleared.
81 for (word_it.mark_cycle_pt();
82 !word_it.cycled_list(); word_it.forward()) {
83 word_it.data()->set_text(
"");
110 bool find_segmentation,
111 BLOCK_LIST *block_list) {
119 const int box_count = boxes.
size();
120 int box_failures = 0;
124 PAGE_RES* page_res = find_segmentation ?
126 clear_any_old_text(block_list);
128 for (
int i = 0; i < box_count; i++) {
129 bool foundit =
false;
130 if (page_res !=
nullptr) {
132 (i == 0) ?
nullptr : &boxes[i - 1],
134 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
135 full_texts[i].c_str());
138 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
144 "FAILURE! Couldn't find a matching blob");
148 if (page_res ==
nullptr) {
156 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
157 if (box_failures > 0)
158 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
163 #endif // ndef DISABLED_LEGACY_ENGINE
// Returns the median of the values accumulated in a STATS object while
// walking every row of every block in block_list.
//
// The STATS range is constructed as (0, height of the bounding box of the
// block the iterator is positioned on right after construction).
// The statements inside the row loop are not visible in this excerpt;
// presumably each row's x-height is added to xheights there -- TODO confirm.
//
// NOTE(review): block_it.data() is dereferenced to size the STATS object
// before any check that block_list contains a block. Confirm that callers
// never pass an empty list, or that data() is safe on an empty list.
166 static double MedianXHeight(BLOCK_LIST *block_list) {
167 BLOCK_IT block_it(block_list);
168 STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
// Visit every block, then every row within it.
169 for (block_it.mark_cycle_pt();
170 !block_it.cycled_list(); block_it.forward()) {
171 ROW_IT row_it(block_it.data()->row_list());
172 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
176 return xheights.median();
182 const double median_xheight = MedianXHeight(block_list);
185 BLOCK_IT b_it(block_list);
186 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187 BLOCK* block = b_it.data();
189 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190 ROW* row = r_it.data();
191 const double diff = fabs(row->
x_height() - median_xheight);
192 if (diff > max_deviation) {
194 tprintf(
"row xheight=%g, but median xheight = %g\n",
203 #ifndef DISABLED_LEGACY_ENGINE
208 BLOCK_LIST *block_list) {
211 BLOCK_IT b_it(block_list);
212 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213 BLOCK* block = b_it.data();
215 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216 ROW* row = r_it.data();
218 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219 WERD* word = w_it.data();
221 delete w_it.extract();
229 auto* page_res =
new PAGE_RES(
false, block_list,
nullptr);
232 while ((word_res = pr_it.word()) !=
nullptr) {
234 pr_it.row()->row, word_res);
256 tprintf(
"Maximally chopping word at:");
261 auto rating = static_cast<float>(INT8_MAX);
276 const double e = exp(1.0);
278 int right_chop_index = 0;
281 SEAM* seam =
nullptr;
283 &blob_number)) !=
nullptr) {
285 BLOB_CHOICE* left_choice = blob_choices[blob_number];
286 rating = left_choice->
rating() / e;
290 auto* right_choice =
new BLOB_CHOICE(++right_chop_index,
291 rating - 0.125f, -rating, -1,
293 blob_choices.
insert(right_choice, blob_number + 1);
// Computes a mismatch score between two boxes from their areas and an
// overlap area:
//
//   (a - overlap_area) * (b - overlap_area) / (a * b)
//
// where a = box1.area() and b = box2.area(). The leading 1.0 forces the
// whole expression to be evaluated in double rather than int.
// The score is 0 when overlap_area equals either a or b, because one of the
// two factors in the numerator is then zero.
//
// overlap_area is defined on a line not visible in this excerpt; presumably
// it is the area of the intersection of box1 and box2 -- TODO confirm.
//
// NOTE(review): the result is divided by both a and b with no guard visible
// here. If either box can have zero area the result would be inf or NaN;
// confirm that callers only pass boxes with non-zero area.
311 static double BoxMissMetric(
const TBOX& box1,
const TBOX& box2) {
313 const int a = box1.
area();
314 const int b = box2.
area();
316 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
330 const TBOX& box,
const TBOX* next_box,
331 const char* correct_text) {
333 tprintf(
"\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
337 for (word_res = page_res_it.word(); word_res !=
nullptr;
338 word_res = page_res_it.forward()) {
346 for (
int i = 0; i < word_len; ++i) {
349 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
353 if (word_res->
correct_text[i + blob_count].length() > 0)
355 if (next_box !=
nullptr) {
356 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
357 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
361 tprintf(
"Current miss metric = %g, next = %g\n",
362 current_box_miss_metric, next_box_miss_metric);
364 if (current_box_miss_metric > next_box_miss_metric)
367 char_box += blob_box;
369 if (blob_count > 0) {
371 tprintf(
"Index [%d, %d) seem good.\n", i, i + blob_count);
374 ((next_box !=
nullptr && box.
x_gap(*next_box) < -3)||
375 (prev_box !=
nullptr && prev_box->
x_gap(box) < -3))) {
386 tprintf(
"%d Blobs match: blob box:", blob_count);
390 if (next_box !=
nullptr) {
397 for (
int j = 1; j < blob_count; ++j) {
432 const TBOX& box,
const TBOX* next_box,
433 const char* correct_text) {
435 tprintf(
"\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
437 WERD* new_word =
nullptr;
438 BLOCK_IT b_it(block_list);
439 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
440 BLOCK* block = b_it.data();
444 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
445 ROW* row = r_it.data();
449 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
450 WERD* word = w_it.data();
455 if (word->
text() !=
nullptr && word->
text()[0] !=
'\0')
460 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
462 C_BLOB* blob = blob_it.data();
466 if (next_box !=
nullptr) {
467 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
468 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
472 tprintf(
"Current miss metric = %g, next = %g\n",
473 current_box_miss_metric, next_box_miss_metric);
475 if (current_box_miss_metric > next_box_miss_metric)
483 if (next_box !=
nullptr) {
488 if (new_word ==
nullptr) {
492 w_it.add_to_end(new_word);
494 C_BLOB_IT new_blob_it(new_word->
cblob_list());
495 new_blob_it.add_to_end(blob_it.extract());
501 return new_word !=
nullptr;
509 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.forward()) {
511 if (word->
text() ==
nullptr || word->
text()[0] ==
'\0')
516 tprintf(
"APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
518 pr_it.DeleteCurrentWord();
522 tprintf(
"APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
524 pr_it.DeleteCurrentWord();
530 #endif // ndef DISABLED_LEGACY_ENGINE
536 for (
int step = 0; *utf8 !=
'\0'; utf8 += step) {
537 const char* next_space = strchr(utf8,
' ');
538 if (next_space ==
nullptr)
539 next_space = utf8 + strlen(utf8);
540 step = next_space - utf8;
542 if (class_id == INVALID_UNICHAR_ID) {
545 while (utf8[step] ==
' ')
552 #ifndef DISABLED_LEGACY_ENGINE
567 for (
int i = 0; i < word_length; ++i) {
568 for (
int j = 1; j <=
kMaxGroupSize && i + j <= word_length; ++j) {
570 word_res->
seam_array, i, i + j - 1,
"Applybox",
576 choices[i].push_back(match_result);
584 float best_rating = 0.0f;
586 &search_segmentation, &best_rating, &word_res->
best_state);
587 for (
int i = 0; i < word_length; ++i)
588 choices[i].delete_data_pointers();
610 for (
int i = 0; i < target_text.
size(); ++i) {
632 int choices_pos,
int choices_length,
639 for (
int length = 1; length <= choices[choices_pos].
size(); ++length) {
641 float choice_rating = 0.0f;
643 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
644 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
645 choice_it.forward()) {
647 choice_rating = choice->
rating();
649 if (class_id == target_text[text_index]) {
653 if (class_id < table.size() && table[class_id] !=
nullptr) {
654 AmbigSpec_IT spec_it(table[class_id]);
655 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
657 const AmbigSpec *ambig_spec = spec_it.data();
659 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
660 ambig_spec->correct_ngram_id == target_text[text_index])
663 if (!spec_it.cycled_list())
667 if (choice_it.cycled_list())
670 if (choices_pos + length == choices_length &&
671 text_index + 1 == target_text.
size()) {
674 tprintf(
"Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
675 rating + choice_rating, *best_rating, segmentation->
size(),
676 best_segmentation->
size());
678 if (best_segmentation->
empty() || rating + choice_rating < *best_rating) {
679 *best_segmentation = *segmentation;
680 *best_rating = rating + choice_rating;
682 }
else if (choices_pos + length < choices_length &&
683 text_index + 1 < target_text.
size()) {
685 tprintf(
"Match found for %d=%s:%s, at %d+%d, recursing...\n",
686 target_text[text_index],
688 choice_it.data()->unichar_id() == target_text[text_index]
690 choices_pos, length);
692 SearchForText(choices, choices_pos + length, choices_length, target_text,
693 text_index + 1, rating + choice_rating, segmentation,
694 best_rating, best_segmentation);
696 tprintf(
"End recursion for %d=%s\n", target_text[text_index],
709 int ok_blob_count = 0;
710 int bad_blob_count = 0;
711 int ok_word_count = 0;
712 int unlabelled_words = 0;
715 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.forward()) {
720 for (
int c = 0; c < blob_count; ++c) {
728 word_choice->append_unichar_id_space_allocated(
729 INVALID_UNICHAR_ID, word_res->
best_state[c], 1.0f, -1.0f);
731 if (ok_in_word > 0) {
732 ok_blob_count += ok_in_word;
739 tprintf(
"APPLY_BOXES: Unlabelled word at :");
742 pr_it.DeleteCurrentWord();
746 pr_it.restart_page();
747 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.forward()) {
755 tprintf(
" Found %d good blobs.\n", ok_blob_count);
756 if (bad_blob_count > 0) {
757 tprintf(
" Leaving %d unlabelled blobs in %d words.\n",
758 bad_blob_count, ok_word_count);
760 if (unlabelled_words > 0)
761 tprintf(
" %d remaining unlabelled words deleted.\n", unlabelled_words);
765 #endif // ndef DISABLED_LEGACY_ENGINE
769 const char *box_ch,
const char *err_msg) {
770 tprintf(
"APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
771 boxfile_lineno + 1, box_ch,
778 for (
WERD_RES *word_res = pr_it.
word(); word_res !=
nullptr;
779 word_res = pr_it.forward()) {
788 choice->append_unichar_id_space_allocated(char_id,
798 #ifndef DISABLED_LEGACY_ENGINE
806 for (
WERD_RES *word_res = pr_it.
word(); word_res !=
nullptr;
807 word_res = pr_it.forward()) {
811 tprintf(
"Generated training data for %d words\n", word_count);
814 #endif // ndef DISABLED_LEGACY_ENGINE