23 #include "allheaders.h" 25 #ifndef DISABLED_LEGACY_ENGINE 77 #ifndef DISABLED_LEGACY_ENGINE 78 static void clear_any_old_text(BLOCK_LIST *block_list) {
79 BLOCK_IT block_it(block_list);
80 for (block_it.mark_cycle_pt();
81 !block_it.cycled_list(); block_it.forward()) {
82 ROW_IT row_it(block_it.data()->row_list());
83 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
84 WERD_IT word_it(row_it.data()->word_list());
85 for (word_it.mark_cycle_pt();
86 !word_it.cycled_list(); word_it.forward()) {
87 word_it.data()->set_text(
"");
114 bool find_segmentation,
115 BLOCK_LIST *block_list) {
123 const int box_count = boxes.
size();
124 int box_failures = 0;
128 PAGE_RES* page_res = find_segmentation ?
130 clear_any_old_text(block_list);
132 for (
int i = 0; i < box_count; i++) {
133 bool foundit =
false;
134 if (page_res !=
nullptr) {
136 (i == 0) ?
nullptr : &boxes[i - 1],
138 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
139 full_texts[i].
string());
142 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
148 "FAILURE! Couldn't find a matching blob");
152 if (page_res ==
nullptr) {
160 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
161 if (box_failures > 0)
162 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
167 #endif // ndef DISABLED_LEGACY_ENGINE 170 static double MedianXHeight(BLOCK_LIST *block_list) {
171 BLOCK_IT block_it(block_list);
172 STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
173 for (block_it.mark_cycle_pt();
174 !block_it.cycled_list(); block_it.forward()) {
175 ROW_IT row_it(block_it.data()->row_list());
176 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
180 return xheights.median();
186 const double median_xheight = MedianXHeight(block_list);
189 BLOCK_IT b_it(block_list);
190 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
191 BLOCK* block = b_it.data();
193 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
194 ROW* row = r_it.data();
195 const double diff = fabs(row->
x_height() - median_xheight);
196 if (diff > max_deviation) {
198 tprintf(
"row xheight=%g, but median xheight = %g\n",
207 #ifndef DISABLED_LEGACY_ENGINE 212 BLOCK_LIST *block_list) {
215 BLOCK_IT b_it(block_list);
216 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
217 BLOCK* block = b_it.data();
219 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
220 ROW* row = r_it.data();
222 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
223 WERD* word = w_it.data();
225 delete w_it.extract();
236 while ((word_res = pr_it.
word()) !=
nullptr) {
238 pr_it.
row()->
row, word_res);
260 tprintf(
"Maximally chopping word at:");
265 float rating =
static_cast<float>(INT8_MAX);
280 const double e = exp(1.0);
282 int right_chop_index = 0;
285 SEAM* seam =
nullptr;
287 &blob_number)) !=
nullptr) {
289 BLOB_CHOICE* left_choice = blob_choices[blob_number];
290 rating = left_choice->
rating() / e;
295 rating - 0.125f, -rating, -1,
297 blob_choices.
insert(right_choice, blob_number + 1);
304 #endif // ndef DISABLED_LEGACY_ENGINE 317 static double BoxMissMetric(
const TBOX& box1,
const TBOX& box2) {
319 const int a = box1.
area();
320 const int b = box2.
area();
322 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
325 #ifndef DISABLED_LEGACY_ENGINE 338 const TBOX& box,
const TBOX* next_box,
339 const char* correct_text) {
341 tprintf(
"\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
345 for (word_res = page_res_it.
word(); word_res !=
nullptr;
346 word_res = page_res_it.
forward()) {
354 for (
int i = 0; i < word_len; ++i) {
357 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
363 if (next_box !=
nullptr) {
364 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
365 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
369 tprintf(
"Current miss metric = %g, next = %g\n",
370 current_box_miss_metric, next_box_miss_metric);
372 if (current_box_miss_metric > next_box_miss_metric)
375 char_box += blob_box;
377 if (blob_count > 0) {
379 tprintf(
"Index [%d, %d) seem good.\n", i, i + blob_count);
382 ((next_box !=
nullptr && box.
x_gap(*next_box) < -3)||
383 (prev_box !=
nullptr && prev_box->
x_gap(box) < -3))) {
394 tprintf(
"%d Blobs match: blob box:", blob_count);
398 if (next_box !=
nullptr) {
405 for (
int j = 1; j < blob_count; ++j) {
440 const TBOX& box,
const TBOX* next_box,
441 const char* correct_text) {
443 tprintf(
"\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
445 WERD* new_word =
nullptr;
446 BLOCK_IT b_it(block_list);
447 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
448 BLOCK* block = b_it.data();
452 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
453 ROW* row = r_it.data();
457 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
458 WERD* word = w_it.data();
463 if (word->
text() !=
nullptr && word->
text()[0] !=
'\0')
468 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
470 C_BLOB* blob = blob_it.data();
474 if (next_box !=
nullptr) {
475 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
476 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
480 tprintf(
"Current miss metric = %g, next = %g\n",
481 current_box_miss_metric, next_box_miss_metric);
483 if (current_box_miss_metric > next_box_miss_metric)
491 if (next_box !=
nullptr) {
496 if (new_word ==
nullptr) {
500 w_it.add_to_end(new_word);
502 C_BLOB_IT new_blob_it(new_word->
cblob_list());
503 new_blob_it.add_to_end(blob_it.extract());
509 return new_word !=
nullptr;
517 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
519 if (word->
text() ==
nullptr || word->
text()[0] ==
'\0')
524 tprintf(
"APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
530 tprintf(
"APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
538 #endif // ndef DISABLED_LEGACY_ENGINE 544 for (
int step = 0; *utf8 !=
'\0'; utf8 += step) {
545 const char* next_space = strchr(utf8,
' ');
546 if (next_space ==
nullptr)
547 next_space = utf8 + strlen(utf8);
548 step = next_space - utf8;
550 if (class_id == INVALID_UNICHAR_ID) {
553 while (utf8[step] ==
' ')
560 #ifndef DISABLED_LEGACY_ENGINE 575 for (
int i = 0; i < word_length; ++i) {
576 for (
int j = 1; j <=
kMaxGroupSize && i + j <= word_length; ++j) {
578 word_res->
seam_array, i, i + j - 1,
"Applybox",
592 float best_rating = 0.0f;
594 &search_segmentation, &best_rating, &word_res->
best_state);
595 for (
int i = 0; i < word_length; ++i)
596 choices[i].delete_data_pointers();
618 for (
int i = 0; i < target_text.
size(); ++i) {
640 int choices_pos,
int choices_length,
647 for (
int length = 1; length <= choices[choices_pos].
size(); ++length) {
649 float choice_rating = 0.0f;
651 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
652 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
653 choice_it.forward()) {
655 choice_rating = choice->
rating();
657 if (class_id == target_text[text_index]) {
661 if (class_id < table.size() && table[class_id] !=
nullptr) {
662 AmbigSpec_IT spec_it(table[class_id]);
663 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
665 const AmbigSpec *ambig_spec = spec_it.data();
667 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
668 ambig_spec->correct_ngram_id == target_text[text_index])
671 if (!spec_it.cycled_list())
675 if (choice_it.cycled_list())
678 if (choices_pos + length == choices_length &&
679 text_index + 1 == target_text.
size()) {
682 tprintf(
"Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
683 rating + choice_rating, *best_rating, segmentation->
size(),
684 best_segmentation->
size());
686 if (best_segmentation->
empty() || rating + choice_rating < *best_rating) {
687 *best_segmentation = *segmentation;
688 *best_rating = rating + choice_rating;
690 }
else if (choices_pos + length < choices_length &&
691 text_index + 1 < target_text.
size()) {
693 tprintf(
"Match found for %d=%s:%s, at %d+%d, recursing...\n",
694 target_text[text_index],
696 choice_it.data()->unichar_id() == target_text[text_index]
698 choices_pos, length);
700 SearchForText(choices, choices_pos + length, choices_length, target_text,
701 text_index + 1, rating + choice_rating, segmentation,
702 best_rating, best_segmentation);
704 tprintf(
"End recursion for %d=%s\n", target_text[text_index],
717 int ok_blob_count = 0;
718 int bad_blob_count = 0;
719 int ok_word_count = 0;
720 int unlabelled_words = 0;
723 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
728 for (
int c = 0; c < blob_count; ++c) {
737 INVALID_UNICHAR_ID, word_res->
best_state[c], 1.0f, -1.0f);
739 if (ok_in_word > 0) {
740 ok_blob_count += ok_in_word;
747 tprintf(
"APPLY_BOXES: Unlabelled word at :");
755 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
763 tprintf(
" Found %d good blobs.\n", ok_blob_count);
764 if (bad_blob_count > 0) {
765 tprintf(
" Leaving %d unlabelled blobs in %d words.\n",
766 bad_blob_count, ok_word_count);
768 if (unlabelled_words > 0)
769 tprintf(
" %d remaining unlabelled words deleted.\n", unlabelled_words);
773 #endif // ndef DISABLED_LEGACY_ENGINE 777 const char *box_ch,
const char *err_msg) {
778 tprintf(
"APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
779 boxfile_lineno + 1, box_ch,
786 for (
WERD_RES *word_res = pr_it.
word(); word_res !=
nullptr;
806 #ifndef DISABLED_LEGACY_ENGINE 814 for (
WERD_RES *word_res = pr_it.
word(); word_res !=
nullptr;
819 tprintf(
"Generated training data for %d words\n", word_count);
822 #endif // ndef DISABLED_LEGACY_ENGINE BLOCK_RES * block() const
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void ReSegmentByClassification(PAGE_RES *page_res)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Dict & getDict() override
bool almost_equal(const TBOX &box, int tolerance) const
TBOX intersection(const TBOX &box) const
const UnicharAmbigs & getUnicharAmbigs() const
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
int tessedit_ocr_engine_mode
const char * string() const
TBOX bounding_box() const
bool classify_bln_numeric_mode
void InsertSeam(int blob_number, SEAM *seam)
void MergeBoxes(int start, int end)
ROW_LIST * row_list()
get rows
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
int x_gap(const TBOX &box) const
void set_text(const char *new_text)
void set_flag(WERD_FLAGS mask, bool value)
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
const TBOX & BlobBox(int index) const
ROW_RES * next_row() const
GenericVector< STRING > correct_text
WERD_RES * restart_page()
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
void set_certainty(float newrat)
void insert(const T &t, int index)
const double kMaxXHeightDeviationFraction
ROW_RES * prev_row() const
void set_x_height(float new_xheight)
const char * text() const
void set_rating(float newrat)
bool textord_use_cjk_fp_model
bool major_overlap(const TBOX &box) const
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
const TBOX & bounding_box() const
bool LogNewRawChoice(WERD_CHOICE *word_choice)
int IntCastRounded(double x)
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
const UnicharAmbigsVector & dang_ambigs() const
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
GenericVector< int > best_state
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
void CloneChoppedToRebuild()
TBOX bounding_box() const
DLLSYM void tprintf(const char *format,...)
C_BLOB_LIST * cblob_list()
GenericVector< SEAM * > seam_array
void TidyUp(PAGE_RES *page_res)
void add(int32_t value, int32_t count)
GenericVector< TBLOB * > blobs
TBOX bounding_box() const
const UNICHARSET * uch_set
const char * id_to_unichar(UNICHAR_ID id) const
bool assume_fixed_pitch_char_segment
BlamerBundle * blamer_bundle
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset)
void CorrectClassifyWords(PAGE_RES *page_res)
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
bool HasAnySplits() const
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
UNICHAR_ID unichar_id() const
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
void PreenXHeights(BLOCK_LIST *block_list)
tesseract::BoxWord * box_word
void LearnWord(const char *fontname, WERD_RES *word)
bool poly_allow_detailed_fx
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
void set_permuter(uint8_t perm)