33 #define PERFECT_WERDS 999
34 #define MAXSPACING 128
51 BLOCK_RES_IT block_res_it;
52 ROW_RES_IT row_res_it;
53 WERD_RES_IT word_res_it_from;
54 WERD_RES_IT word_res_it_to;
56 WERD_RES_LIST fuzzy_space_words;
58 BOOL8 prevent_null_wd_fixsp;
63 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64 block_res_it.forward()) {
65 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67 row_res_it.forward()) {
68 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69 while (!word_res_it_from.at_last()) {
70 word_res = word_res_it_from.data();
71 while (!word_res_it_from.at_last() &&
73 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
74 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
76 block_res_it.data()->block);
77 word_res = word_res_it_from.forward();
79 if (monitor !=
NULL) {
81 monitor->
progress = 90 + 5 * word_index / word_count;
89 if (!word_res_it_from.at_last()) {
90 word_res_it_to = word_res_it_from;
91 prevent_null_wd_fixsp =
95 word_res_it_to.forward();
97 if (monitor !=
NULL) {
99 monitor->
progress = 90 + 5 * word_index / word_count;
105 while (!word_res_it_to.at_last () &&
106 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
107 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
111 prevent_null_wd_fixsp =
TRUE;
112 word_res = word_res_it_to.forward();
117 prevent_null_wd_fixsp =
TRUE;
118 if (prevent_null_wd_fixsp) {
119 word_res_it_from = word_res_it_to;
121 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
124 row_res_it.data()->row,
125 block_res_it.data()->block);
126 new_length = fuzzy_space_words.length();
127 word_res_it_from.add_list_before(&fuzzy_space_words);
129 !word_res_it_from.at_last() && new_length > 0;
131 word_res_it_from.forward();
138 block_res_it.data()->block);
149 WERD_RES_LIST current_perm;
154 dump_words(best_perm, best_score, 1, improved);
159 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
162 dump_words(current_perm, current_score, 2, improved);
163 if (current_score > best_score) {
166 best_score = current_score;
172 dump_words(best_perm, best_score, 3, improved);
178 WERD_RES_IT src_it(&src_list);
179 WERD_RES_IT new_it(&new_list);
183 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
184 src_wd = src_it.data();
189 new_it.add_after_then_move(new_wd);
198 WERD_RES_IT word_it(&words);
203 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204 word = word_it.data();
206 WordData word_data(block, row, word);
241 WERD_RES_IT word_res_it(&word_res_list);
242 inT16 total_score = 0;
243 inT16 word_count = 0;
244 inT16 done_word_count = 0;
249 inT16 prev_word_score = 0;
254 BOOL8 current_word_ok_so_far;
255 STRING punct_chars =
"!\"`',.:;";
261 word = word_res_it.data();
265 total_score += prev_word_score;
270 prev_char_digit =
FALSE;
271 prev_word_done =
FALSE;
279 current_word_ok_so_far =
FALSE;
281 (prev_char_digit && (
287 total_score += prev_word_score;
290 current_word_ok_so_far = word_done;
293 if (current_word_ok_so_far) {
294 prev_word_done =
TRUE;
295 prev_word_score = word_len;
297 prev_word_done =
FALSE;
303 for (i = 0, prev_char_1 =
FALSE; i < word_len; i++) {
305 if (prev_char_1 || (current_char_1 && (i > 0)))
307 prev_char_1 = current_char_1;
313 for (i = 0, offset = 0, prev_char_punct =
FALSE; i < word_len;
317 if (prev_char_punct || (current_char_punct && i > 0))
319 prev_char_punct = current_char_punct;
323 for (i = 0, offset = 0; i < word_len - 1;
332 word_res_it.forward();
333 }
while (word_res_it.data()->part_of_combo);
334 }
while (!word_res_it.at_first());
335 total_score += prev_word_score;
338 if (done_word_count == word_count)
348 for (i = 0, offset = 0; i < char_position;
374 WERD_RES_IT word_it(&words);
375 WERD_RES_IT prev_word_it(&words);
385 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
386 word = word_it.data();
390 gap = box.
left() - prev_right;
394 prev_right = box.
right();
399 word_it.set_to_list(&words);
401 for (; (prev_right == -
MAX_INT16) || !word_it.at_first();
403 word = word_it.data();
407 gap = box.
left() - prev_right;
408 if (gap <= min_gap) {
409 prev_word = prev_word_it.data();
415 copy_word =
new WERD;
416 *copy_word = *(prev_word->
word);
422 prev_word_it.add_before_then_move(combo);
429 delete word_it.extract();
438 prev_word_it = word_it;
441 prev_right = box.
right();
452 WERD_RES_IT word_res_it(&perm);
457 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
458 word_res_it.forward()) {
459 if (!word_res_it.data()->part_of_combo) {
461 word_res_it.data()->best_choice->unichar_string();
470 tprintf(
"EXTRACTED (%d): \"", score);
473 tprintf(
"TESTED (%d): \"", score);
476 tprintf(
"RETURNED (%d): \"", score);
480 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
481 word_res_it.forward()) {
482 if (!word_res_it.data()->part_of_combo) {
484 word_res_it.data()->best_choice->unichar_string().string(),
485 (int)word_res_it.data()->best_choice->permuter());
489 }
else if (improved) {
491 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
492 word_res_it.forward()) {
493 if (!word_res_it.data()->part_of_combo) {
495 word_res_it.data()->best_choice->unichar_string().string(),
496 (int)word_res_it.data()->best_choice->permuter());
539 WERD_RES_LIST sub_word_list;
540 WERD_RES_IT sub_word_list_it(&sub_word_list);
545 word_res = word_res_it.data();
557 tprintf(
"FP fixspace working on \"%s\"\n",
561 sub_word_list_it.add_after_stay_put(word_res_it.extract());
563 new_length = sub_word_list.length();
564 word_res_it.add_list_before(&sub_word_list);
565 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
566 word_res_it.forward();
573 WERD_RES_IT best_perm_it(&best_perm);
574 WERD_RES_LIST current_perm;
575 WERD_RES_IT current_perm_it(¤t_perm);
582 dump_words(best_perm, best_score, 1, improved);
584 old_word_res = best_perm_it.data();
593 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
596 dump_words(current_perm, current_score, 2, improved);
597 if (current_score > best_score) {
600 best_score = current_score;
607 dump_words(best_perm, best_score, 3, improved);
617 WERD_RES_IT word_it(&words);
618 WERD_RES_IT worst_word_it;
619 float worst_noise_score = 9999;
620 int worst_blob_index = -1;
625 C_BLOB_IT rej_cblob_it;
626 C_BLOB_LIST new_blob_list;
627 C_BLOB_IT new_blob_it;
628 C_BLOB_IT new_rej_cblob_it;
630 inT16 start_of_noise_blob;
633 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
635 if (blob_index > -1 && worst_noise_score > noise_score) {
636 worst_noise_score = noise_score;
637 worst_blob_index = blob_index;
638 worst_word_it = word_it;
641 if (worst_blob_index < 0) {
648 word_res = worst_word_it.data();
652 new_blob_it.set_to_list(&new_blob_list);
654 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
655 new_blob_it.add_after_then_move(blob_it.extract());
657 start_of_noise_blob = blob_it.data()->bounding_box().left();
658 delete blob_it.extract();
660 new_word =
new WERD(&new_blob_list, word_res->
word);
668 (!rej_cblob_it.empty() &&
669 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
670 rej_cblob_it.forward()) {
671 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
676 worst_word_it.add_before_then_move(new_word_res);
682 float *worst_noise_score) {
683 float noise_score[512];
705 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
712 noise_score[i] = non_noise_limit;
717 tprintf(
"%1.1f ", noise_score[i]);
726 if (noise_score[i] >= non_noise_limit) {
730 if (non_noise_count < fixsp_non_noise_limit)
738 if (noise_score[i] >= non_noise_limit) {
742 if (non_noise_count < fixsp_non_noise_limit)
747 if (min_noise_blob > max_noise_blob)
750 *worst_noise_score = small_limit;
751 worst_noise_blob = -1;
752 for (i = min_noise_blob; i <= max_noise_blob; i++) {
753 if (noise_score[i] < *worst_noise_score) {
754 worst_noise_blob = i;
755 *worst_noise_score = noise_score[i];
763 inT16 outline_count = 0;
765 inT16 largest_outline_dimension = 0;
769 box = ol->bounding_box();
771 max_dimension = box.
height();
773 max_dimension = box.
width();
776 if (largest_outline_dimension < max_dimension)
777 largest_outline_dimension = max_dimension;
780 if (outline_count > 5) {
782 largest_outline_dimension *= 2;
789 largest_outline_dimension /= 2;
792 return largest_outline_dimension;
803 tprintf(
"Blob count: %d (word); %d/%d (rebuild word)\n",
809 if (show_map_detail) {
818 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
832 WERD_RES_IT word_it(&word_res_list);
839 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
840 word = word_it.data();
void join_on(WERD *other)
tesseract::BoxWord * box_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
BLOCK_RES_LIST block_res_list
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
char * numeric_punctuation
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
WERD_CHOICE * best_choice
void transform_to_next_perm(WERD_RES_LIST &words)
void fixspace_dbg(WERD_RES *word)
const STRING & unichar_lengths() const
TBOX bounding_box() const
int debug_fix_space_level
inT16 safe_dict_word(const WERD_RES *werd_res)
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
static WERD_RES * deep_copy(const WERD_RES *src)
WERD_CHOICE * prev_word_best_choice_
const STRING & unichar_string() const
bool get_isdigit(UNICHAR_ID unichar_id) const
void full_print(FILE *fp)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
BOOL8 check_debug_pt(WERD_RES *word, int location)
const UNICHAR_ID unichar_id(int index) const
bool deadline_exceeded() const
const UNICHARSET * uch_set
char * conflict_set_I_l_1
int fixsp_non_noise_limit
const int kBlnBaselineOffset
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void SetupWordPassN(int pass_n, WordData *word)
double fixsp_small_outlines_size
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
int c_blob_comparator(const void *blob1p, const void *blob2p)
float blob_noise_score(TBLOB *blob)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
GenericVector< TBLOB * > blobs
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
BOOL8 flag(WERD_FLAGS mask) const
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
TBOX bounding_box() const
void copy_on(WERD_RES *word_res)
const char * string() const
void set_flag(WERD_FLAGS mask, BOOL8 value)
bool tessedit_prefer_joined_punct
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
void set_blanks(uinT8 new_blanks)
C_BLOB_LIST * rej_cblob_list()
C_BLOB_LIST * cblob_list()
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
BOOL8 contains(const char c) const