46 #define PERFECT_WERDS 999 47 #define MAXSPACING 128 58 static int c_blob_comparator(
62 const C_BLOB *blob1 = *
reinterpret_cast<const C_BLOB* const*
>(blob1p);
63 const C_BLOB *blob2 = *
reinterpret_cast<const C_BLOB* const*
>(blob2p);
81 BLOCK_RES_IT block_res_it;
82 ROW_RES_IT row_res_it;
83 WERD_RES_IT word_res_it_from;
84 WERD_RES_IT word_res_it_to;
86 WERD_RES_LIST fuzzy_space_words;
88 bool prevent_null_wd_fixsp;
93 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
94 block_res_it.forward()) {
95 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
96 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
97 row_res_it.forward()) {
98 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
99 while (!word_res_it_from.at_last()) {
100 word_res = word_res_it_from.data();
101 while (!word_res_it_from.at_last() &&
103 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
104 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
106 block_res_it.data()->block);
107 word_res = word_res_it_from.forward();
109 if (monitor !=
nullptr) {
111 monitor->
progress = 90 + 5 * word_index / word_count;
113 (monitor->
cancel !=
nullptr &&
119 if (!word_res_it_from.at_last()) {
120 word_res_it_to = word_res_it_from;
121 prevent_null_wd_fixsp =
125 word_res_it_to.forward();
127 if (monitor !=
nullptr) {
129 monitor->
progress = 90 + 5 * word_index / word_count;
131 (monitor->
cancel !=
nullptr &&
135 while (!word_res_it_to.at_last () &&
136 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
137 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
141 prevent_null_wd_fixsp =
true;
142 word_res = word_res_it_to.forward();
147 prevent_null_wd_fixsp =
true;
148 if (prevent_null_wd_fixsp) {
149 word_res_it_from = word_res_it_to;
151 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
154 row_res_it.data()->row,
155 block_res_it.data()->block);
156 new_length = fuzzy_space_words.length();
157 word_res_it_from.add_list_before(&fuzzy_space_words);
159 !word_res_it_from.at_last() && new_length > 0;
161 word_res_it_from.forward();
168 block_res_it.data()->block);
179 WERD_RES_LIST current_perm;
180 int16_t current_score;
181 bool improved =
false;
184 dump_words(best_perm, best_score, 1, improved);
189 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
192 dump_words(current_perm, current_score, 2, improved);
193 if (current_score > best_score) {
196 best_score = current_score;
202 dump_words(best_perm, best_score, 3, improved);
208 WERD_RES_IT src_it(&src_list);
209 WERD_RES_IT new_it(&new_list);
213 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
214 src_wd = src_it.data();
219 new_it.add_after_then_move(new_wd);
228 WERD_RES_IT word_it(&words);
233 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
234 word = word_it.data();
236 WordData word_data(block, row, word);
270 WERD_RES_IT word_res_it(&word_res_list);
271 int16_t total_score = 0;
272 int16_t word_count = 0;
273 int16_t done_word_count = 0;
278 int16_t prev_word_score = 0;
279 bool prev_word_done =
false;
280 bool prev_char_1 =
false;
281 bool prev_char_digit =
false;
282 bool current_char_1 =
false;
283 bool current_word_ok_so_far;
284 STRING punct_chars =
"!\"`',.:;";
285 bool prev_char_punct =
false;
286 bool current_char_punct =
false;
287 bool word_done =
false;
290 word = word_res_it.data();
294 total_score += prev_word_score;
299 prev_char_digit =
false;
300 prev_word_done =
false;
308 current_word_ok_so_far =
false;
310 (prev_char_digit && (
316 total_score += prev_word_score;
319 current_word_ok_so_far = word_done;
322 if (current_word_ok_so_far) {
323 prev_word_done =
true;
324 prev_word_score = word_len;
326 prev_word_done =
false;
332 for (i = 0, prev_char_1 =
false; i < word_len; i++) {
334 if (prev_char_1 || (current_char_1 && (i > 0)))
336 prev_char_1 = current_char_1;
342 for (i = 0, offset = 0, prev_char_punct =
false; i < word_len;
346 if (prev_char_punct || (current_char_punct && i > 0))
348 prev_char_punct = current_char_punct;
352 for (i = 0, offset = 0; i < word_len - 1;
361 word_res_it.forward();
362 }
while (word_res_it.data()->part_of_combo);
363 }
while (!word_res_it.at_first());
364 total_score += prev_word_score;
367 if (done_word_count == word_count)
377 for (i = 0, offset = 0; i < char_position;
403 WERD_RES_IT word_it(&words);
404 WERD_RES_IT prev_word_it(&words);
409 int16_t prev_right = -INT16_MAX;
412 int16_t min_gap = INT16_MAX;
414 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
415 word = word_it.data();
418 if (prev_right > -INT16_MAX) {
419 gap = box.
left() - prev_right;
423 prev_right = box.
right();
426 if (min_gap < INT16_MAX) {
427 prev_right = -INT16_MAX;
428 word_it.set_to_list(&words);
430 for (; (prev_right == -INT16_MAX) || !word_it.at_first();
432 word = word_it.data();
435 if (prev_right > -INT16_MAX) {
436 gap = box.
left() - prev_right;
437 if (gap <= min_gap) {
438 prev_word = prev_word_it.data();
444 copy_word =
new WERD;
445 *copy_word = *(prev_word->
word);
451 prev_word_it.add_before_then_move(combo);
458 delete word_it.extract();
467 prev_word_it = word_it;
470 prev_right = box.
right();
480 int16_t mode,
bool improved) {
481 WERD_RES_IT word_res_it(&perm);
486 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
487 word_res_it.forward()) {
488 if (!word_res_it.data()->part_of_combo) {
490 word_res_it.data()->best_choice->unichar_string();
499 tprintf(
"EXTRACTED (%d): \"", score);
502 tprintf(
"TESTED (%d): \"", score);
505 tprintf(
"RETURNED (%d): \"", score);
509 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
510 word_res_it.forward()) {
511 if (!word_res_it.data()->part_of_combo) {
513 word_res_it.data()->best_choice->unichar_string().string(),
514 (int)word_res_it.data()->best_choice->permuter());
518 }
else if (improved) {
520 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
521 word_res_it.forward()) {
522 if (!word_res_it.data()->part_of_combo) {
524 word_res_it.data()->best_choice->unichar_string().string(),
525 (int)word_res_it.data()->best_choice->permuter());
568 WERD_RES_LIST sub_word_list;
569 WERD_RES_IT sub_word_list_it(&sub_word_list);
574 word_res = word_res_it.data();
586 tprintf(
"FP fixspace working on \"%s\"\n",
590 sub_word_list_it.add_after_stay_put(word_res_it.extract());
592 new_length = sub_word_list.length();
593 word_res_it.add_list_before(&sub_word_list);
594 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
595 word_res_it.forward();
602 WERD_RES_IT best_perm_it(&best_perm);
603 WERD_RES_LIST current_perm;
604 WERD_RES_IT current_perm_it(¤t_perm);
606 int16_t current_score;
607 bool improved =
false;
611 dump_words(best_perm, best_score, 1, improved);
613 old_word_res = best_perm_it.data();
622 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
625 dump_words(current_perm, current_score, 2, improved);
626 if (current_score > best_score) {
629 best_score = current_score;
636 dump_words(best_perm, best_score, 3, improved);
646 WERD_RES_IT word_it(&words);
647 WERD_RES_IT worst_word_it;
648 float worst_noise_score = 9999;
649 int worst_blob_index = -1;
654 C_BLOB_IT rej_cblob_it;
655 C_BLOB_LIST new_blob_list;
656 C_BLOB_IT new_blob_it;
657 C_BLOB_IT new_rej_cblob_it;
659 int16_t start_of_noise_blob;
662 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
664 if (blob_index > -1 && worst_noise_score > noise_score) {
665 worst_noise_score = noise_score;
666 worst_blob_index = blob_index;
667 worst_word_it = word_it;
670 if (worst_blob_index < 0) {
677 word_res = worst_word_it.data();
681 new_blob_it.set_to_list(&new_blob_list);
683 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
684 new_blob_it.add_after_then_move(blob_it.extract());
686 start_of_noise_blob = blob_it.data()->bounding_box().left();
687 delete blob_it.extract();
689 new_word =
new WERD(&new_blob_list, word_res->
word);
697 (!rej_cblob_it.empty() &&
698 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
699 rej_cblob_it.forward()) {
700 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
705 worst_word_it.add_before_then_move(new_word_res);
711 float *worst_noise_score) {
712 float noise_score[512];
734 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
741 noise_score[i] = non_noise_limit;
746 tprintf(
"%1.1f ", noise_score[i]);
755 if (noise_score[i] >= non_noise_limit) {
767 if (noise_score[i] >= non_noise_limit) {
776 if (min_noise_blob > max_noise_blob)
779 *worst_noise_score = small_limit;
781 for (i = min_noise_blob; i <= max_noise_blob; i++) {
782 if (noise_score[i] < *worst_noise_score) {
784 *worst_noise_score = noise_score[i];
792 int16_t outline_count = 0;
793 int16_t max_dimension;
794 int16_t largest_outline_dimension = 0;
798 box = ol->bounding_box();
800 max_dimension = box.
height();
802 max_dimension = box.
width();
805 if (largest_outline_dimension < max_dimension)
806 largest_outline_dimension = max_dimension;
809 if (outline_count > 5) {
811 largest_outline_dimension *= 2;
818 largest_outline_dimension /= 2;
821 return largest_outline_dimension;
827 const bool show_map_detail =
false;
832 tprintf(
"Blob count: %d (word); %d/%d (rebuild word)\n",
838 if (show_map_detail) {
847 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
861 WERD_RES_IT word_it(&word_res_list);
867 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
868 word = word_it.data();
bool tessedit_prefer_joined_punct
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
WERD_CHOICE * prev_word_best_choice_
void * cancel_this
monitor-aware progress callback
const char * string() const
static WERD_RES * deep_copy(const WERD_RES *src)
void full_print(FILE *fp)
TBOX bounding_box() const
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const int kBlnBaselineOffset
void set_flag(WERD_FLAGS mask, bool value)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
C_BLOB_LIST * rej_cblob_list()
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
volatile int8_t ocr_alive
true if not last
BLOCK_RES_LIST block_res_list
float blob_noise_score(TBLOB *blob)
void fixspace_dbg(WERD_RES *word)
char * numeric_punctuation
double fixsp_small_outlines_size
bool flag(WERD_FLAGS mask) const
const STRING & unichar_lengths() const
bool get_isdigit(UNICHAR_ID unichar_id) const
int16_t safe_dict_word(const WERD_RES *werd_res)
int debug_fix_space_level
bool deadline_exceeded() const
bool fixspace_thinks_word_done(WERD_RES *word)
void set_blanks(uint8_t new_blanks)
void copy_on(WERD_RES *word_res)
UNICHAR_ID unichar_id(int index) const
DLLSYM void tprintf(const char *format,...)
TBOX bounding_box() const
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
C_BLOB_LIST * cblob_list()
GenericVector< TBLOB * > blobs
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
bool contains(const char c) const
void SetupWordPassN(int pass_n, WordData *word)
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
TBOX bounding_box() const
const UNICHARSET * uch_set
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
CANCEL_FUNC cancel
for errcode use
const STRING & unichar_string() const
char * conflict_set_I_l_1
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool check_debug_pt(WERD_RES *word, int location)
void join_on(WERD *other)
void transform_to_next_perm(WERD_RES_LIST &words)
int16_t progress
chars in this buffer(0)
WERD_CHOICE * best_choice
tesseract::BoxWord * box_word
int fixsp_non_noise_limit
void break_noisiest_blob_word(WERD_RES_LIST &words)
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)