27 static void countMatchingBlobs(int16_t& match_count,
int ) {
31 static void countAcceptedBlobs(
WERD_RES* word, int16_t& match_count,
32 int16_t& accepted_match_count,
int index) {
34 ++accepted_match_count;
39 static void acceptIfGoodQuality(
WERD_RES* word,
int index) {
40 if (word->
reject_map[index].accept_if_good_quality()) {
41 word->
reject_map[index].setrej_quality_accept();
52 int16_t match_count = 0;
55 using namespace std::placeholders;
58 std::bind(countMatchingBlobs, match_count, _1));
65 int16_t err_count = 0;
84 int16_t* accepted_match_count) {
86 *accepted_match_count = 0;
89 using namespace std::placeholders;
92 std::bind(countAcceptedBlobs,
93 word, *match_count, *accepted_match_count, _1));
104 using namespace std::placeholders;
106 *word->
rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
111 int expected_outline_count;
116 expected_outline_count = 2;
118 expected_outline_count = 1;
119 return abs (outline_count - expected_outline_count);
123 bool good_quality_doc) {
152 while (page_res_it.
word () !=
nullptr) {
155 word = page_res_it.
word ();
157 if (word->
reject_map[i].accept_if_good_quality ())
166 word = page_res_it.
word ();
179 current_row = page_res_it.
row ();
180 while ((page_res_it.
word () !=
nullptr) &&
181 (page_res_it.
row () == current_row))
189 current_block =
nullptr;
190 current_row =
nullptr;
191 while (page_res_it.
word () !=
nullptr) {
192 if (current_block != page_res_it.
block ()) {
193 current_block = page_res_it.
block ();
197 if (current_row != page_res_it.
row ()) {
198 current_row = page_res_it.
row ();
218 bool good_quality_doc) {
219 int16_t block_no = 0;
225 bool prev_word_rejected;
226 int16_t char_quality = 0;
227 int16_t accepted_char_quality;
233 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
239 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
248 while ((word = page_res_it.
word()) !=
nullptr) {
249 current_block = page_res_it.
block();
255 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
259 prev_word_rejected =
false;
260 while ((word = page_res_it.
word()) !=
nullptr &&
261 (page_res_it.
block() == current_block)) {
285 prev_word_rejected &&
291 prev_word_rejected = rej_word;
296 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
303 while (page_res_it.
word() !=
nullptr &&
304 page_res_it.
block() == current_block) {
305 current_row = page_res_it.
row();
319 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
323 prev_word_rejected =
false;
324 while ((word = page_res_it.
word()) !=
nullptr &&
325 page_res_it.
row () == current_row) {
342 &accepted_char_quality);
355 prev_word_rejected &&
361 prev_word_rejected = rej_word;
366 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
369 while (page_res_it.
word() !=
nullptr &&
370 page_res_it.
row() == current_row)
389 while (page_res_it.
word () !=
nullptr) {
402 bool prev_potential_marked =
false;
403 bool found_terrible_word =
false;
407 while (page_res_it.
word() !=
nullptr) {
409 if (pb !=
nullptr && !pb->
IsText()) {
413 word = page_res_it.
word();
422 found_terrible_word =
false;
424 prev_potential_marked =
false;
433 tprintf (
"T CRUNCHING: \"%s\"\n",
437 if (prev_potential_marked) {
438 while (copy_it.
word () != word) {
440 tprintf (
"P1 CRUNCHING: \"%s\"\n",
446 prev_potential_marked =
false;
448 found_terrible_word =
true;
452 garbage_level, ok_dict_word))) {
453 if (found_terrible_word) {
455 tprintf (
"P2 CRUNCHING: \"%s\"\n",
460 else if (!prev_potential_marked) {
461 copy_it = page_res_it;
462 prev_potential_marked =
true;
464 tprintf (
"P3 CRUNCHING: \"%s\"\n",
470 found_terrible_word =
false;
472 prev_potential_marked =
false;
474 tprintf (
"NO CRUNCH: \"%s\"\n",
505 (garbage_level !=
G_OK))
508 (garbage_level !=
G_OK))
511 if (crunch_mode > 0) {
513 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
529 bool word_crunchable;
530 int poor_indicator_count = 0;
539 if (adjusted_len > 10)
545 tprintf(
"Potential poor rating on \"%s\"\n",
548 poor_indicator_count++;
551 if (word_crunchable &&
554 tprintf(
"Potential poor cert on \"%s\"\n",
557 poor_indicator_count++;
560 if (garbage_level !=
G_OK) {
562 tprintf(
"Potential garbage on \"%s\"\n",
565 poor_indicator_count++;
573 bool deleting_from_bol =
false;
574 bool marked_delete_point =
false;
575 int16_t debug_delete_mode;
577 int16_t x_debug_delete_mode;
581 while (page_res_it.
word() !=
nullptr) {
582 word = page_res_it.
word();
588 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
593 deleting_from_bol =
true;
595 if (marked_delete_point) {
596 while (copy_it.
word() != word) {
598 x_debug_delete_mode);
600 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
609 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
614 deleting_from_bol =
false;
615 marked_delete_point =
false;
618 if (!marked_delete_point) {
619 copy_it = page_res_it;
620 marked_delete_point =
true;
625 deleting_from_bol =
false;
627 marked_delete_point =
false;
675 int isolated_digits = 0;
676 int isolated_alphas = 0;
677 int bad_char_count = 0;
682 int alpha_repetition_count = 0;
683 int longest_alpha_repetition_count = 0;
684 int longest_lower_run_len = 0;
685 int lower_string_count = 0;
686 int longest_upper_run_len = 0;
687 int upper_string_count = 0;
688 int total_alpha_count = 0;
689 int total_digit_count = 0;
691 for (; *str !=
'\0'; str += *(lengths++)) {
696 case SUBSEQUENT_UPPER:
698 state = SUBSEQUENT_UPPER;
699 upper_string_count++;
700 if (longest_upper_run_len < upper_string_count)
701 longest_upper_run_len = upper_string_count;
703 alpha_repetition_count++;
704 if (longest_alpha_repetition_count < alpha_repetition_count) {
705 longest_alpha_repetition_count = alpha_repetition_count;
710 alpha_repetition_count = 1;
719 alpha_repetition_count = 1;
720 upper_string_count = 1;
727 case SUBSEQUENT_LOWER:
729 state = SUBSEQUENT_LOWER;
730 lower_string_count++;
731 if (longest_lower_run_len < lower_string_count)
732 longest_lower_run_len = lower_string_count;
734 alpha_repetition_count++;
735 if (longest_alpha_repetition_count < alpha_repetition_count) {
736 longest_alpha_repetition_count = alpha_repetition_count;
741 alpha_repetition_count = 1;
750 alpha_repetition_count = 1;
751 lower_string_count = 1;
759 state = SUBSEQUENT_NUM;
772 if (*lengths == 1 && *str ==
' ')
802 total_alpha_count += total_digit_count - isolated_digits;
806 2 * (total_alpha_count - isolated_alphas) > len &&
816 strpbrk(str,
" ") ==
nullptr &&
825 ok_chars = len - bad_char_count - isolated_digits -
826 isolated_alphas - tess_rejs;
829 tprintf(
"garbage_word: \"%s\"\n",
831 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
833 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
835 if (bad_char_count == 0 &&
837 (len > isolated_digits + isolated_alphas || len <= 2))
840 if (tess_rejs > ok_chars ||
841 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
845 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
847 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
852 dodgy_chars = 2 * tess_rejs + bad_char_count;
853 if ((len == 4 && dodgy_chars > 2) ||
854 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
953 for (; *str !=
'\0'; str++) {
963 int16_t outline_count = 0;
964 int16_t small_outline_count = 0;
965 int16_t max_dimension;
968 for (
int b = 0; b < word->
NumBlobs(); ++b) {
972 box = ol->bounding_box();
974 max_dimension = box.
height();
976 max_dimension = box.
width();
977 if (max_dimension < small_limit)
978 small_outline_count++;
981 return small_outline_count >= outline_count;