21 #pragma warning(disable:4244) // Conversion warnings
100 inT16 *accepted_match_count) {
129 int expected_outline_count;
134 expected_outline_count = 2;
136 expected_outline_count = 1;
137 return abs (outline_count - expected_outline_count);
141 BOOL8 good_quality_doc) {
171 while (page_res_it.
word () !=
NULL) {
174 word = page_res_it.
word ();
176 if (word->
reject_map[i].accept_if_good_quality ())
185 word = page_res_it.
word ();
198 current_row = page_res_it.
row ();
199 while ((page_res_it.
word () !=
NULL) &&
200 (page_res_it.
row () == current_row))
208 current_block =
NULL;
210 while (page_res_it.
word () !=
NULL) {
211 if (current_block != page_res_it.
block ()) {
212 current_block = page_res_it.
block ();
216 if (current_row != page_res_it.
row ()) {
217 current_row = page_res_it.
row ();
237 BOOL8 good_quality_doc) {
244 BOOL8 prev_word_rejected;
245 inT16 char_quality = 0;
246 inT16 accepted_char_quality;
252 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
258 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
267 while ((word = page_res_it.
word()) !=
NULL) {
268 current_block = page_res_it.
block();
274 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
278 prev_word_rejected =
FALSE;
279 while ((word = page_res_it.
word()) !=
NULL &&
280 (page_res_it.
block() == current_block)) {
293 &accepted_char_quality);
306 prev_word_rejected &&
312 prev_word_rejected = rej_word;
317 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
324 while (page_res_it.
word() !=
NULL &&
325 page_res_it.
block() == current_block) {
326 current_row = page_res_it.
row();
340 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
344 prev_word_rejected =
FALSE;
345 while ((word = page_res_it.
word()) !=
NULL &&
346 page_res_it.
row () == current_row) {
364 &accepted_char_quality);
377 prev_word_rejected &&
383 prev_word_rejected = rej_word;
388 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
391 while (page_res_it.
word() !=
NULL &&
392 page_res_it.
row() == current_row)
412 while (page_res_it.
word () !=
NULL) {
430 while (page_res_it.
word() !=
NULL) {
436 word = page_res_it.
word();
445 found_terrible_word =
FALSE;
447 prev_potential_marked =
FALSE;
456 tprintf (
"T CRUNCHING: \"%s\"\n",
460 if (prev_potential_marked) {
461 while (copy_it.
word () != word) {
463 tprintf (
"P1 CRUNCHING: \"%s\"\n",
469 prev_potential_marked =
FALSE;
471 found_terrible_word =
TRUE;
475 garbage_level, ok_dict_word))) {
476 if (found_terrible_word) {
478 tprintf (
"P2 CRUNCHING: \"%s\"\n",
483 else if (!prev_potential_marked) {
484 copy_it = page_res_it;
485 prev_potential_marked =
TRUE;
487 tprintf (
"P3 CRUNCHING: \"%s\"\n",
493 found_terrible_word =
FALSE;
495 prev_potential_marked =
FALSE;
497 tprintf (
"NO CRUNCH: \"%s\"\n",
528 (garbage_level !=
G_OK))
531 (garbage_level !=
G_OK))
534 if (crunch_mode > 0) {
536 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
547 BOOL8 ok_dict_word) {
552 BOOL8 word_crunchable;
553 int poor_indicator_count = 0;
562 if (adjusted_len > 10)
568 tprintf(
"Potential poor rating on \"%s\"\n",
571 poor_indicator_count++;
574 if (word_crunchable &&
577 tprintf(
"Potential poor cert on \"%s\"\n",
580 poor_indicator_count++;
583 if (garbage_level !=
G_OK) {
585 tprintf(
"Potential garbage on \"%s\"\n",
588 poor_indicator_count++;
598 inT16 debug_delete_mode;
600 inT16 x_debug_delete_mode;
604 while (page_res_it.
word() !=
NULL) {
605 word = page_res_it.
word();
611 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
616 deleting_from_bol =
TRUE;
618 if (marked_delete_point) {
619 while (copy_it.
word() != word) {
621 x_debug_delete_mode);
623 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
632 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
637 deleting_from_bol =
FALSE;
638 marked_delete_point =
FALSE;
641 if (!marked_delete_point) {
642 copy_it = page_res_it;
643 marked_delete_point =
TRUE;
648 deleting_from_bol =
FALSE;
650 marked_delete_point =
FALSE;
698 int isolated_digits = 0;
699 int isolated_alphas = 0;
700 int bad_char_count = 0;
705 int alpha_repetition_count = 0;
706 int longest_alpha_repetition_count = 0;
707 int longest_lower_run_len = 0;
708 int lower_string_count = 0;
709 int longest_upper_run_len = 0;
710 int upper_string_count = 0;
711 int total_alpha_count = 0;
712 int total_digit_count = 0;
714 for (; *str !=
'\0'; str += *(lengths++)) {
719 case SUBSEQUENT_UPPER:
721 state = SUBSEQUENT_UPPER;
722 upper_string_count++;
723 if (longest_upper_run_len < upper_string_count)
724 longest_upper_run_len = upper_string_count;
726 alpha_repetition_count++;
727 if (longest_alpha_repetition_count < alpha_repetition_count) {
728 longest_alpha_repetition_count = alpha_repetition_count;
733 alpha_repetition_count = 1;
741 alpha_repetition_count = 1;
742 upper_string_count = 1;
749 case SUBSEQUENT_LOWER:
751 state = SUBSEQUENT_LOWER;
752 lower_string_count++;
753 if (longest_lower_run_len < lower_string_count)
754 longest_lower_run_len = lower_string_count;
756 alpha_repetition_count++;
757 if (longest_alpha_repetition_count < alpha_repetition_count) {
758 longest_alpha_repetition_count = alpha_repetition_count;
763 alpha_repetition_count = 1;
771 alpha_repetition_count = 1;
772 lower_string_count = 1;
780 state = SUBSEQUENT_NUM;
792 if (*lengths == 1 && *str ==
' ')
822 total_alpha_count += total_digit_count - isolated_digits;
826 2 * (total_alpha_count - isolated_alphas) > len &&
836 strpbrk(str,
" ") ==
NULL &&
845 ok_chars = len - bad_char_count - isolated_digits -
846 isolated_alphas - tess_rejs;
849 tprintf(
"garbage_word: \"%s\"\n",
851 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
853 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
855 if (bad_char_count == 0 &&
857 (len > isolated_digits + isolated_alphas || len <= 2))
860 if (tess_rejs > ok_chars ||
861 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
865 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
867 if (dodgy_chars > 5 || (dodgy_chars / (
float) len) > 0.5)
872 dodgy_chars = 2 * tess_rejs + bad_char_count;
873 if ((len == 4 && dodgy_chars > 2) ||
874 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
973 for (; *str !=
'\0'; str++) {
983 inT16 outline_count = 0;
984 inT16 small_outline_count = 0;
988 for (
int b = 0; b < word->
NumBlobs(); ++b) {
992 box = ol->bounding_box();
994 max_dimension = box.
height();
996 max_dimension = box.
width();
997 if (max_dimension < small_limit)
998 small_outline_count++;
1001 return small_outline_count >= outline_count;
void convert_bad_unlv_chs(WERD_RES *word_res)
void set_unichar_id(UNICHAR_ID unichar_id, int index)
inT16 word_outline_errs(WERD_RES *word)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool crunch_include_numerals
double crunch_terrible_rating
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
WERD_CHOICE * best_choice
DocQualCallbacks(WERD_RES *word0)
bool get_isupper(UNICHAR_ID unichar_id) const
inT32 whole_word_rej_count
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
bool tessedit_debug_block_rejection
int crunch_long_repetitions
const STRING & unichar_lengths() const
bool tessedit_use_reject_spaces
bool crunch_leave_ok_strings
void tilde_crunch(PAGE_RES_IT &page_res_it)
inT16 accepted_match_count
inT16 safe_dict_word(const WERD_RES *werd_res)
BOOL8 quality_recoverable_rejects()
const STRING & unichar_string() const
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
bool tessedit_good_quality_unrej
double tessedit_whole_wd_rej_row_percent
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
BLOCK_RES * block() const
double crunch_small_outlines_size
double tessedit_reject_block_percent
bool get_isdigit(UNICHAR_ID unichar_id) const
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
int crunch_pot_indicators
bool tessedit_dont_blkrej_good_wds
bool crunch_early_convert_bad_unlv_chs
WERD_RES * restart_page()
inT16 failure_count(WERD_RES *word)
inT16 count_outline_errs(char c, inT16 outline_count)
BOOL8 check_debug_pt(WERD_RES *word, int location)
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
const UNICHAR_ID unichar_id(int index) const
void rej_word_block_rej()
const UNICHARSET * uch_set
bool unlv_tilde_crunching
double crunch_poor_garbage_cert
bool tessedit_unrej_any_wd
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
const int kBlnBaselineOffset
double crunch_pot_poor_rate
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool get_islower(UNICHAR_ID unichar_id) const
int crunch_leave_uc_strings
double tessedit_reject_row_percent
inT16 word_blob_quality(WERD_RES *word, ROW *row)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
bool tessedit_debug_doc_rejection
TBOX bounding_box() const
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
bool crunch_early_merge_tess_fails
bool tessedit_row_rej_good_docs
bool tessedit_preserve_row_rej_perfect_wds
GenericVector< TBLOB * > blobs
tesseract::BoxWord * bln_boxes
double tessedit_good_doc_still_rowrej_wd
void CountAcceptedBlobs(int index)
void unrej_good_chs(WERD_RES *word, ROW *row)
BOOL8 noise_outlines(TWERD *word)
bool tessedit_dont_rowrej_good_wds
double crunch_pot_poor_cert
void reject_whole_page(PAGE_RES_IT &page_res_it)
double tessedit_reject_doc_percent
BOOL8 flag(WERD_FLAGS mask) const
double crunch_del_low_word
int tessedit_preserve_min_wd_len
double crunch_del_high_word
CRUNCH_MODE unlv_crunch_mode
double crunch_del_min_width
int crunch_leave_lc_strings
double crunch_poor_garbage_rate
void CountMatchingBlobs(int index)
const char * string() const
POLY_BLOCK * poly_block() const
bool crunch_leave_accept_strings
ROW_RES * prev_row() const
void tilde_delete(PAGE_RES_IT &page_res_it)
bool crunch_terrible_garbage
void AcceptIfGoodQuality(int index)
bool tessedit_preserve_blk_rej_perfect_wds