75 int16_t err_count = 0;
96 int16_t *accepted_match_count) {
100 *accepted_match_count = 0;
128 int expected_outline_count;
133 expected_outline_count = 2;
135 expected_outline_count = 1;
136 return abs (outline_count - expected_outline_count);
140 bool good_quality_doc) {
169 while (page_res_it.
word () !=
nullptr) {
172 word = page_res_it.
word ();
174 if (word->
reject_map[i].accept_if_good_quality ())
183 word = page_res_it.
word ();
196 current_row = page_res_it.
row ();
197 while ((page_res_it.
word () !=
nullptr) &&
198 (page_res_it.
row () == current_row))
206 current_block =
nullptr;
207 current_row =
nullptr;
208 while (page_res_it.
word () !=
nullptr) {
209 if (current_block != page_res_it.
block ()) {
210 current_block = page_res_it.
block ();
214 if (current_row != page_res_it.
row ()) {
215 current_row = page_res_it.
row ();
235 bool good_quality_doc) {
236 int16_t block_no = 0;
242 bool prev_word_rejected;
243 int16_t char_quality = 0;
244 int16_t accepted_char_quality;
250 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
256 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
265 while ((word = page_res_it.
word()) !=
nullptr) {
266 current_block = page_res_it.
block();
272 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
276 prev_word_rejected =
false;
277 while ((word = page_res_it.
word()) !=
nullptr &&
278 (page_res_it.
block() == current_block)) {
291 &accepted_char_quality);
304 prev_word_rejected &&
310 prev_word_rejected = rej_word;
315 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
322 while (page_res_it.
word() !=
nullptr &&
323 page_res_it.
block() == current_block) {
324 current_row = page_res_it.
row();
338 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
342 prev_word_rejected =
false;
343 while ((word = page_res_it.
word()) !=
nullptr &&
344 page_res_it.
row () == current_row) {
362 &accepted_char_quality);
375 prev_word_rejected &&
381 prev_word_rejected = rej_word;
386 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
389 while (page_res_it.
word() !=
nullptr &&
390 page_res_it.
row() == current_row)
409 while (page_res_it.
word () !=
nullptr) {
422 bool prev_potential_marked =
false;
423 bool found_terrible_word =
false;
427 while (page_res_it.
word() !=
nullptr) {
429 if (pb !=
nullptr && !pb->
IsText()) {
433 word = page_res_it.
word();
442 found_terrible_word =
false;
444 prev_potential_marked =
false;
453 tprintf (
"T CRUNCHING: \"%s\"\n",
457 if (prev_potential_marked) {
458 while (copy_it.
word () != word) {
460 tprintf (
"P1 CRUNCHING: \"%s\"\n",
466 prev_potential_marked =
false;
468 found_terrible_word =
true;
472 garbage_level, ok_dict_word))) {
473 if (found_terrible_word) {
475 tprintf (
"P2 CRUNCHING: \"%s\"\n",
480 else if (!prev_potential_marked) {
481 copy_it = page_res_it;
482 prev_potential_marked =
true;
484 tprintf (
"P3 CRUNCHING: \"%s\"\n",
490 found_terrible_word =
false;
492 prev_potential_marked =
false;
494 tprintf (
"NO CRUNCH: \"%s\"\n",
525 (garbage_level !=
G_OK))
528 (garbage_level !=
G_OK))
531 if (crunch_mode > 0) {
533 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
549 bool word_crunchable;
550 int poor_indicator_count = 0;
559 if (adjusted_len > 10)
565 tprintf(
"Potential poor rating on \"%s\"\n",
568 poor_indicator_count++;
571 if (word_crunchable &&
574 tprintf(
"Potential poor cert on \"%s\"\n",
577 poor_indicator_count++;
580 if (garbage_level !=
G_OK) {
582 tprintf(
"Potential garbage on \"%s\"\n",
585 poor_indicator_count++;
593 bool deleting_from_bol =
false;
594 bool marked_delete_point =
false;
595 int16_t debug_delete_mode;
597 int16_t x_debug_delete_mode;
601 while (page_res_it.
word() !=
nullptr) {
602 word = page_res_it.
word();
608 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
613 deleting_from_bol =
true;
615 if (marked_delete_point) {
616 while (copy_it.
word() != word) {
618 x_debug_delete_mode);
620 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
629 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
634 deleting_from_bol =
false;
635 marked_delete_point =
false;
638 if (!marked_delete_point) {
639 copy_it = page_res_it;
640 marked_delete_point =
true;
645 deleting_from_bol =
false;
647 marked_delete_point =
false;
695 int isolated_digits = 0;
696 int isolated_alphas = 0;
697 int bad_char_count = 0;
702 int alpha_repetition_count = 0;
703 int longest_alpha_repetition_count = 0;
704 int longest_lower_run_len = 0;
705 int lower_string_count = 0;
706 int longest_upper_run_len = 0;
707 int upper_string_count = 0;
708 int total_alpha_count = 0;
709 int total_digit_count = 0;
711 for (; *str !=
'\0'; str += *(lengths++)) {
716 case SUBSEQUENT_UPPER:
718 state = SUBSEQUENT_UPPER;
719 upper_string_count++;
720 if (longest_upper_run_len < upper_string_count)
721 longest_upper_run_len = upper_string_count;
723 alpha_repetition_count++;
724 if (longest_alpha_repetition_count < alpha_repetition_count) {
725 longest_alpha_repetition_count = alpha_repetition_count;
730 alpha_repetition_count = 1;
738 alpha_repetition_count = 1;
739 upper_string_count = 1;
746 case SUBSEQUENT_LOWER:
748 state = SUBSEQUENT_LOWER;
749 lower_string_count++;
750 if (longest_lower_run_len < lower_string_count)
751 longest_lower_run_len = lower_string_count;
753 alpha_repetition_count++;
754 if (longest_alpha_repetition_count < alpha_repetition_count) {
755 longest_alpha_repetition_count = alpha_repetition_count;
760 alpha_repetition_count = 1;
768 alpha_repetition_count = 1;
769 lower_string_count = 1;
777 state = SUBSEQUENT_NUM;
789 if (*lengths == 1 && *str ==
' ')
819 total_alpha_count += total_digit_count - isolated_digits;
823 2 * (total_alpha_count - isolated_alphas) > len &&
833 strpbrk(str,
" ") ==
nullptr &&
842 ok_chars = len - bad_char_count - isolated_digits -
843 isolated_alphas - tess_rejs;
846 tprintf(
"garbage_word: \"%s\"\n",
848 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
850 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
852 if (bad_char_count == 0 &&
854 (len > isolated_digits + isolated_alphas || len <= 2))
857 if (tess_rejs > ok_chars ||
858 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
862 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
864 if (dodgy_chars > 5 || (dodgy_chars / (
float) len) > 0.5)
869 dodgy_chars = 2 * tess_rejs + bad_char_count;
870 if ((len == 4 && dodgy_chars > 2) ||
871 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
970 for (; *str !=
'\0'; str++) {
980 int16_t outline_count = 0;
981 int16_t small_outline_count = 0;
982 int16_t max_dimension;
985 for (
int b = 0; b < word->
NumBlobs(); ++b) {
989 box = ol->bounding_box();
991 max_dimension = box.
height();
993 max_dimension = box.
width();
994 if (max_dimension < small_limit)
995 small_outline_count++;
998 return small_outline_count >= outline_count;
BLOCK_RES * block() const
void set_unichar_id(UNICHAR_ID unichar_id, int index)
int crunch_leave_uc_strings
int16_t failure_count(WERD_RES *word)
void AcceptIfGoodQuality(int index)
bool get_islower(UNICHAR_ID unichar_id) const
void rej_word_block_rej()
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
tesseract::BoxWord * bln_boxes
int32_t whole_word_rej_count
uint32_t unsigned_size() const
bool tessedit_debug_block_rejection
const char * string() const
bool crunch_leave_accept_strings
TBOX bounding_box() const
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
bool quality_recoverable_rejects()
bool crunch_include_numerals
double tessedit_reject_doc_percent
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
double tessedit_reject_block_percent
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
const int kBlnBaselineOffset
bool crunch_early_convert_bad_unlv_chs
bool noise_outlines(TWERD *word)
void tilde_crunch(PAGE_RES_IT &page_res_it)
int crunch_pot_indicators
int tessedit_preserve_min_wd_len
double tessedit_reject_row_percent
double crunch_del_low_word
void convert_bad_unlv_chs(WERD_RES *word_res)
int16_t accepted_match_count
bool tessedit_preserve_blk_rej_perfect_wds
DocQualCallbacks(WERD_RES *word0)
double tessedit_good_doc_still_rowrej_wd
int16_t word_blob_quality(WERD_RES *word, ROW *row)
void CountAcceptedBlobs(int index)
void CountMatchingBlobs(int index)
WERD_RES * restart_page()
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
bool flag(WERD_FLAGS mask) const
const STRING & unichar_lengths() const
bool unlv_tilde_crunching
ROW_RES * prev_row() const
bool get_isdigit(UNICHAR_ID unichar_id) const
CRUNCH_MODE unlv_crunch_mode
int16_t safe_dict_word(const WERD_RES *werd_res)
double tessedit_whole_wd_rej_row_percent
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
bool tessedit_preserve_row_rej_perfect_wds
int16_t count_outline_errs(char c, int16_t outline_count)
POLY_BLOCK * poly_block() const
UNICHAR_ID unichar_id(int index) const
DLLSYM void tprintf(const char *format,...)
int16_t word_outline_errs(WERD_RES *word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool crunch_leave_ok_strings
double crunch_del_high_word
int crunch_long_repetitions
GenericVector< TBLOB * > blobs
double crunch_terrible_rating
bool crunch_early_merge_tess_fails
bool tessedit_unrej_any_wd
double crunch_del_min_width
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
double crunch_poor_garbage_cert
bool tessedit_good_quality_unrej
bool tessedit_dont_blkrej_good_wds
const UNICHARSET * uch_set
void tilde_delete(PAGE_RES_IT &page_res_it)
const STRING & unichar_string() const
int crunch_leave_lc_strings
bool check_debug_pt(WERD_RES *word, int location)
bool tessedit_dont_rowrej_good_wds
bool tessedit_use_reject_spaces
double crunch_poor_garbage_rate
bool tessedit_debug_doc_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
bool get_isupper(UNICHAR_ID unichar_id) const
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
void reject_whole_page(PAGE_RES_IT &page_res_it)
bool tessedit_row_rej_good_docs
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
double crunch_small_outlines_size
WERD_CHOICE * best_choice
bool crunch_terrible_garbage
void unrej_good_chs(WERD_RES *word, ROW *row)
double crunch_pot_poor_rate
double crunch_pot_poor_cert