21 #pragma warning(disable:4244) // Conversion warnings
22 #pragma warning(disable:4305) // int/float warnings
45 #include "config_auto.h"
64 if (word->
done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65 one_ell_conflict(word,
FALSE)) {
66 if (tessedit_rejection_debug)
tprintf(
"one_ell_conflict detected\n");
69 if (word->
done && ((!word_from_dict &&
71 if (tessedit_rejection_debug)
tprintf(
"non-dict or ambig word detected\n");
74 if (tessedit_rejection_debug) {
93 check_debug_pt(word, -1);
100 if (tessedit_reject_mode == 0) {
103 }
else if (tessedit_reject_mode == 5) {
112 one_ell_conflict(word,
TRUE);
122 if (rej_use_tess_blanks &&
127 if (rej_use_good_perm) {
131 (!rej_use_sensible_wd ||
132 acceptable_word_string(*word->
uch_set,
138 if (rej_alphas_in_number_perm) {
139 for (i = 0, offset = 0;
157 tprintf(
"BAD tessedit_reject_mode\n");
161 if (tessedit_image_border > -1)
162 reject_edge_blobs(word);
164 check_debug_pt (word, 10);
165 if (tessedit_rejection_debug) {
167 tprintf(
"Certainty: %f Rating: %f\n",
173 check_debug_pt(word, 20);
228 float bestgap = 0.0f;
231 BLOB_CHOICE_IT choice_it;
233 int blob_count = word->
length();
236 for (
int i = 0; i < blob_count; ++i) {
240 gapstart = ratings[0] - 1;
241 if (blob_count >= 3) {
242 for (
int index = 0; index < blob_count - 1; index++) {
243 if (ratings[index + 1] - ratings[index] > bestgap) {
244 bestgap = ratings[index + 1] - ratings[index];
246 gapstart = ratings[index];
250 threshold = gapstart + bestgap / 2;
273 for (
int blobindex = 0; blobindex < blobcount; blobindex++) {
279 word->
reject_map[blobindex].setrej_edge_char();
296 inT16 first_alphanum_index_;
297 inT16 first_alphanum_offset_;
300 BOOL8 non_conflict_set_char;
304 BOOL8 dict_perm_type;
310 word_len = strlen (lengths);
323 for (i = 0, offset = 0, non_conflict_set_char =
FALSE;
324 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
325 non_conflict_set_char =
329 if (!non_conflict_set_char) {
347 dict_word_ok = (dict_word_type > 0) &&
352 (dict_perm_type && dict_word_ok)) {
355 if (lengths[first_alphanum_index_] == 1 &&
356 word[first_alphanum_offset_] ==
'I') {
362 setrej_1Il_conflict();
371 if (lengths[first_alphanum_index_] == 1 &&
372 word[first_alphanum_offset_] ==
'l') {
378 setrej_1Il_conflict();
402 if (lengths[first_alphanum_index_] == 1 &&
403 word[first_alphanum_offset_] ==
'l') {
410 else if (lengths[first_alphanum_index_] == 1 &&
411 word[first_alphanum_offset_] ==
'I') {
430 for (i = 0, offset = 0; word[offset] !=
'\0';
432 if ((!allow_1s || (word[offset] !=
'1')) &&
435 word_res->
reject_map[i].setrej_1Il_conflict ();
452 setrej_1Il_conflict ();
470 const char *word_lengths) {
474 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
483 const char *word_lengths) {
487 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
496 const char *word_lengths) {
501 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
510 const char *word_lengths) {
514 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
516 (word_lengths[i] != 1 || word[offset] !=
'1'))
534 for (i = 0, offset = 0; i < word_len;
549 for (i = 0, offset = 0; i < word_len;
584 inT16 accepted_char_quality;
601 (char_quality == accepted_char_quality))
619 int prev_right = -9999;
629 for (i = 0; i < best_choice->
length() && i < num_blobs; ++i) {
632 if (i + 1 == num_blobs)
638 (out_box.
left() > prev_right) && (out_box.
right() < next_left)) {
639 aspect_ratio = out_box.
width() / (float) out_box.
height();
647 word_res->
reject_map[i].setrej_hyphen_accept();
654 else if (best_choice->
unichar_id(i) == unichar_dash) {
657 word_res->
reject_map[i].setrej_hyphen_accept();
666 prev_right = out_box.
right();
682 for (i = 0; i < best_choice->
length() && i < num_blobs; ++i) {
694 if (unichar_0 == INVALID_UNICHAR_ID ||
696 unichar_O == INVALID_UNICHAR_ID ||
700 for (i = 1; i < best_choice->
length(); ++i) {
701 if (best_choice->
unichar_id(i) == unichar_0 ||
704 if ((i+1) < best_choice->
length() &&
711 (i+1) < best_choice->
length() &&
714 (i+2) < best_choice->
length() &&
723 (((i+1) < best_choice->
length() &&
727 (i == best_choice->
length() - 1))) {
732 (i+1) < best_choice->
length() &&
738 (i+2) < best_choice->
length() &&
750 (i+2) < best_choice->
length() &&
760 (i+1) < best_choice->
length() &&
770 if (best_choice->
unichar_id(i-2) == unichar_O) {
773 while (i < best_choice->length() &&
786 return ch_set.
get_isupper(unichar_id) && !ch_set.
eq(unichar_id,
"O");
790 return ch_set.
get_isdigit(unichar_id) && !ch_set.
eq(unichar_id,
"0");
void set_unichar_id(UNICHAR_ID unichar_id, int index)
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
tesseract::BoxWord * box_word
void reject_mostly_rejects(WERD_RES *word)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
WERD_CHOICE * best_choice
void dont_allow_1Il(WERD_RES *word)
bool get_isupper(UNICHAR_ID unichar_id) const
const TBOX & BlobBox(int index) const
char * ok_repeated_ch_non_alphanum_wds
void flip_0O(WERD_RES *word)
void reject_blanks(WERD_RES *word)
void flip_hyphens(WERD_RES *word)
const STRING & unichar_lengths() const
TBOX bounding_box() const
int dict_word(const WERD_CHOICE &word)
bool rej_1Il_trust_permuter_type
inT16 safe_dict_word(const WERD_RES *werd_res)
bool dangerous_ambig_found() const
inT16 count_alphanums(const WERD_CHOICE &word)
const STRING & unichar_string() const
bool get_isdigit(UNICHAR_ID unichar_id) const
void rej_word_small_xht()
inT16 alpha_count(const char *word, const char *word_lengths)
inT16 first_alphanum_index(const char *word, const char *word_lengths)
#define CLISTIZE(CLASSNAME)
void reject_edge_blobs(WERD_RES *word)
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
const UNICHAR_ID unichar_id(int index) const
const UNICHARSET * uch_set
char * conflict_set_I_l_1
void init_to_size(int size, T t)
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
double tessedit_lower_flip_hyphen
tesseract::Tesseract * tesseract
const int kBlnBaselineOffset
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
void rej_word_mostly_rej()
double rej_whole_of_mostly_reject_word_fract
GenericVector< TBLOB * > blobs
double tessedit_upper_flip_hyphen
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool rej_1Il_use_dict_word
bool get_enabled(UNICHAR_ID unichar_id) const
bool get_isalpha(UNICHAR_ID unichar_id) const
void flip_hyphens(WERD_RES *word)
void initialise(inT16 length)
float compute_reject_threshold(WERD_CHOICE *word)
TBOX bounding_box() const
void reject_I_1_L(WERD_RES *word)
const char * string() const
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
void rej_word_contains_blanks()
void rej_word_bad_permuter()
void reject_poor_matches(WERD_RES *word)
void flip_0O(WERD_RES *word)
int tessedit_image_border
BOOL8 contains(const char c) const
void rej_word_not_tess_accepted()
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool contains_unichar_id(UNICHAR_ID unichar_id) const