25 #pragma warning(disable:4244) // Conversion warnings
35 probability_in_context_(&
tesseract::
Dict::def_probability_in_context),
36 params_model_classify_(
NULL),
39 "A filename of user-provided words.",
40 getCCUtil()->params()),
42 "A suffix of user-provided words located in tessdata.",
43 getCCUtil()->params()),
45 "A filename of user-provided patterns.",
46 getCCUtil()->params()),
48 "A suffix of user-provided patterns located in "
50 getCCUtil()->params()),
52 getCCUtil()->params()),
54 getCCUtil()->params()),
56 getCCUtil()->params()),
58 " patterns.", getCCUtil()->params()),
60 " patterns.", getCCUtil()->params()),
62 "bigrams.", getCCUtil()->params()),
64 "Score penalty (0.1 = 10%) added if there are subscripts "
65 "or superscripts in a word, but it is otherwise OK.",
66 getCCUtil()->params()),
68 "Score penalty (0.1 = 10%) added if an xheight is "
69 "inconsistent.", getCCUtil()->params()),
71 "Score multiplier for word matches which have good case and"
72 "are frequent in the given language (lower is better).",
73 getCCUtil()->params()),
75 "Score multiplier for word matches that have good case "
76 "(lower is better).", getCCUtil()->params()),
78 "Default score multiplier for word matches, which may have "
79 "case issues (lower is better).",
80 getCCUtil()->params()),
82 "Multipler to for the best choice from the ngram model.",
83 getCCUtil()->params()),
85 "Score multiplier for glyph fragment segmentations which "
86 "do not match a dictionary word (lower is better).",
87 getCCUtil()->params()),
89 "Score multiplier for poorly cased strings that are not in"
90 " the dictionary and generally look like garbage (lower is"
91 " better).", getCCUtil()->params()),
93 "Output file for ambiguities found in the dictionary",
94 getCCUtil()->params()),
95 INT_MEMBER(dawg_debug_level, 0,
"Set to 1 for general debug info"
96 ", to 2 for more details, to 3 to see all the debug messages",
97 getCCUtil()->params()),
98 INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.",
99 getCCUtil()->params()),
100 INT_MEMBER(max_viterbi_list_size, 10,
"Maximum size of viterbi list.",
101 getCCUtil()->params()),
103 "Use only the first UTF8 step of the given string"
104 " when computing log probabilities.",
105 getCCUtil()->params()),
106 double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor",
107 getCCUtil()->params()),
109 "Certainty threshold for non-dict words",
110 getCCUtil()->params()),
111 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
112 "Reject certainty offset",
113 getCCUtil()->params()),
115 "Size of dict word to be treated as non-dict word",
116 getCCUtil()->params()),
117 double_MEMBER(stopper_certainty_per_char, -0.50,
"Certainty to add"
118 " for each dict char above small word size.",
119 getCCUtil()->params()),
121 "Max certaintly variation allowed in a word (in sigma)",
122 getCCUtil()->params()),
123 INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level",
124 getCCUtil()->params()),
126 "Make AcceptableChoice() always return false. Useful"
127 " when there is a need to explore all segmentations",
128 getCCUtil()->params()),
130 "Deprecated- backward compatablity only",
131 getCCUtil()->params()),
132 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
133 "Max words to keep in list",
134 getCCUtil()->params()),
135 STRING_MEMBER(word_to_debug,
"",
"Word for which stopper debug"
136 " information should be printed to stdout",
137 getCCUtil()->params()),
139 "Lengths of unichars in word_to_debug",
140 getCCUtil()->params()),
141 INT_MEMBER(fragments_debug, 0,
"Debug character fragments",
142 getCCUtil()->params()),
144 "Don't use any alphabetic-specific tricks."
145 "Set to true in the traineddata config file for"
146 " scripts that are cursive or inherently fixed-pitch",
147 getCCUtil()->params()),
148 BOOL_MEMBER(save_doc_words, 0,
"Save Document Words",
149 getCCUtil()->params()),
151 "Worst certainty for using pending dictionary",
152 getCCUtil()->params()),
154 "Worst certainty for words that can be inserted into the"
155 "document dictionary", getCCUtil()->params()),
156 INT_MEMBER(max_permuter_attempts, 10000,
"Maximum number of different"
157 " character choices to consider during permutation."
158 " This limit is especially useful when user patterns"
159 " are specified, since overly generic patterns can result in"
160 " dawg search exploring an overly large number of options.",
161 getCCUtil()->params()) {
162 dang_ambigs_table_ =
NULL;
163 replace_ambigs_table_ =
NULL;
164 reject_offset_ = 0.0;
167 last_word_on_line_ =
false;
168 hyphen_unichar_id_ = INVALID_UNICHAR_ID;
169 document_words_ =
NULL;
171 dawg_cache_is_ours_ =
false;
172 pending_words_ =
NULL;
176 unambig_dawg_ =
NULL;
177 wordseg_rating_adjust_factor_ = -1.0f;
178 output_ambig_words_file_ =
NULL;
182 if (hyphen_word_ !=
NULL)
delete hyphen_word_;
183 if (output_ambig_words_file_ !=
NULL) fclose(output_ambig_words_file_);
205 if (dawg_cache !=
NULL) {
206 dawg_cache_ = dawg_cache;
207 dawg_cache_is_ours_ =
false;
210 dawg_cache_is_ours_ =
true;
220 if (punc_dawg_) dawgs_ += punc_dawg_;
225 if (system_dawg) dawgs_ += system_dawg;
230 if (number_dawg) dawgs_ += number_dawg;
239 if (freq_dawg_) { dawgs_ += freq_dawg_; }
244 if (unambig_dawg_) dawgs_ += unambig_dawg_;
251 if (((
STRING &)user_words_file).length() > 0) {
287 dawgs_ += document_words_;
297 for (
int i = 0; i < dawgs_.
length(); ++i) {
298 const Dawg *dawg = dawgs_[i];
300 for (
int j = 0; j < dawgs_.
length(); ++j) {
301 const Dawg *other = dawgs_[j];
302 if (dawg !=
NULL && other !=
NULL &&
304 kDawgSuccessors[dawg->
type()][other->
type()]) *lst += j;
313 for (
int i = 0; i < dawgs_.
size(); i++) {
314 if (!dawg_cache_->
FreeDawg(dawgs_[i])) {
318 dawg_cache_->
FreeDawg(bigram_dawg_);
319 if (dawg_cache_is_ours_) {
326 document_words_ =
NULL;
327 if (pending_words_ !=
NULL) {
328 delete pending_words_;
329 pending_words_ =
NULL;
338 bool word_end)
const {
342 tprintf(
"def_letter_is_okay: current unichar=%s word_end=%d"
343 " num active dawgs=%d\n",
352 unichar_id == INVALID_UNICHAR_ID) {
369 if (!dawg && !punc_dawg) {
371 tprintf(
"Received DawgPosition with no dawg or punc_dawg. wth?\n");
379 if (punc_transition_edge != NO_EDGE) {
382 for (
int s = 0; s < slist.
length(); ++s) {
383 int sdawg_index = slist[s];
384 const Dawg *sdawg = dawgs_[sdawg_index];
387 if (dawg_edge != NO_EDGE) {
389 tprintf(
"Letter found in dawg %d\n", sdawg_index);
395 "Append transition from punc dawg to current dawgs: ");
402 if (punc_edge != NO_EDGE) {
404 tprintf(
"Letter found in punctuation dawg\n");
409 "Extend punctuation dawg: ");
419 EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
420 : punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
421 if (punc_edge != NO_EDGE) {
426 "Return to punctuation dawg: ");
446 EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
454 if (edge != NO_EDGE) {
460 tprintf(
"Punctuation constraint not satisfied at end of word.\n");
469 "Append current dawg to updated active dawgs: ");
481 tprintf(
"Returning %d for permuter code for this character.\n");
494 unichar_id_patterns.
push_back(unichar_id);
496 &unichar_id_patterns);
497 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
500 for (
int k = 0; k < 2; ++k) {
502 ? dawg->
edge_char_of(node, unichar_id_patterns[i], word_end)
504 if (edge == NO_EDGE)
continue;
515 "Append current dawg to updated active dawgs: ");
524 bool ambigs_mode)
const {
527 *active_dawgs = hyphen_active_dawgs_;
529 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
531 hyphen_active_dawgs_[i].dawg_index,
532 hyphen_active_dawgs_[i].dawg_ref);
541 bool suppress_patterns)
const {
542 bool punc_dawg_available =
543 (punc_dawg_ !=
NULL) &&
546 for (
int i = 0; i < dawgs_.
length(); i++) {
547 if (dawgs_[i] !=
NULL &&
549 int dawg_ty = dawgs_[i]->type();
552 *dawg_pos_vec +=
DawgPosition(-1, NO_EDGE, i, NO_EDGE,
false);
557 }
else if (!punc_dawg_available || !subsumed_by_punc) {
558 *dawg_pos_vec +=
DawgPosition(i, NO_EDGE, -1, NO_EDGE,
false);
574 if (hyphen_word_)
return;
578 int stringlen = best_choice.
length();
584 if (best_choice.
length() >= kDocDictMaxRepChars) {
585 int num_rep_chars = 1;
587 for (
int i = 1; i < best_choice.
length(); ++i) {
593 if (num_rep_chars == kDocDictMaxRepChars)
return;
615 strcpy(filename,
getCCUtil()->imagefile.string());
616 strcat(filename,
".doc");
617 doc_word_file =
open_file (filename,
"a");
618 fprintf(doc_word_file,
"%s\n",
620 fclose(doc_word_file);
628 float additional_adjust,
636 float adjust_factor = additional_adjust;
637 float new_rating = word->
rating();
638 new_rating += kRatingPad;
639 const char *xheight_triggered =
"";
642 switch (xheight_consistency) {
645 xheight_triggered =
", xhtBAD";
649 xheight_triggered =
", xhtSUB";
659 tprintf(
"Consistency could not be calculated.\n");
663 tprintf(
"%sWord: %s %4.2f%s", nonword ?
"Non-" :
"",
669 if (case_is_ok && punc_is_ok) {
671 new_rating *= adjust_factor;
675 new_rating *= adjust_factor;
677 if (!case_is_ok)
tprintf(
", C");
678 if (!punc_is_ok)
tprintf(
", P");
686 new_rating *= adjust_factor;
690 new_rating *= adjust_factor;
695 new_rating *= adjust_factor;
699 new_rating -= kRatingPad;
700 if (modify_rating) word->
set_rating(new_rating);
701 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
711 word_ptr = &temp_word;
719 int last_index = word_ptr->
length() - 1;
723 i == last_index)))
break;
733 delete[] active_dawgs;
740 if (bigram_dawg_ ==
NULL)
return false;
744 int w1start, w1end, w2start, w2end;
750 if (w1start >= w1end)
return word1.
length() < 3;
751 if (w2start >= w2end)
return word2.
length() < 3;
755 bigram_string.
reserve(w1end + w2end + 1);
756 for (
int i = w1start; i < w1end; i++) {
760 bigram_string.
push_back(question_unichar_id_);
762 bigram_string += normed_ids;
765 for (
int i = w2start; i < w2end; i++) {
769 bigram_string.
push_back(question_unichar_id_);
771 bigram_string += normed_ids;
774 for (
int i = 0; i < bigram_string.
size(); ++i) {
785 int last_index = word.
length() - 1;
787 for (i = 0; i <= last_index; ++i) {
790 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
794 }
else if ((new_len = new_word.length()) == 0 ||
799 for (i = 0; i < dawgs_.
size(); ++i) {
800 if (dawgs_[i] !=
NULL &&
802 dawgs_[i]->word_in_dawg(new_word))
return true;
const STRING & lang() const
DawgPositionVector * active_dawgs
#define STRING_MEMBER(name, val, comment, vec)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const CCUtil * getCCUtil() const
void punct_stripped(int *start_core, int *end_core) const
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
bool get_isupper(UNICHAR_ID unichar_id) const
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
double segment_penalty_dict_case_ok
void set_permuter(uinT8 perm)
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
double xheight_penalty_inconsistent
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
#define BOOL_MEMBER(name, val, comment, vec)
double segment_penalty_dict_case_bad
TessdataManager tessdata_manager
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
char * user_patterns_file
const STRING & unichar_string() const
static const UNICHAR_ID kPatternUnicharID
int GetTopScriptID() const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
const UNICHARSET * unicharset() const
bool get_isdigit(UNICHAR_ID unichar_id) const
double doc_dict_pending_threshold
#define BOOL_INIT_MEMBER(name, val, comment, vec)
static DawgCache * GlobalDawgCache()
void copy_hyphen_info(WERD_CHOICE *word) const
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
bool valid_punctuation(const WERD_CHOICE &word)
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
double xheight_penalty_subscripts
Dawg * GetSquishedDawg(const STRING &lang, const char *data_file_name, TessdataType tessdata_dawg_type, int debug_level)
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
void initialize_patterns(UNICHARSET *unicharset)
const UNICHAR_ID unichar_id(int index) const
void delete_data_pointers()
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
double segment_penalty_dict_frequent_word
const STRING debug_string() const
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
#define INT_MEMBER(name, val, comment, vec)
const STRING & GetDataFileName() const
char * user_patterns_suffix
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
double doc_dict_certainty_threshold
bool get_ispunctuation(UNICHAR_ID unichar_id) const
STRING language_data_path_prefix
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
void Load(DawgCache *dawg_cache)
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
GenericVector< int > SuccessorList
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
const UNICHARSET & getUnicharset() const
double segment_penalty_dict_nonword
#define STRING_INIT_MEMBER(name, val, comment, vec)
FILE * open_file(const char *filename, const char *mode)
double segment_penalty_garbage
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
#define double_MEMBER(name, val, comment, vec)
DawgPositionVector * updated_dawgs
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
const char * string() const
PermuterType permuter() const
bool FreeDawg(Dawg *dawg)
void set_adjust_factor(float factor)
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
void set_rating(float new_val)