32 probability_in_context_(&
tesseract::
Dict::def_probability_in_context),
33 params_model_classify_(nullptr),
35 wildcard_unichar_id_(INVALID_UNICHAR_ID),
36 apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37 question_unichar_id_(INVALID_UNICHAR_ID),
38 slash_unichar_id_(INVALID_UNICHAR_ID),
39 hyphen_unichar_id_(INVALID_UNICHAR_ID),
40 STRING_MEMBER(user_words_file,
"",
"A filename of user-provided words.",
41 getCCUtil()->params()),
43 "A suffix of user-provided words located in tessdata.",
44 getCCUtil()->params()),
46 "A filename of user-provided patterns.",
47 getCCUtil()->params()),
49 "A suffix of user-provided patterns located in "
51 getCCUtil()->params()),
53 getCCUtil()->params()),
55 getCCUtil()->params()),
57 getCCUtil()->params()),
59 "Load dawg with punctuation"
61 getCCUtil()->params()),
63 "Load dawg with number"
65 getCCUtil()->params()),
67 "Load dawg with special word "
69 getCCUtil()->params()),
71 "Score penalty (0.1 = 10%) added if there are subscripts "
72 "or superscripts in a word, but it is otherwise OK.",
73 getCCUtil()->params()),
75 "Score penalty (0.1 = 10%) added if an xheight is "
77 getCCUtil()->params()),
79 "Score multiplier for word matches which have good case and"
80 " are frequent in the given language (lower is better).",
81 getCCUtil()->params()),
83 "Score multiplier for word matches that have good case "
85 getCCUtil()->params()),
87 "Default score multiplier for word matches, which may have "
88 "case issues (lower is better).",
89 getCCUtil()->params()),
91 "Score multiplier for glyph fragment segmentations which "
92 "do not match a dictionary word (lower is better).",
93 getCCUtil()->params()),
95 "Score multiplier for poorly cased strings that are not in"
96 " the dictionary and generally look like garbage (lower is"
98 getCCUtil()->params()),
100 "Output file for ambiguities found in the dictionary",
101 getCCUtil()->params()),
103 "Set to 1 for general debug info"
104 ", to 2 for more details, to 3 to see all the debug messages",
105 getCCUtil()->params()),
106 INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.",
107 getCCUtil()->params()),
109 "Use only the first UTF8 step of the given string"
110 " when computing log probabilities.",
111 getCCUtil()->params()),
112 double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor",
113 getCCUtil()->params()),
115 "Certainty threshold for non-dict words",
116 getCCUtil()->params()),
117 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
118 "Reject certainty offset", getCCUtil()->params()),
120 "Size of dict word to be treated as non-dict word",
121 getCCUtil()->params()),
124 " for each dict char above small word size.",
125 getCCUtil()->params()),
127 "Max certaintly variation allowed in a word (in sigma)",
128 getCCUtil()->params()),
129 INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level",
130 getCCUtil()->params()),
132 "Make AcceptableChoice() always return false. Useful"
133 " when there is a need to explore all segmentations",
134 getCCUtil()->params()),
135 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
136 "Max words to keep in list", getCCUtil()->params()),
138 "Word for which stopper debug"
139 " information should be printed to stdout",
140 getCCUtil()->params()),
142 "Don't use any alphabetic-specific tricks."
143 " Set to true in the traineddata config file for"
144 " scripts that are cursive or inherently fixed-pitch",
145 getCCUtil()->params()),
146 BOOL_MEMBER(save_doc_words, 0,
"Save Document Words",
147 getCCUtil()->params()),
149 "Worst certainty for using pending dictionary",
150 getCCUtil()->params()),
152 "Worst certainty for words that can be inserted into the"
153 " document dictionary",
154 getCCUtil()->params()),
156 "Maximum number of different"
157 " character choices to consider during permutation."
158 " This limit is especially useful when user patterns"
159 " are specified, since overly generic patterns can result in"
160 " dawg search exploring an overly large number of options.",
161 getCCUtil()->params()) {
162 reject_offset_ = 0.0;
164 hyphen_word_ =
nullptr;
165 last_word_on_line_ =
false;
166 document_words_ =
nullptr;
167 dawg_cache_ =
nullptr;
168 dawg_cache_is_ours_ =
false;
169 pending_words_ =
nullptr;
170 bigram_dawg_ =
nullptr;
171 freq_dawg_ =
nullptr;
172 punc_dawg_ =
nullptr;
173 unambig_dawg_ =
nullptr;
174 wordseg_rating_adjust_factor_ = -1.0f;
175 output_ambig_words_file_ =
nullptr;
181 if (output_ambig_words_file_ !=
nullptr) fclose(output_ambig_words_file_);
193 if (dawgs_.
size() != 0) this->
End();
200 if (dawg_cache !=
nullptr) {
201 dawg_cache_ = dawg_cache;
202 dawg_cache_is_ours_ =
false;
205 dawg_cache_is_ours_ =
true;
215 if (punc_dawg_) dawgs_ += punc_dawg_;
220 if (system_dawg) dawgs_ += system_dawg;
225 if (number_dawg) dawgs_ += number_dawg;
236 if (freq_dawg_) dawgs_ += freq_dawg_;
241 if (unambig_dawg_) dawgs_ += unambig_dawg_;
283 dawgs_ += document_words_;
296 if (punc_dawg_) dawgs_ += punc_dawg_;
301 if (system_dawg) dawgs_ += system_dawg;
306 if (number_dawg) dawgs_ += number_dawg;
352 if (dawgs_.
empty())
return false;
357 for (
int i = 0; i < dawgs_.
size(); ++i) {
358 const Dawg* dawg = dawgs_[i];
360 for (
int j = 0; j < dawgs_.
size(); ++j) {
361 const Dawg* other = dawgs_[j];
362 if (dawg !=
nullptr && other !=
nullptr &&
364 kDawgSuccessors[dawg->
type()][other->
type()])
373 if (dawgs_.
size() == 0)
return;
374 for (
int i = 0; i < dawgs_.
size(); i++) {
375 if (!dawg_cache_->
FreeDawg(dawgs_[i])) {
379 dawg_cache_->
FreeDawg(bigram_dawg_);
380 if (dawg_cache_is_ours_) {
382 dawg_cache_ =
nullptr;
387 document_words_ =
nullptr;
388 delete pending_words_;
389 pending_words_ =
nullptr;
397 auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
403 "def_letter_is_okay: current unichar=%s word_end=%d"
404 " num active dawgs=%d\n",
406 dawg_args->active_dawgs->
size());
413 unichar_id == INVALID_UNICHAR_ID) {
420 dawg_args->updated_dawgs->clear();
421 dawg_args->valid_end =
false;
426 for (
int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
427 const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428 const Dawg* punc_dawg =
432 if (!dawg && !punc_dawg) {
434 tprintf(
"Received DawgPosition with no dawg or punc_dawg. wth?\n");
442 if (punc_transition_edge != NO_EDGE) {
445 for (
int s = 0; s < slist.
size(); ++s) {
446 int sdawg_index = slist[s];
447 const Dawg* sdawg = dawgs_[sdawg_index];
450 if (dawg_edge != NO_EDGE) {
452 tprintf(
"Letter found in dawg %d\n", sdawg_index);
454 dawg_args->updated_dawgs->add_unique(
456 punc_transition_edge,
false),
458 "Append transition from punc dawg to current dawgs: ");
462 dawg_args->valid_end =
true;
467 punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
468 if (punc_edge != NO_EDGE) {
470 tprintf(
"Letter found in punctuation dawg\n");
472 dawg_args->updated_dawgs->add_unique(
476 if (punc_dawg->
end_of_word(punc_edge)) dawg_args->valid_end =
true;
488 : punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
489 if (punc_edge != NO_EDGE) {
490 dawg_args->updated_dawgs->add_unique(
495 if (punc_dawg->
end_of_word(punc_edge)) dawg_args->valid_end =
true;
518 node,
char_for_dawg(unicharset, unichar_id, dawg), word_end);
525 if (edge != NO_EDGE) {
531 tprintf(
"Punctuation constraint not satisfied at end of word.\n");
538 dawg_args->valid_end =
true;
539 dawg_args->updated_dawgs->add_unique(
543 "Append current dawg to updated active dawgs: ");
552 dawg_args->permuter = curr_perm;
555 tprintf(
"Returning %d for permuter code for this character.\n",
556 dawg_args->permuter);
558 return dawg_args->permuter;
569 unichar_id_patterns.
push_back(unichar_id);
571 &unichar_id_patterns);
572 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
575 for (
int k = 0; k < 2; ++k) {
577 (k == 0) ? dawg->
edge_char_of(node, unichar_id_patterns[i], word_end)
579 unichar_id_patterns[i], word_end);
580 if (edge == NO_EDGE)
continue;
592 "Append current dawg to updated active dawgs: ");
601 bool ambigs_mode)
const {
604 *active_dawgs = hyphen_active_dawgs_;
606 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
608 hyphen_active_dawgs_[i].dawg_index,
609 hyphen_active_dawgs_[i].dawg_ref);
618 bool suppress_patterns)
const {
619 bool punc_dawg_available =
620 (punc_dawg_ !=
nullptr) &&
623 for (
int i = 0; i < dawgs_.
size(); i++) {
624 if (dawgs_[i] !=
nullptr &&
626 int dawg_ty = dawgs_[i]->type();
629 *dawg_pos_vec +=
DawgPosition(-1, NO_EDGE, i, NO_EDGE,
false);
634 }
else if (!punc_dawg_available || !subsumed_by_punc) {
635 *dawg_pos_vec +=
DawgPosition(i, NO_EDGE, -1, NO_EDGE,
false);
651 if (hyphen_word_)
return;
653 int stringlen = best_choice.
length();
655 if (
valid_word(best_choice) || stringlen < 2)
return;
658 if (best_choice.
length() >= kDocDictMaxRepChars) {
659 int num_rep_chars = 1;
661 for (
int i = 1; i < best_choice.
length(); ++i) {
667 if (num_rep_chars == kDocDictMaxRepChars)
return;
690 FILE* doc_word_file = fopen(filename.
c_str(),
"a");
691 if (doc_word_file ==
nullptr) {
692 tprintf(
"Error: Could not open file %s\n", filename.
c_str());
696 fclose(doc_word_file);
703 float additional_adjust,
bool modify_rating,
707 bool case_is_ok = (is_han ||
case_ok(*word));
710 float adjust_factor = additional_adjust;
711 float new_rating = word->
rating();
712 new_rating += kRatingPad;
713 const char* xheight_triggered =
"";
716 switch (xheight_consistency) {
719 xheight_triggered =
", xhtBAD";
723 xheight_triggered =
", xhtSUB";
733 tprintf(
"Consistency could not be calculated.\n");
737 tprintf(
"%sWord: %s %4.2f%s", nonword ?
"Non-" :
"",
742 if (case_is_ok && punc_is_ok) {
744 new_rating *= adjust_factor;
748 new_rating *= adjust_factor;
750 if (!case_is_ok)
tprintf(
", C");
751 if (!punc_is_ok)
tprintf(
", P");
756 if (!is_han && freq_dawg_ !=
nullptr && freq_dawg_->
word_in_dawg(*word)) {
759 new_rating *= adjust_factor;
763 new_rating *= adjust_factor;
768 new_rating *= adjust_factor;
772 new_rating -= kRatingPad;
773 if (modify_rating) word->
set_rating(new_rating);
774 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
784 word_ptr = &temp_word;
792 int last_index = word_ptr->
length() - 1;
807 delete[] active_dawgs;
815 if (bigram_dawg_ ==
nullptr)
return false;
819 int w1start, w1end, w2start, w2end;
825 if (w1start >= w1end)
return word1.
length() < 3;
826 if (w2start >= w2end)
return word2.
length() < 3;
830 bigram_string.
reserve(w1end + w2end + 1);
831 for (
int i = w1start; i < w1end; i++) {
835 bigram_string.
push_back(question_unichar_id_);
837 bigram_string += normed_ids;
840 for (
int i = w2start; i < w2end; i++) {
844 bigram_string.
push_back(question_unichar_id_);
846 bigram_string += normed_ids;
849 for (
int i = 0; i < bigram_string.
size(); ++i) {
860 int last_index = word.
length() - 1;
862 for (i = 0; i <= last_index; ++i) {
865 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
869 }
else if ((new_len = new_word.length()) == 0 ||
874 for (i = 0; i < dawgs_.
size(); ++i) {
876 dawgs_[i]->word_in_dawg(new_word))
885 if (u_set.
han_sid() > 0)
return false;
887 if (u_set.
thai_sid() > 0)
return false;