tesseract
5.0.0-alpha-619-ge9db
|
#include <dict.h>
|
| Dict (CCUtil *image_ptr) |
|
| ~Dict () |
|
const CCUtil * | getCCUtil () const |
|
CCUtil * | getCCUtil () |
|
const UNICHARSET & | getUnicharset () const |
|
UNICHARSET & | getUnicharset () |
|
const UnicharAmbigs & | getUnicharAmbigs () const |
|
bool | compound_marker (UNICHAR_ID unichar_id) |
|
bool | is_apostrophe (UNICHAR_ID unichar_id) |
|
bool | hyphenated () const |
| Returns true if we've recorded the beginning of a hyphenated word. More...
|
|
int | hyphen_base_size () const |
| Size of the base word (the part on the line before) of a hyphenated word. More...
|
|
void | copy_hyphen_info (WERD_CHOICE *word) const |
|
bool | has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const |
| Check whether the word has a hyphen at the end. More...
|
|
bool | has_hyphen_end (const WERD_CHOICE &word) const |
| Same as above, but check the unichar at the end of the word. More...
|
|
void | reset_hyphen_vars (bool last_word_on_line) |
|
void | set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs) |
|
void | update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice) |
|
void | init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const |
|
void | default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const |
|
bool | NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings) |
|
void | ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) |
|
int | LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const |
| Returns the length of the shortest alpha run in WordChoice. More...
|
|
int | UniformCertainties (const WERD_CHOICE &word) |
|
bool | AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency) |
| Returns true if the given best_choice is good enough to stop. More...
|
|
bool | AcceptableResult (WERD_RES *word) const |
|
void | EndDangerousAmbigs () |
|
void | DebugWordChoices () |
| Prints the current choices for this word to stdout. More...
|
|
void | SettupStopperPass1 () |
| Sets up stopper variables in preparation for the first pass. More...
|
|
void | SettupStopperPass2 () |
| Sets up stopper variables in preparation for the second pass. More...
|
|
int | case_ok (const WERD_CHOICE &word) const |
| Check a string to see if it matches a set of lexical rules. More...
|
|
bool | absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset) |
|
void | SetupForLoad (DawgCache *dawg_cache) |
|
void | Load (const STRING &lang, TessdataManager *data_file) |
|
void | LoadLSTM (const STRING &lang, TessdataManager *data_file) |
|
bool | FinishLoad () |
|
void | End () |
|
void | ResetDocumentDictionary () |
|
int | def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const |
|
int | LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const |
| Calls letter_is_okay_ member function. More...
|
|
double | ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes) |
| Calls probability_in_context_ member function. More...
|
|
double | def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
| Default (no-op) implementation of probability in context function. More...
|
|
double | ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
|
float | ParamsModelClassify (const char *lang, void *path) |
|
float | CallParamsModelClassify (void *path) |
|
void | SetWildcardID (UNICHAR_ID id) |
|
UNICHAR_ID | WildcardID () const |
|
int | NumDawgs () const |
| Return the number of dawgs in the dawgs_ vector. More...
|
|
const Dawg * | GetDawg (int index) const |
| Return i-th dawg pointer recorded in the dawgs_ vector. More...
|
|
const Dawg * | GetPuncDawg () const |
| Return the pointer to the punctuation dawg. More...
|
|
const Dawg * | GetUnambigDawg () const |
| Return the pointer to the unambiguous words dawg. More...
|
|
UNICHAR_ID | char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const |
|
void | ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const |
|
int | valid_word (const WERD_CHOICE &word, bool numbers_ok) const |
|
int | valid_word (const WERD_CHOICE &word) const |
|
int | valid_word_or_number (const WERD_CHOICE &word) const |
|
int | valid_word (const char *string) const |
| This function is used by api/tesseract_cube_combiner.cpp. More...
|
|
bool | valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const |
|
bool | valid_punctuation (const WERD_CHOICE &word) |
|
int | good_choice (const WERD_CHOICE &choice) |
| Returns true if a good answer is found for the unknown blob rating. More...
|
|
void | add_document_word (const WERD_CHOICE &best_choice) |
| Adds a word found on this document to the document specific dictionary. More...
|
|
void | adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug) |
| Adjusts the rating of the given word. More...
|
|
void | SetWordsegRatingAdjustFactor (float f) |
| Set wordseg_rating_adjust_factor_ to the given value. More...
|
|
bool | IsSpaceDelimitedLang () const |
| Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai). More...
|
|
|
If the choice being composed so far could be a dictionary word, keep exploring choices.
|
WERD_CHOICE * | dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) |
|
void | go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) |
|
void | permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args) |
|
void | append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args) |
|
|
Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.
The given prev_char_frag_info contains:
- fragment: if not nullptr contains information about immediately preceding fragmented character choice
- num_fragments: number of fragments that have been used so far to construct a character
- certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
- rating: rating of the current choice or sum of fragment ratings concatenated so far
The output char_frag_info is filled in as follows:
- character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
- fragment,num_fragments,certainty,rating are set as described above
Returns: false if a non-matching fragment is discovered, true otherwise.
|
bool | fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) |
|
|
void(Dict::* | go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) |
| Pointer to go_deeper function. More...
|
|
int(Dict::* | letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const |
|
double(Dict::* | probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
| Probability in context function used by the ngram permuter. More...
|
|
float(Dict::* | params_model_classify_ )(const char *lang, void *path) |
|
char * | user_words_file = "" |
|
char * | user_words_suffix = "" |
|
char * | user_patterns_file = "" |
|
char * | user_patterns_suffix = "" |
|
bool | load_system_dawg = true |
|
bool | load_freq_dawg = true |
|
bool | load_unambig_dawg = true |
|
bool | load_punc_dawg = true |
|
bool | load_number_dawg = true |
|
bool | load_bigram_dawg = true |
|
double | xheight_penalty_subscripts = 0.125 |
|
double | xheight_penalty_inconsistent = 0.25 |
|
double | segment_penalty_dict_frequent_word = 1.0 |
|
double | segment_penalty_dict_case_ok = 1.1 |
|
double | segment_penalty_dict_case_bad = 1.3125 |
|
double | segment_penalty_dict_nonword = 1.25 |
|
double | segment_penalty_garbage = 1.50 |
|
char * | output_ambig_words_file = "" |
|
int | dawg_debug_level = 0 |
|
int | hyphen_debug_level = 0 |
|
bool | use_only_first_uft8_step = false |
|
double | certainty_scale = 20.0 |
|
double | stopper_nondict_certainty_base = -2.50 |
|
double | stopper_phase2_certainty_rejection_offset = 1.0 |
|
int | stopper_smallword_size = 2 |
|
double | stopper_certainty_per_char = -0.50 |
|
double | stopper_allowable_character_badness = 3.0 |
|
int | stopper_debug_level = 0 |
|
bool | stopper_no_acceptable_choices = false |
|
int | tessedit_truncate_wordchoice_log = 10 |
|
char * | word_to_debug = "" |
|
bool | segment_nonalphabetic_script = false |
|
bool | save_doc_words = 0 |
|
double | doc_dict_pending_threshold = 0.0 |
|
double | doc_dict_certainty_threshold = -2.25 |
|
int | max_permuter_attempts = 10000 |
|
Definition at line 91 of file dict.h.
◆ Dict()
tesseract::Dict::Dict |
( |
CCUtil * |
image_ptr | ) |
|
Definition at line 30 of file dict.cpp.
35 wildcard_unichar_id_(INVALID_UNICHAR_ID),
36 apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37 question_unichar_id_(INVALID_UNICHAR_ID),
38 slash_unichar_id_(INVALID_UNICHAR_ID),
39 hyphen_unichar_id_(INVALID_UNICHAR_ID),
43 "A suffix of user-provided words located in tessdata.",
46 "A filename of user-provided patterns.",
49 "A suffix of user-provided patterns located in "
59 "Load dawg with punctuation"
63 "Load dawg with number"
67 "Load dawg with special word "
71 "Score penalty (0.1 = 10%) added if there are subscripts "
72 "or superscripts in a word, but it is otherwise OK.",
75 "Score penalty (0.1 = 10%) added if an xheight is "
79 "Score multiplier for word matches which have good case and"
80 " are frequent in the given language (lower is better).",
83 "Score multiplier for word matches that have good case "
87 "Default score multiplier for word matches, which may have "
88 "case issues (lower is better).",
91 "Score multiplier for glyph fragment segmentations which "
92 "do not match a dictionary word (lower is better).",
95 "Score multiplier for poorly cased strings that are not in"
96 " the dictionary and generally look like garbage (lower is"
100 "Output file for ambiguities found in the dictionary",
103 "Set to 1 for general debug info"
104 ", to 2 for more details, to 3 to see all the debug messages",
109 "Use only the first UTF8 step of the given string"
110 " when computing log probabilities.",
115 "Certainty threshold for non-dict words",
118 "Reject certainty offset",
getCCUtil()->params()),
120 "Size of dict word to be treated as non-dict word",
124 " for each dict char above small word size.",
127 "Max certaintly variation allowed in a word (in sigma)",
132 "Make AcceptableChoice() always return false. Useful"
133 " when there is a need to explore all segmentations",
136 "Max words to keep in list",
getCCUtil()->params()),
138 "Word for which stopper debug"
139 " information should be printed to stdout",
142 "Don't use any alphabetic-specific tricks."
143 " Set to true in the traineddata config file for"
144 " scripts that are cursive or inherently fixed-pitch",
149 "Worst certainty for using pending dictionary",
152 "Worst certainty for words that can be inserted into the"
153 " document dictionary",
156 "Maximum number of different"
157 " character choices to consider during permutation."
158 " This limit is especially useful when user patterns"
159 " are specified, since overly generic patterns can result in"
160 " dawg search exploring an overly large number of options.",
162 reject_offset_ = 0.0;
164 hyphen_word_ =
nullptr;
165 last_word_on_line_ =
false;
166 document_words_ =
nullptr;
167 dawg_cache_ =
nullptr;
168 dawg_cache_is_ours_ =
false;
169 pending_words_ =
nullptr;
170 bigram_dawg_ =
nullptr;
171 freq_dawg_ =
nullptr;
172 punc_dawg_ =
nullptr;
173 unambig_dawg_ =
nullptr;
174 wordseg_rating_adjust_factor_ = -1.0f;
175 output_ambig_words_file_ =
nullptr;
◆ ~Dict()
tesseract::Dict::~Dict |
( |
| ) |
|
Definition at line 178 of file dict.cpp.
181 if (output_ambig_words_file_ !=
nullptr) fclose(output_ambig_words_file_);
◆ absolute_garbage()
Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).
Definition at line 80 of file context.cpp.
◆ AcceptableChoice()
Returns true if the given best_choice is good enough to stop.
Definition at line 56 of file stopper.cpp.
58 case XH_GOOD: xht =
"NORMAL";
break;
61 default: xht =
"UNKNOWN";
63 tprintf(
"\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
65 (is_valid_word ?
'y' :
'n'),
66 (is_case_ok ?
'y' :
'n'),
72 if (reject_offset_ <= 0.0f && !is_valid_word)
return false;
73 if (is_valid_word && is_case_ok) {
82 tprintf(
"Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
86 best_choice.
certainty() > CertaintyThreshold &&
92 tprintf(
"AcceptableChoice() returned false"
93 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
108 tprintf(
"\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
◆ AcceptableResult()
bool tesseract::Dict::AcceptableResult |
( |
WERD_RES * |
word | ) |
const |
Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.
Definition at line 116 of file stopper.cpp.
127 tprintf(
"Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
142 #if !defined(DISABLED_LEGACY_ENGINE)
146 bool fix_replaceable,
149 tprintf(
"\nRunning NoDangerousAmbig() for %s\n",
◆ add_document_word()
void tesseract::Dict::add_document_word |
( |
const WERD_CHOICE & |
best_choice | ) |
|
Adds a word found on this document to the document specific dictionary.
Definition at line 644 of file dict.cpp.
651 if (hyphen_word_)
return;
653 int stringlen = best_choice.
length();
655 if (
valid_word(best_choice) || stringlen < 2)
return;
658 if (best_choice.
length() >= kDocDictMaxRepChars) {
659 int num_rep_chars = 1;
661 for (
int i = 1; i < best_choice.
length(); ++i) {
667 if (num_rep_chars == kDocDictMaxRepChars)
return;
690 FILE* doc_word_file = fopen(filename.c_str(),
"a");
691 if (doc_word_file ==
nullptr) {
692 tprintf(
"Error: Could not open file %s\n", filename.c_str());
696 fclose(doc_word_file);
◆ adjust_word()
void tesseract::Dict::adjust_word |
( |
WERD_CHOICE * |
word, |
|
|
bool |
nonword, |
|
|
XHeightConsistencyEnum |
xheight_consistency, |
|
|
float |
additional_adjust, |
|
|
bool |
modify_rating, |
|
|
bool |
debug |
|
) |
| |
Adjusts the rating of the given word.
Definition at line 701 of file dict.cpp.
707 bool case_is_ok = (is_han ||
case_ok(*word));
710 float adjust_factor = additional_adjust;
711 float new_rating = word->
rating();
712 new_rating += kRatingPad;
713 const char* xheight_triggered =
"";
716 switch (xheight_consistency) {
719 xheight_triggered =
", xhtBAD";
723 xheight_triggered =
", xhtSUB";
733 tprintf(
"Consistency could not be calculated.\n");
737 tprintf(
"%sWord: %s %4.2f%s", nonword ?
"Non-" :
"",
742 if (case_is_ok && punc_is_ok) {
744 new_rating *= adjust_factor;
748 new_rating *= adjust_factor;
750 if (!case_is_ok)
tprintf(
", C");
751 if (!punc_is_ok)
tprintf(
", P");
756 if (!is_han && freq_dawg_ !=
nullptr && freq_dawg_->
word_in_dawg(*word)) {
759 new_rating *= adjust_factor;
763 new_rating *= adjust_factor;
768 new_rating *= adjust_factor;
772 new_rating -= kRatingPad;
773 if (modify_rating) word->
set_rating(new_rating);
774 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
◆ append_choices()
void tesseract::Dict::append_choices |
( |
const char * |
debug, |
|
|
const BLOB_CHOICE_LIST_VECTOR & |
char_choices, |
|
|
const BLOB_CHOICE & |
blob_choice, |
|
|
int |
char_choice_index, |
|
|
const CHAR_FRAGMENT_INFO * |
prev_char_frag_info, |
|
|
WERD_CHOICE * |
word, |
|
|
float |
certainties[], |
|
|
float * |
limit, |
|
|
WERD_CHOICE * |
best_choice, |
|
|
int * |
attempts_left, |
|
|
void * |
more_args |
|
) |
| |
append_choices
Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.
This function assumes that Dict::go_deeper_fxn_ is set.
Definition at line 253 of file permdawg.cpp.
260 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
262 &char_frag_info, word, certainties, limit,
263 best_choice, attempts_left, more_args);
268 float old_rating = word->
rating();
270 uint8_t old_permuter = word->
permuter();
271 certainties[word->
length()] = char_frag_info.certainty;
273 char_frag_info.unichar_id, char_frag_info.num_fragments,
274 char_frag_info.rating, char_frag_info.certainty);
278 &char_frag_info, word_ending, word, certainties,
279 limit, best_choice, attempts_left, more_args);
◆ CallParamsModelClassify()
float tesseract::Dict::CallParamsModelClassify |
( |
void * |
path | ) |
|
|
inline |
◆ case_ok()
int tesseract::Dict::case_ok |
( |
const WERD_CHOICE & |
word | ) |
const |
Check a string to see if it matches a set of lexical rules.
Definition at line 61 of file context.cpp.
66 if (word.
length() < kMinAbsoluteGarbageWordLength)
return false;
68 for (
int x = 0; x < word.
length(); ++x) {
69 num_alphanum += (unicharset.get_isalpha(word.
unichar_id(x)) ||
72 return (static_cast<float>(num_alphanum) /
73 static_cast<float>(word.
length()) < kMinAbsoluteGarbageAlphanumFrac);
◆ char_for_dawg()
Definition at line 448 of file dict.h.
450 if (!dawg)
return ch;
451 switch (dawg->type()) {
◆ compound_marker()
bool tesseract::Dict::compound_marker |
( |
UNICHAR_ID |
unichar_id | ) |
|
|
inline |
Definition at line 113 of file dict.h.
118 return normed_ids.
size() == 1 &&
119 (normed_ids[0] == hyphen_unichar_id_ ||
120 normed_ids[0] == slash_unichar_id_);
◆ copy_hyphen_info()
void tesseract::Dict::copy_hyphen_info |
( |
WERD_CHOICE * |
word | ) |
const |
|
inline |
If this word is hyphenated, copy the base word (the part on the line before) of the hyphenated word into the given word. This function assumes that word is not nullptr.
Definition at line 145 of file dict.h.
147 *word = *hyphen_word_;
◆ dawg_permute_and_select()
Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.
Allocate and return a WERD_CHOICE with the best valid word found.
dawg_permute_and_select
Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.
Allocate and return a WERD_CHOICE with the best valid word found.
Definition at line 182 of file permdawg.cpp.
184 char_choices, 0,
nullptr, &word, certainties, &rating_limit, best_choice,
185 &attempts_left, &dawg_args);
186 delete[] active_dawgs;
199 int char_choice_index,
◆ DebugWordChoices()
void tesseract::Dict::DebugWordChoices |
( |
| ) |
|
Prints the current choices for this word to stdout.
◆ def_letter_is_okay()
int tesseract::Dict::def_letter_is_okay |
( |
void * |
void_dawg_args, |
|
|
const UNICHARSET & |
unicharset, |
|
|
UNICHAR_ID |
unichar_id, |
|
|
bool |
word_end |
|
) |
| const |
Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.
The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).
Input: At word_index 0, dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg not reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0, the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not nullptr.
Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.
Definition at line 395 of file dict.cpp.
397 auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
403 "def_letter_is_okay: current unichar=%s word_end=%d"
404 " num active dawgs=%d\n",
406 dawg_args->active_dawgs->
size());
413 unichar_id == INVALID_UNICHAR_ID) {
420 dawg_args->updated_dawgs->clear();
421 dawg_args->valid_end =
false;
426 for (
int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
427 const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428 const Dawg* punc_dawg =
429 pos.punc_index >= 0 ? dawgs_[pos.punc_index] :
nullptr;
430 const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] :
nullptr;
432 if (!dawg && !punc_dawg) {
434 tprintf(
"Received DawgPosition with no dawg or punc_dawg. wth?\n");
442 if (punc_transition_edge != NO_EDGE) {
445 for (
int s = 0; s < slist.size(); ++s) {
446 int sdawg_index = slist[s];
447 const Dawg* sdawg = dawgs_[sdawg_index];
449 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
450 if (dawg_edge != NO_EDGE) {
452 tprintf(
"Letter found in dawg %d\n", sdawg_index);
454 dawg_args->updated_dawgs->add_unique(
455 DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
456 punc_transition_edge,
false),
458 "Append transition from punc dawg to current dawgs: ");
459 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
460 if (sdawg->end_of_word(dawg_edge) &&
461 punc_dawg->end_of_word(punc_transition_edge))
462 dawg_args->valid_end =
true;
467 punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
468 if (punc_edge != NO_EDGE) {
470 tprintf(
"Letter found in punctuation dawg\n");
472 dawg_args->updated_dawgs->add_unique(
473 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge,
false),
476 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end =
true;
481 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
488 : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
489 if (punc_edge != NO_EDGE) {
490 dawg_args->updated_dawgs->add_unique(
491 DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
494 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
495 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end =
true;
499 if (pos.back_to_punc)
continue;
517 : dawg->edge_char_of(
518 node,
char_for_dawg(unicharset, unichar_id, dawg), word_end);
522 pos.dawg_index, node, edge);
525 if (edge != NO_EDGE) {
527 tprintf(
"Letter found in dawg %d\n", pos.dawg_index);
529 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
531 tprintf(
"Punctuation constraint not satisfied at end of word.\n");
535 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
536 if (dawg->end_of_word(edge) &&
537 (punc_dawg ==
nullptr || punc_dawg->end_of_word(pos.punc_ref)))
538 dawg_args->valid_end =
true;
539 dawg_args->updated_dawgs->add_unique(
540 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
543 "Append current dawg to updated active dawgs: ");
552 dawg_args->permuter = curr_perm;
555 tprintf(
"Returning %d for permuter code for this character.\n",
556 dawg_args->permuter);
558 return dawg_args->permuter;
◆ def_probability_in_context()
double tesseract::Dict::def_probability_in_context |
( |
const char * |
lang, |
|
|
const char * |
context, |
|
|
int |
context_bytes, |
|
|
const char * |
character, |
|
|
int |
character_bytes |
|
) |
| |
|
inline |
Default (no-op) implementation of probability in context function.
Definition at line 401 of file dict.h.
408 (void)character_bytes;
◆ default_dawgs()
void tesseract::Dict::default_dawgs |
( |
DawgPositionVector * |
anylength_dawgs, |
|
|
bool |
suppress_patterns |
|
) |
| const |
Definition at line 617 of file dict.cpp.
619 bool punc_dawg_available =
620 (punc_dawg_ !=
nullptr) &&
623 for (
int i = 0; i < dawgs_.
size(); i++) {
624 if (dawgs_[i] !=
nullptr &&
626 int dawg_ty = dawgs_[i]->type();
629 *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE,
false);
634 }
else if (!punc_dawg_available || !subsumed_by_punc) {
635 *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE,
false);
◆ End()
void tesseract::Dict::End |
( |
| ) |
|
Definition at line 372 of file dict.cpp.
373 if (dawgs_.
size() == 0)
return;
374 for (
int i = 0; i < dawgs_.
size(); i++) {
375 if (!dawg_cache_->
FreeDawg(dawgs_[i])) {
379 dawg_cache_->
FreeDawg(bigram_dawg_);
380 if (dawg_cache_is_ours_) {
382 dawg_cache_ =
nullptr;
387 document_words_ =
nullptr;
388 delete pending_words_;
389 pending_words_ =
nullptr;
◆ EndDangerousAmbigs()
void tesseract::Dict::EndDangerousAmbigs |
( |
| ) |
|
◆ FinishLoad()
bool tesseract::Dict::FinishLoad |
( |
| ) |
|
Definition at line 351 of file dict.cpp.
352 if (dawgs_.
empty())
return false;
357 for (
int i = 0; i < dawgs_.
size(); ++i) {
358 const Dawg* dawg = dawgs_[i];
360 for (
int j = 0; j < dawgs_.
size(); ++j) {
361 const Dawg* other = dawgs_[j];
362 if (dawg !=
nullptr && other !=
nullptr &&
363 (dawg->lang() == other->lang()) &&
364 kDawgSuccessors[dawg->type()][other->type()])
◆ fragment_state_okay()
bool tesseract::Dict::fragment_state_okay |
( |
UNICHAR_ID |
curr_unichar_id, |
|
|
float |
curr_rating, |
|
|
float |
curr_certainty, |
|
|
const CHAR_FRAGMENT_INFO * |
prev_char_frag_info, |
|
|
const char * |
debug, |
|
|
int |
word_ending, |
|
|
CHAR_FRAGMENT_INFO * |
char_frag_info |
|
) |
| |
Definition at line 328 of file permdawg.cpp.
329 tprintf(
"prev_fragment %s\n", prev_fragment->to_string().c_str());
332 tprintf(
"this_fragment %s\n", this_fragment->to_string().c_str());
337 char_frag_info->
fragment = this_fragment;
338 char_frag_info->
rating = curr_rating;
339 char_frag_info->
certainty = curr_certainty;
341 if (prev_fragment && !this_fragment) {
342 if (debug)
tprintf(
"Skip choice with incomplete fragment\n");
347 char_frag_info->
unichar_id = INVALID_UNICHAR_ID;
349 if (!this_fragment->is_continuation_of(prev_fragment)) {
350 if (debug)
tprintf(
"Non-matching fragment piece\n");
353 if (this_fragment->is_ending()) {
358 tprintf(
"Built character %s from fragments\n",
363 if (debug)
tprintf(
"Record fragment continuation\n");
364 char_frag_info->
fragment = this_fragment;
368 prev_char_frag_info->
rating + curr_rating;
371 std::min(curr_certainty, prev_char_frag_info->
certainty);
373 if (this_fragment->is_beginning()) {
374 if (debug)
tprintf(
"Record fragment beginning\n");
377 tprintf(
"Non-starting fragment piece with no prev_fragment\n");
383 if (word_ending && char_frag_info->
fragment) {
384 if (debug)
tprintf(
"Word can not end with a fragment\n");
◆ getCCUtil() [1/2]
CCUtil* tesseract::Dict::getCCUtil |
( |
| ) |
|
|
inline |
◆ getCCUtil() [2/2]
const CCUtil* tesseract::Dict::getCCUtil |
( |
| ) |
const |
|
inline |
◆ GetDawg()
const Dawg* tesseract::Dict::GetDawg |
( |
int |
index | ) |
const |
|
inline |
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition at line 432 of file dict.h.
432 {
return dawgs_[index]; }
◆ GetPuncDawg()
const Dawg* tesseract::Dict::GetPuncDawg |
( |
| ) |
const |
|
inline |
Return the pointer to the punctuation dawg.
Definition at line 434 of file dict.h.
434 {
return punc_dawg_; }
◆ GetStartingNode()
Returns the appropriate next node given the EDGE_REF.
Definition at line 438 of file dict.h.
439 if (edge_ref == NO_EDGE)
return 0;
440 NODE_REF node = dawg->next_node(edge_ref);
441 if (node == 0) node = NO_EDGE;
◆ GetUnambigDawg()
const Dawg* tesseract::Dict::GetUnambigDawg |
( |
| ) |
const |
|
inline |
Return the pointer to the unambiguous words dawg.
Definition at line 436 of file dict.h.
436 {
return unambig_dawg_; }
◆ getUnicharAmbigs()
◆ getUnicharset() [1/2]
◆ getUnicharset() [2/2]
const UNICHARSET& tesseract::Dict::getUnicharset |
( |
| ) |
const |
|
inline |
◆ GlobalDawgCache()
DawgCache * tesseract::Dict::GlobalDawgCache |
( |
| ) |
|
|
static |
Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.
Definition at line 184 of file dict.cpp.
187 static DawgCache cache;
◆ go_deeper_dawg_fxn()
void tesseract::Dict::go_deeper_dawg_fxn |
( |
const char * |
debug, |
|
|
const BLOB_CHOICE_LIST_VECTOR & |
char_choices, |
|
|
int |
char_choice_index, |
|
|
const CHAR_FRAGMENT_INFO * |
prev_char_frag_info, |
|
|
bool |
word_ending, |
|
|
WERD_CHOICE * |
word, |
|
|
float |
certainties[], |
|
|
float * |
limit, |
|
|
WERD_CHOICE * |
best_choice, |
|
|
int * |
attempts_left, |
|
|
void * |
void_more_args |
|
) |
| |
If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.
Definition at line 58 of file permdawg.cpp.
60 tprintf(
"checking unigrams in an ngram %s\n",
70 bool unigrams_ok =
true;
72 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73 DawgPositionVector unigram_updated_dawgs;
74 DawgArgs unigram_dawg_args(&unigram_active_dawgs,
75 &unigram_updated_dawgs,
78 for (
int i = 0; unigrams_ok && i < encoding.
size(); ++i) {
86 word_ending && i == encoding.
size() - 1);
87 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
91 unigrams_ok ?
"OK" :
"not OK");
98 checked_unigrams =
true;
99 more_args->permuter = unigram_dawg_args.permuter;
100 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
115 if (output_ambig_words_file_ ==
nullptr) {
116 output_ambig_words_file_ =
118 if (output_ambig_words_file_ ==
nullptr) {
119 tprintf(
"Failed to open output_ambig_words_file %s\n",
126 fprintf(output_ambig_words_file_,
"%s", word_str.
c_str());
131 fprintf(output_ambig_words_file_,
"%s", word_str.
c_str());
139 ++(more_args->updated_dawgs);
141 ++(more_args->active_dawgs);
143 prev_char_frag_info, word, certainties, limit,
144 best_choice, attempts_left, more_args);
146 --(more_args->updated_dawgs);
147 --(more_args->active_dawgs);
151 tprintf(
"last unichar not OK at index %d in %s\n",
◆ good_choice()
int tesseract::Dict::good_choice |
( |
const WERD_CHOICE & |
choice | ) |
|
Returns true if a good answer is found for the unknown blob rating.
◆ has_hyphen_end() [1/2]
bool tesseract::Dict::has_hyphen_end |
( |
const UNICHARSET * |
unicharset, |
|
|
UNICHAR_ID |
unichar_id, |
|
|
bool |
first_pos |
|
) |
| const |
|
inline |
Check whether the word has a hyphen at the end.
Definition at line 152 of file dict.h.
154 if (!last_word_on_line_ || first_pos)
159 return normed_ids.
size() == 1 && normed_ids[0] == hyphen_unichar_id_;
◆ has_hyphen_end() [2/2]
bool tesseract::Dict::has_hyphen_end |
( |
const WERD_CHOICE & |
word | ) |
const |
|
inline |
Same as above, but check the unichar at the end of the word.
Definition at line 162 of file dict.h.
163 int word_index = word.
length() - 1;
◆ hyphen_base_size()
int tesseract::Dict::hyphen_base_size |
( |
| ) |
const |
|
inline |
Size of the base word (the part on the line before) of a hyphenated word.
Definition at line 139 of file dict.h.
◆ hyphenated()
bool tesseract::Dict::hyphenated |
( |
| ) |
const |
|
inline |
Returns true if we've recorded the beginning of a hyphenated word.
Definition at line 135 of file dict.h.
136 !last_word_on_line_ && hyphen_word_;
◆ init_active_dawgs()
void tesseract::Dict::init_active_dawgs |
( |
DawgPositionVector * |
active_dawgs, |
|
|
bool |
ambigs_mode |
|
) |
| const |
Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.
Definition at line 600 of file dict.cpp.
604 *active_dawgs = hyphen_active_dawgs_;
606 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
608 hyphen_active_dawgs_[i].dawg_index,
609 hyphen_active_dawgs_[i].dawg_ref);
◆ is_apostrophe()
bool tesseract::Dict::is_apostrophe |
( |
UNICHAR_ID |
unichar_id | ) |
|
|
inline |
Definition at line 124 of file dict.h.
129 return normed_ids.
size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
◆ IsSpaceDelimitedLang()
bool tesseract::Dict::IsSpaceDelimitedLang |
( |
| ) |
const |
Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).
Definition at line 883 of file dict.cpp.
885 if (u_set.
han_sid() > 0)
return false;
887 if (u_set.
thai_sid() > 0)
return false;
◆ LengthOfShortestAlphaRun()
int tesseract::Dict::LengthOfShortestAlphaRun |
( |
const WERD_CHOICE & |
WordChoice | ) |
const |
Returns the length of the shortest alpha run in WordChoice.
Definition at line 460 of file stopper.cpp.
467 float WorstCertainty = FLT_MAX;
468 float CertaintyThreshold;
469 double TotalCertainty;
470 double TotalCertaintySquared;
473 int word_length = word.length();
◆ LetterIsOkay()
int tesseract::Dict::LetterIsOkay |
( |
void * |
void_dawg_args, |
|
|
const UNICHARSET & |
unicharset, |
|
|
UNICHAR_ID |
unichar_id, |
|
|
bool |
word_end |
|
) |
| const |
|
inline |
Calls letter_is_okay_ member function.
Definition at line 376 of file dict.h.
379 unicharset, unichar_id, word_end);
◆ Load()
Definition at line 210 of file dict.cpp.
215 if (punc_dawg_) dawgs_ += punc_dawg_;
220 if (system_dawg) dawgs_ += system_dawg;
225 if (number_dawg) dawgs_ += number_dawg;
236 if (freq_dawg_) dawgs_ += freq_dawg_;
241 if (unambig_dawg_) dawgs_ += unambig_dawg_;
283 dawgs_ += document_words_;
◆ LoadLSTM()
Definition at line 291 of file dict.cpp.
296 if (punc_dawg_) dawgs_ += punc_dawg_;
301 if (system_dawg) dawgs_ += system_dawg;
306 if (number_dawg) dawgs_ += number_dawg;
◆ ngram_probability_in_context()
double tesseract::Dict::ngram_probability_in_context |
( |
const char * |
lang, |
|
|
const char * |
context, |
|
|
int |
context_bytes, |
|
|
const char * |
character, |
|
|
int |
character_bytes |
|
) |
| |
◆ NoDangerousAmbig()
bool tesseract::Dict::NoDangerousAmbig |
( |
WERD_CHOICE * |
BestChoice, |
|
|
DANGERR * |
fixpt, |
|
|
bool |
fix_replaceable, |
|
|
MATRIX * |
ratings |
|
) |
| |
Definition at line 158 of file stopper.cpp.
174 bool replace = (fix_replaceable && pass == 0);
182 for (i = 0; i < best_choice->length(); ++i) {
183 auto *lst =
new BLOB_CHOICE_LIST();
184 BLOB_CHOICE_IT lst_it(lst);
186 lst_it.add_to_end(
new BLOB_CHOICE(best_choice->unichar_id(i),
188 ambig_blob_choices.push_back(lst);
192 int wrong_ngram_index;
195 for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
197 UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
199 tprintf(
"Looking for %s ngrams starting with %s:\n",
200 replace ?
"replaceable" :
"ambiguous",
203 int num_wrong_blobs = best_choice->state(i);
204 wrong_ngram_index = 0;
205 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
206 if (curr_unichar_id == INVALID_UNICHAR_ID ||
207 curr_unichar_id >= table.size() ||
208 table[curr_unichar_id] ==
nullptr) {
211 AmbigSpec_IT spec_it(table[curr_unichar_id]);
212 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
213 const AmbigSpec *ambig_spec = spec_it.data();
214 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
216 ambig_spec->wrong_ngram);
220 tprintf(
"current ngram from spec: ");
222 tprintf(
"comparison result: %d\n", compare);
226 if (fixpt !=
nullptr) {
227 UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
229 blob_index, blob_index + num_wrong_blobs, replace,
233 tprintf(
"fixpt+=(%d %d %d %d %s)\n", blob_index,
234 blob_index + num_wrong_blobs,
false,
236 ambig_spec->correct_ngram_id),
243 tprintf(
"replace ambiguity with %s : ",
245 ambig_spec->correct_ngram_id));
250 ambig_spec->correct_ngram_id,
251 best_choice, ratings);
252 }
else if (i > 0 || ambig_spec->type !=
CASE_AMBIG) {
260 for (
int tmp_index = 0; tmp_index <= wrong_ngram_index;
269 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
271 ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
276 }
else if (compare == -1) {
277 if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
278 ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
281 wrong_ngram[++wrong_ngram_index] =
282 best_choice->unichar_id(next_index);
283 num_wrong_blobs += best_choice->state(next_index);
298 tprintf(
"\nResulting ambig_blob_choices:\n");
299 for (i = 0; i < ambig_blob_choices.size(); ++i) {
305 ambigs_found = (alt_word->
rating() < 0.0);
308 tprintf (
"Stopper: Possible ambiguous word = %s\n",
311 if (fixpt !=
nullptr) {
317 for (i = 0; i < alt_word->
length(); ++i) {
319 bool replacement_is_ngram =
322 if (replacement_is_ngram) {
325 int step = uchset.
step(str);
328 int end_i = orig_i + alt_word->
state(i);
329 if (alt_word->
state(i) > 1 ||
330 (orig_i + 1 == end_i && replacement_is_ngram)) {
333 for (
int j = 0; j < orig_i; ++j)
334 blob_start += best_choice->state(j);
335 int blob_end = blob_start;
336 for (
int j = orig_i; j < end_i; ++j)
337 blob_end += best_choice->state(j);
339 replacement_is_ngram, leftmost_id));
341 tprintf(
"fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
342 true, replacement_is_ngram,
346 orig_i += alt_word->
state(i);
352 if (output_ambig_words_file_ !=
nullptr) {
353 fprintf(output_ambig_words_file_,
"\n");
356 ambig_blob_choices.delete_data_pointers();
357 return !ambigs_found;
362 #endif // !defined(DISABLED_LEGACY_ENGINE)
365 reject_offset_ = 0.0;
◆ NumDawgs()
int tesseract::Dict::NumDawgs |
( |
| ) |
const |
|
inline |
Return the number of dawgs in the dawgs_ vector.
Definition at line 430 of file dict.h.
430 {
return dawgs_.
size(); }
◆ ParamsModelClassify()
float tesseract::Dict::ParamsModelClassify |
( |
const char * |
lang, |
|
|
void * |
path |
|
) |
| |
◆ permute_choices()
permute_choices
Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.
Definition at line 211 of file permdawg.cpp.
214 BLOB_CHOICE_IT blob_choice_it;
215 blob_choice_it.set_to_list(char_choices.
get(char_choice_index));
216 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
217 blob_choice_it.forward()) {
220 char_choice_index, prev_char_frag_info, word,
221 certainties, limit, best_choice, attempts_left, more_args);
222 if (*attempts_left <= 0) {
223 if (debug)
tprintf(
"permute_choices(): attempts_left is 0\n");
242 int char_choice_index,
◆ ProbabilityInContext()
double tesseract::Dict::ProbabilityInContext |
( |
const char * |
context, |
|
|
int |
context_bytes, |
|
|
const char * |
character, |
|
|
int |
character_bytes |
|
) |
| |
|
inline |
Calls probability_in_context_ member function.
Definition at line 390 of file dict.h.
396 context, context_bytes,
◆ ProcessPatternEdges()
For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.
Definition at line 561 of file dict.cpp.
569 unichar_id_patterns.
push_back(unichar_id);
571 &unichar_id_patterns);
572 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
575 for (
int k = 0; k < 2; ++k) {
577 (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
578 : dawg->pattern_loop_edge(pos.dawg_ref,
579 unichar_id_patterns[i], word_end);
580 if (edge == NO_EDGE)
continue;
583 pos.dawg_index, node, edge);
584 tprintf(
"Letter found in pattern dawg %d\n", pos.dawg_index);
586 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
587 if (dawg->end_of_word(edge)) dawg_args->valid_end =
true;
588 dawg_args->updated_dawgs->add_unique(
589 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
592 "Append current dawg to updated active dawgs: ");
◆ ReplaceAmbig()
void tesseract::Dict::ReplaceAmbig |
( |
int |
wrong_ngram_begin_index, |
|
|
int |
wrong_ngram_size, |
|
|
UNICHAR_ID |
correct_ngram_id, |
|
|
WERD_CHOICE * |
werd_choice, |
|
|
MATRIX * |
ratings |
|
) |
| |
Definition at line 386 of file stopper.cpp.
396 begin_blob_index += werd_choice->
state(i);
399 new_certainty /= wrong_ngram_size;
402 begin_blob_index + num_blobs_to_replace - 1);
403 if (!coord.Valid(*ratings)) {
406 if (ratings->
get(coord.col, coord.row) ==
nullptr)
407 ratings->
put(coord.col, coord.row,
new BLOB_CHOICE_LIST);
408 BLOB_CHOICE_LIST* new_choices = ratings->
get(coord.col, coord.row);
410 if (choice !=
nullptr) {
412 if (new_rating < choice->rating())
414 if (new_certainty < choice->certainty())
425 BLOB_CHOICE_IT it (new_choices);
426 it.add_to_end(choice);
430 for (
int replaced_count = 0; replaced_count < wrong_ngram_size;
432 if (replaced_count + 1 == wrong_ngram_size) {
434 num_blobs_to_replace, choice);
440 werd_choice->
print(
"ReplaceAmbig() ");
441 tprintf(
"Modified blob_choices: ");
447 int shortest = INT32_MAX;
449 for (
int w = 0; w < WordChoice.
length(); ++w) {
452 }
else if (curr_len > 0) {
453 if (curr_len < shortest) shortest = curr_len;
457 if (curr_len > 0 && curr_len < shortest) {
◆ reset_hyphen_vars()
void tesseract::Dict::reset_hyphen_vars |
( |
bool |
last_word_on_line | ) |
|
Unless the previous word was the last one on the line, and the current one is not (thus it is the first one on the line), erase hyphen_word_, clear hyphen_active_dawgs_, update last_word_on_line_.
Definition at line 42 of file hyphen.cpp.
46 if (hyphen_word_ ==
nullptr) {
50 if (hyphen_word_->
rating() > word.rating()) {
55 hyphen_active_dawgs_ = active_dawgs;
◆ ResetDocumentDictionary()
void tesseract::Dict::ResetDocumentDictionary |
( |
| ) |
|
|
inline |
Definition at line 326 of file dict.h.
327 if (pending_words_ !=
nullptr)
328 pending_words_->
clear();
329 if (document_words_ !=
nullptr)
330 document_words_->
clear();
◆ set_hyphen_word()
Update hyphen_word_, and copy the given DawgPositionVectors into hyphen_active_dawgs_ .
Definition at line 59 of file hyphen.cpp.
◆ SettupStopperPass1()
void tesseract::Dict::SettupStopperPass1 |
( |
| ) |
|
Sets up stopper variables in preparation for the first pass.
Definition at line 378 of file stopper.cpp.
◆ SettupStopperPass2()
void tesseract::Dict::SettupStopperPass2 |
( |
| ) |
|
Sets up stopper variables in preparation for the second pass.
Definition at line 382 of file stopper.cpp.
384 if (i >= wrong_ngram_begin_index) {
◆ SetupForLoad()
void tesseract::Dict::SetupForLoad |
( |
DawgCache * |
dawg_cache | ) |
|
Definition at line 192 of file dict.cpp.
193 if (dawgs_.
size() != 0) this->
End();
200 if (dawg_cache !=
nullptr) {
201 dawg_cache_ = dawg_cache;
202 dawg_cache_is_ours_ =
false;
204 dawg_cache_ =
new DawgCache();
205 dawg_cache_is_ours_ =
true;
◆ SetWildcardID()
void tesseract::Dict::SetWildcardID |
( |
UNICHAR_ID |
id | ) |
|
|
inline |
Definition at line 427 of file dict.h.
427 { wildcard_unichar_id_ = id; }
◆ SetWordsegRatingAdjustFactor()
void tesseract::Dict::SetWordsegRatingAdjustFactor |
( |
float |
f | ) |
|
|
inline |
Set wordseg_rating_adjust_factor_ to the given value.
Definition at line 510 of file dict.h.
511 wordseg_rating_adjust_factor_ = f;
◆ UniformCertainties()
int tesseract::Dict::UniformCertainties |
( |
const WERD_CHOICE & |
word | ) |
|
Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.
Definition at line 479 of file stopper.cpp.
481 TotalCertainty += Certainty;
482 TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483 if (Certainty < WorstCertainty)
484 WorstCertainty = Certainty;
489 TotalCertainty -= WorstCertainty;
490 TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
492 Mean = TotalCertainty / word_length;
493 Variance = ((word_length * TotalCertaintySquared -
494 TotalCertainty * TotalCertainty) /
495 (word_length * (word_length - 1)));
498 StdDev = sqrt(Variance);
504 if (word.
certainty() < CertaintyThreshold) {
506 tprintf(
"Stopper: Non-uniform certainty = %4.1f"
507 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
◆ update_best_choice()
Copies word into best_choice if its rating is smaller than that of best_choice.
Definition at line 182 of file dict.h.
◆ valid_bigram()
Definition at line 813 of file dict.cpp.
815 if (bigram_dawg_ ==
nullptr)
return false;
819 int w1start, w1end, w2start, w2end;
825 if (w1start >= w1end)
return word1.
length() < 3;
826 if (w2start >= w2end)
return word2.
length() < 3;
830 bigram_string.
reserve(w1end + w2end + 1);
831 for (
int i = w1start; i < w1end; i++) {
835 bigram_string.
push_back(question_unichar_id_);
837 bigram_string += normed_ids;
840 for (
int i = w2start; i < w2end; i++) {
844 bigram_string.
push_back(question_unichar_id_);
846 bigram_string += normed_ids;
849 for (
int i = 0; i < bigram_string.
size(); ++i) {
◆ valid_punctuation()
bool tesseract::Dict::valid_punctuation |
( |
const WERD_CHOICE & |
word | ) |
|
Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).
Definition at line 856 of file dict.cpp.
860 int last_index = word.
length() - 1;
862 for (i = 0; i <= last_index; ++i) {
865 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
869 }
else if ((new_len = new_word.length()) == 0 ||
874 for (i = 0; i < dawgs_.
size(); ++i) {
876 dawgs_[i]->word_in_dawg(new_word))
◆ valid_word() [1/3]
int tesseract::Dict::valid_word |
( |
const char * |
string | ) |
const |
|
inline |
This function is used by api/tesseract_cube_combiner.cpp.
Definition at line 488 of file dict.h.
◆ valid_word() [2/3]
int tesseract::Dict::valid_word |
( |
const WERD_CHOICE & |
word | ) |
const |
|
inline |
◆ valid_word() [3/3]
int tesseract::Dict::valid_word |
( |
const WERD_CHOICE & |
word, |
|
|
bool |
numbers_ok |
|
) |
| const |
Definition at line 778 of file dict.cpp.
784 word_ptr = &temp_word;
789 auto* active_dawgs =
new DawgPositionVector[2];
791 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]),
NO_PERM);
792 int last_index = word_ptr->
length() - 1;
799 if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
800 dawg_args.updated_dawgs = &(active_dawgs[0]);
801 ++(dawg_args.active_dawgs);
803 ++(dawg_args.updated_dawgs);
804 dawg_args.active_dawgs = &(active_dawgs[0]);
807 delete[] active_dawgs;
◆ valid_word_or_number()
int tesseract::Dict::valid_word_or_number |
( |
const WERD_CHOICE & |
word | ) |
const |
|
inline |
◆ valid_word_permuter()
static bool tesseract::Dict::valid_word_permuter |
( |
uint8_t |
perm, |
|
|
bool |
numbers_ok |
|
) |
| |
|
inlinestatic |
Check all the DAWGs to see if this word is in any of them.
Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).
Definition at line 474 of file dict.h.
◆ WildcardID()
Definition at line 428 of file dict.h.
428 {
return wildcard_unichar_id_; }
◆ certainty_scale
double tesseract::Dict::certainty_scale = 20.0 |
"Certainty scaling factor"
Definition at line 627 of file dict.h.
◆ dawg_debug_level
int tesseract::Dict::dawg_debug_level = 0 |
"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"
Definition at line 622 of file dict.h.
◆ doc_dict_certainty_threshold
double tesseract::Dict::doc_dict_certainty_threshold = -2.25 |
"Worst certainty" " for words that can be inserted into the document dictionary"
Definition at line 653 of file dict.h.
◆ doc_dict_pending_threshold
double tesseract::Dict::doc_dict_pending_threshold = 0.0 |
"Worst certainty for using pending dictionary"
Definition at line 651 of file dict.h.
◆ go_deeper_fxn_
void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) |
Pointer to go_deeper function.
Definition at line 216 of file dict.h.
◆ hyphen_debug_level
int tesseract::Dict::hyphen_debug_level = 0 |
"Debug level for hyphenated words."
Definition at line 623 of file dict.h.
◆ letter_is_okay_
int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const |
◆ load_bigram_dawg
bool tesseract::Dict::load_bigram_dawg = true |
"Load dawg with special word bigrams."
Definition at line 592 of file dict.h.
◆ load_freq_dawg
bool tesseract::Dict::load_freq_dawg = true |
"Load frequent word dawg."
Definition at line 586 of file dict.h.
◆ load_number_dawg
bool tesseract::Dict::load_number_dawg = true |
"Load dawg with number patterns."
Definition at line 590 of file dict.h.
◆ load_punc_dawg
bool tesseract::Dict::load_punc_dawg = true |
"Load dawg with punctuation patterns."
Definition at line 589 of file dict.h.
◆ load_system_dawg
bool tesseract::Dict::load_system_dawg = true |
"Load system word dawg."
Definition at line 585 of file dict.h.
◆ load_unambig_dawg
bool tesseract::Dict::load_unambig_dawg = true |
"Load unambiguous word dawg."
Definition at line 587 of file dict.h.
◆ max_permuter_attempts
int tesseract::Dict::max_permuter_attempts = 10000 |
"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."
Definition at line 658 of file dict.h.
◆ output_ambig_words_file
char* tesseract::Dict::output_ambig_words_file = "" |
"Output file for ambiguities found in the dictionary"
Definition at line 620 of file dict.h.
◆ params_model_classify_
float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path) |
◆ probability_in_context_
double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
Probability in context function used by the ngram permuter.
Definition at line 384 of file dict.h.
◆ save_doc_words
bool tesseract::Dict::save_doc_words = 0 |
"Save Document Words"
Definition at line 649 of file dict.h.
◆ segment_nonalphabetic_script
bool tesseract::Dict::segment_nonalphabetic_script = false |
"Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"
Definition at line 648 of file dict.h.
◆ segment_penalty_dict_case_bad
double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125 |
"Default score multiplier for word matches, which may have " "case issues (lower is better)."
Definition at line 609 of file dict.h.
◆ segment_penalty_dict_case_ok
double tesseract::Dict::segment_penalty_dict_case_ok = 1.1 |
"Score multiplier for word matches that have good case " "(lower is better)."
Definition at line 605 of file dict.h.
◆ segment_penalty_dict_frequent_word
double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0 |
"Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."
Definition at line 601 of file dict.h.
◆ segment_penalty_dict_nonword
double tesseract::Dict::segment_penalty_dict_nonword = 1.25 |
"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."
Definition at line 613 of file dict.h.
◆ segment_penalty_garbage
double tesseract::Dict::segment_penalty_garbage = 1.50 |
"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."
Definition at line 618 of file dict.h.
◆ stopper_allowable_character_badness
double tesseract::Dict::stopper_allowable_character_badness = 3.0 |
"Max certaintly variation allowed in a word (in sigma)"
Definition at line 637 of file dict.h.
◆ stopper_certainty_per_char
double tesseract::Dict::stopper_certainty_per_char = -0.50 |
"Certainty to add for each dict char above small word size."
Definition at line 635 of file dict.h.
◆ stopper_debug_level
int tesseract::Dict::stopper_debug_level = 0 |
"Stopper debug level"
Definition at line 638 of file dict.h.
◆ stopper_no_acceptable_choices
bool tesseract::Dict::stopper_no_acceptable_choices = false |
"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"
Definition at line 641 of file dict.h.
◆ stopper_nondict_certainty_base
double tesseract::Dict::stopper_nondict_certainty_base = -2.50 |
"Certainty threshold for non-dict words"
Definition at line 629 of file dict.h.
◆ stopper_phase2_certainty_rejection_offset
double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0 |
"Reject certainty offset"
Definition at line 631 of file dict.h.
◆ stopper_smallword_size
int tesseract::Dict::stopper_smallword_size = 2 |
"Size of dict word to be treated as non-dict word"
Definition at line 633 of file dict.h.
◆ tessedit_truncate_wordchoice_log
int tesseract::Dict::tessedit_truncate_wordchoice_log = 10 |
"Max words to keep in list"
Definition at line 642 of file dict.h.
◆ use_only_first_uft8_step
bool tesseract::Dict::use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string" " when computing log probabilities."
Definition at line 626 of file dict.h.
◆ user_patterns_file
char* tesseract::Dict::user_patterns_file = "" |
"A filename of user-provided patterns."
Definition at line 582 of file dict.h.
◆ user_patterns_suffix
char* tesseract::Dict::user_patterns_suffix = "" |
"A suffix of user-provided patterns located in tessdata."
Definition at line 584 of file dict.h.
◆ user_words_file
char* tesseract::Dict::user_words_file = "" |
Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."
Definition at line 578 of file dict.h.
◆ user_words_suffix
char* tesseract::Dict::user_words_suffix = "" |
"A suffix of user-provided words located in tessdata."
Definition at line 580 of file dict.h.
◆ word_to_debug
char* tesseract::Dict::word_to_debug = "" |
"Word for which stopper debug information" " should be printed to stdout"
Definition at line 644 of file dict.h.
◆ xheight_penalty_inconsistent
double tesseract::Dict::xheight_penalty_inconsistent = 0.25 |
"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."
Definition at line 598 of file dict.h.
◆ xheight_penalty_subscripts
double tesseract::Dict::xheight_penalty_subscripts = 0.125 |
"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."
Definition at line 595 of file dict.h.
The documentation for this class was generated from the following files:
const CCUtil * getCCUtil() const
const STRING & unichar_string() const
void delete_data_pointers()
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
int max_permuter_attempts
void set_adjust_factor(float factor)
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
double segment_penalty_dict_case_bad
bool contains_unichar_id(UNICHAR_ID unichar_id) const
const CHAR_FRAGMENT * fragment
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
float(Dict::* params_model_classify_)(const char *lang, void *path)
char * output_ambig_words_file
int UniformCertainties(const WERD_CHOICE &word)
UNICHAR_ID unichar_id(int index) const
double xheight_penalty_subscripts
bool get_isdigit(UNICHAR_ID unichar_id) const
bool dangerous_ambig_found() const
bool get_isalpha(UNICHAR_ID unichar_id) const
double stopper_certainty_per_char
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
int tessedit_truncate_wordchoice_log
#define INT_MEMBER(name, val, comment, vec)
double segment_penalty_garbage
int GetTopScriptID() const
const UnicharAmbigs & getUnicharAmbigs() const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
double doc_dict_certainty_threshold
void set_certainty(float new_val)
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
void make_bad()
Set the fields in this choice to be default (bad) values.
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
void EndDangerousAmbigs()
bool get_isngram(UNICHAR_ID unichar_id) const
void IncreaseBandSize(int bandwidth)
bool FreeDawg(Dawg *dawg)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
char * user_patterns_file
const UNICHARSET * unicharset() const
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
int state(int index) const
STRING language_data_path_prefix
int step(const char *str) const
double segment_penalty_dict_nonword
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
double xheight_penalty_inconsistent
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void set_classifier(BlobChoiceClassifier classifier)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
void copy_hyphen_info(WERD_CHOICE *word) const
bool stopper_no_acceptable_choices
bool AcceptableResult(WERD_RES *word) const
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
void set_rating(float newrat)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
GenericVector< int > SuccessorList
void punct_stripped(int *start_core, int *end_core) const
void set_matrix_cell(int col, int row)
#define STRING_MEMBER(name, val, comment, vec)
float min_x_height() const
WERD_CHOICE * best_choice
const char * c_str() const
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
void set_certainty(float newrat)
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
double stopper_allowable_character_badness
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const UnicharAmbigsVector & dang_ambigs() const
double segment_penalty_dict_case_ok
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
#define double_MEMBER(name, val, comment, vec)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int stopper_smallword_size
bool valid_punctuation(const WERD_CHOICE &word)
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
double segment_penalty_dict_frequent_word
void set_rating(float new_val)
const STRING debug_string() const
WERD_CHOICE_LIST best_choices
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
#define STRING_INIT_MEMBER(name, val, comment, vec)
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
UnicharAmbigs unichar_ambigs
bool get_isupper(UNICHAR_ID unichar_id) const
void remove_last_unichar_id()
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
bool use_only_first_uft8_step
double doc_dict_pending_threshold
double stopper_phase2_certainty_rejection_offset
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
void put(ICOORD pos, const T &thing)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
DLLSYM void tprintf(const char *format,...)
const UNICHARSET & getUnicharset() const
const UnicharAmbigsVector & replace_ambigs() const
static const UNICHAR_ID kPatternUnicharID
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
void remove_unichar_id(int index)
const char * id_to_unichar(UNICHAR_ID id) const
#define BOOL_MEMBER(name, val, comment, vec)
double stopper_nondict_certainty_base
void set_permuter(uint8_t perm)
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
void set_unichar_id(UNICHAR_ID newunichar_id)
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
bool segment_nonalphabetic_script
char * user_patterns_suffix
float max_x_height() const
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const