tesseract
5.0.0-alpha-619-ge9db
|
Go to the documentation of this file.
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
22 #ifndef DISABLED_LEGACY_ENGINE
31 #ifndef DISABLED_LEGACY_ENGINE
33 #endif // ndef DISABLED_LEGACY_ENGINE
38 #define CHARS_PER_LINE 500
39 #define MAX_WERD_LENGTH (int64_t) 128
58 static const int kRatingPad = 4;
59 static const char kDictWildcard[] =
"\u2606";
60 static const int kDictMaxWildcards = 2;
63 static const char kHyphenSymbol[] =
"-";
64 static const char kSlashSymbol[] =
"/";
65 static const char kQuestionSymbol[] =
"?";
66 static const char kApostropheSymbol[] =
"'";
67 static const float kSimCertaintyScale = -10.0;
68 static const float kSimCertaintyOffset = -10.0;
69 static const float kSimilarityFloor = 100.0;
70 static const int kDocDictMaxRepChars = 4;
107 #ifndef DISABLED_LEGACY_ENGINE
118 return normed_ids.
size() == 1 &&
119 (normed_ids[0] == hyphen_unichar_id_ ||
120 normed_ids[0] == slash_unichar_id_);
129 return normed_ids.
size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
136 !last_word_on_line_ && hyphen_word_;
147 *word = *hyphen_word_;
153 UNICHAR_ID unichar_id,
bool first_pos)
const {
154 if (!last_word_on_line_ || first_pos)
159 return normed_ids.
size() == 1 && normed_ids[0] == hyphen_unichar_id_;
163 int word_index = word.
length() - 1;
192 bool ambigs_mode)
const;
195 bool suppress_patterns)
const;
211 bool word_ending,
WERD_CHOICE *word,
float certainties[],
212 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
213 void *void_more_args);
218 int char_choice_index,
221 float certainties[],
float *limit,
223 void *void_more_args);
230 int char_choice_index,
243 int char_choice_index,
253 float curr_rating,
float curr_certainty,
255 const char *debug,
int word_ending,
259 #if !defined(DISABLED_LEGACY_ENGINE)
262 bool fix_replaceable,
264 #endif // !defined(DISABLED_LEGACY_ENGINE)
272 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
293 #if !defined(DISABLED_LEGACY_ENGINE)
295 #endif // !defined(DISABLED_LEGACY_ENGINE)
327 if (pending_words_ !=
nullptr)
328 pending_words_->
clear();
329 if (document_words_ !=
nullptr)
330 document_words_->
clear();
379 unicharset, unichar_id, word_end);
388 int character_bytes);
393 int character_bytes) {
396 context, context_bytes,
402 const char* lang,
const char* context,
int context_bytes,
403 const char*
character,
int character_bytes) {
408 (void)character_bytes;
415 int character_bytes);
432 inline const Dawg *
GetDawg(
int index)
const {
return dawgs_[index]; }
439 if (edge_ref == NO_EDGE)
return 0;
441 if (node == 0) node = NO_EDGE;
449 const Dawg *dawg)
const {
450 if (!dawg)
return ch;
451 switch (dawg->
type()) {
506 float additional_adjust,
511 wordseg_rating_adjust_factor_ = f;
525 #ifndef DISABLED_LEGACY_ENGINE
531 float reject_offset_;
541 bool last_word_on_line_;
549 bool dawg_cache_is_ours_;
553 Trie *pending_words_;
567 Trie *document_words_;
570 float wordseg_rating_adjust_factor_;
572 FILE *output_ambig_words_file_;
580 "A suffix of user-provided words located in tessdata.");
582 "A filename of user-provided patterns.");
584 "A suffix of user-provided patterns located in tessdata.");
589 "Load dawg with punctuation patterns.");
592 "Load dawg with special word bigrams.");
594 "Score penalty (0.1 = 10%) added if there are subscripts "
595 "or superscripts in a word, but it is otherwise OK.");
597 "Score penalty (0.1 = 10%) added if an xheight is "
600 "Score multiplier for word matches which have good case and"
601 "are frequent in the given language (lower is better).");
604 "Score multiplier for word matches that have good case "
605 "(lower is better).");
608 "Default score multiplier for word matches, which may have "
609 "case issues (lower is better).");
612 "Score multiplier for glyph fragment segmentations which "
613 "do not match a dictionary word (lower is better).");
616 "Score multiplier for poorly cased strings that are not in"
617 " the dictionary and generally look like garbage (lower is"
620 "Output file for ambiguities found in the dictionary");
622 ", to 2 for more details, to 3 to see all the debug messages");
625 "Use only the first UTF8 step of the given string"
626 " when computing log probabilities.");
629 "Certainty threshold for non-dict words");
631 "Reject certainty offset");
633 "Size of dict word to be treated as non-dict word");
635 "Certainty to add for each dict char above small word size.");
637 "Max certaintly variation allowed in a word (in sigma)");
640 "Make AcceptableChoice() always return false. Useful"
641 " when there is a need to explore all segmentations");
644 " should be printed to stdout");
646 "Don't use any alphabetic-specific tricks."
647 "Set to true in the traineddata config file for"
648 " scripts that are cursive or inherently fixed-pitch");
651 "Worst certainty for using pending dictionary");
653 " for words that can be inserted into the document dictionary");
655 " character choices to consider during permutation."
656 " This limit is especially useful when user patterns"
657 " are specified, since overly generic patterns can result in"
658 " dawg search exploring an overly large number of options.");
662 #endif // TESSERACT_DICT_DICT_H_
const CCUtil * getCCUtil() const
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
int max_permuter_attempts
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
double segment_penalty_dict_case_bad
bool contains_unichar_id(UNICHAR_ID unichar_id) const
const CHAR_FRAGMENT * fragment
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
float(Dict::* params_model_classify_)(const char *lang, void *path)
char * output_ambig_words_file
int UniformCertainties(const WERD_CHOICE &word)
UNICHAR_ID unichar_id(int index) const
double xheight_penalty_subscripts
bool get_isdigit(UNICHAR_ID unichar_id) const
double stopper_certainty_per_char
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
int tessedit_truncate_wordchoice_log
double segment_penalty_garbage
const UnicharAmbigs & getUnicharAmbigs() const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
double doc_dict_certainty_threshold
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
void EndDangerousAmbigs()
void Load(const STRING &lang, TessdataManager *data_file)
void DebugWordChoices()
Prints the current choices for this word to stdout.
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
char * user_patterns_file
const UNICHARSET * unicharset() const
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
static TESS_API DawgCache * GlobalDawgCache()
#define double_VAR_H(name, val, comment)
double segment_penalty_dict_nonword
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
double xheight_penalty_inconsistent
void copy_hyphen_info(WERD_CHOICE *word) const
bool stopper_no_acceptable_choices
bool AcceptableResult(WERD_RES *word) const
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
UNICHAR_ID WildcardID() const
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
const char * c_str() const
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
int valid_word_or_number(const WERD_CHOICE &word) const
double stopper_allowable_character_badness
DawgPositionVector * updated_dawgs
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
double segment_penalty_dict_case_ok
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
void SetupForLoad(DawgCache *dawg_cache)
bool has_hyphen_end(const WERD_CHOICE &word) const
Check whether the word has a hyphen at the end, based on the unichar at the end of the word.
int stopper_smallword_size
bool valid_punctuation(const WERD_CHOICE &word)
double segment_penalty_dict_frequent_word
#define INT_VAR_H(name, val, comment)
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
bool compound_marker(UNICHAR_ID unichar_id)
UnicharAmbigs unichar_ambigs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
bool use_only_first_uft8_step
double doc_dict_pending_threshold
double stopper_phase2_certainty_rejection_offset
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
void reset_hyphen_vars(bool last_word_on_line)
#define BOOL_VAR_H(name, val, comment)
float ParamsModelClassify(const char *lang, void *path)
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
float CallParamsModelClassify(void *path)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
const UNICHARSET & getUnicharset() const
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
static const UNICHAR_ID kPatternUnicharID
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
UNICHARSET & getUnicharset()
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
DawgPositionVector * active_dawgs
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
bool is_apostrophe(UNICHAR_ID unichar_id)
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
int valid_word(const WERD_CHOICE &word) const
void ResetDocumentDictionary()
double stopper_nondict_certainty_base
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
void SetWildcardID(UNICHAR_ID id)
#define STRING_VAR_H(name, val, comment)
bool segment_nonalphabetic_script
char * user_patterns_suffix
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const