tesseract  4.0.0-1-g2a2b
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word, const UNICHARSET &unicharset) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const STRING &lang, TessdataManager *data_file)
 
void LoadLSTM (const STRING &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (not CJ, or T). More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
int max_viterbi_list_size = 10
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
char * word_to_debug_lengths = ""
 
int fragments_debug = 0
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 88 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 30 of file dict.cpp.

33  params_model_classify_(nullptr),
34  ccutil_(ccutil),
35  wildcard_unichar_id_(INVALID_UNICHAR_ID),
36  apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37  question_unichar_id_(INVALID_UNICHAR_ID),
38  slash_unichar_id_(INVALID_UNICHAR_ID),
39  hyphen_unichar_id_(INVALID_UNICHAR_ID),
40  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41  getCCUtil()->params()),
43  "A suffix of user-provided words located in tessdata.",
44  getCCUtil()->params()),
46  "A filename of user-provided patterns.",
47  getCCUtil()->params()),
49  "A suffix of user-provided patterns located in "
50  "tessdata.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57  getCCUtil()->params()),
59  "Load dawg with punctuation"
60  " patterns.",
61  getCCUtil()->params()),
63  "Load dawg with number"
64  " patterns.",
65  getCCUtil()->params()),
67  "Load dawg with special word "
68  "bigrams.",
69  getCCUtil()->params()),
71  "Score penalty (0.1 = 10%) added if there are subscripts "
72  "or superscripts in a word, but it is otherwise OK.",
73  getCCUtil()->params()),
75  "Score penalty (0.1 = 10%) added if an xheight is "
76  "inconsistent.",
77  getCCUtil()->params()),
79  "Score multiplier for word matches which have good case and"
80  " are frequent in the given language (lower is better).",
81  getCCUtil()->params()),
83  "Score multiplier for word matches that have good case "
84  "(lower is better).",
85  getCCUtil()->params()),
87  "Default score multiplier for word matches, which may have "
88  "case issues (lower is better).",
89  getCCUtil()->params()),
91  "Score multiplier for glyph fragment segmentations which "
92  "do not match a dictionary word (lower is better).",
93  getCCUtil()->params()),
95  "Score multiplier for poorly cased strings that are not in"
96  " the dictionary and generally look like garbage (lower is"
97  " better).",
98  getCCUtil()->params()),
100  "Output file for ambiguities found in the dictionary",
101  getCCUtil()->params()),
103  "Set to 1 for general debug info"
104  ", to 2 for more details, to 3 to see all the debug messages",
105  getCCUtil()->params()),
106  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107  getCCUtil()->params()),
108  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
109  getCCUtil()->params()),
111  "Use only the first UTF8 step of the given string"
112  " when computing log probabilities.",
113  getCCUtil()->params()),
114  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
115  getCCUtil()->params()),
117  "Certainty threshold for non-dict words",
118  getCCUtil()->params()),
120  "Reject certainty offset", getCCUtil()->params()),
122  "Size of dict word to be treated as non-dict word",
123  getCCUtil()->params()),
125  "Certainty to add"
126  " for each dict char above small word size.",
127  getCCUtil()->params()),
129  "Max certaintly variation allowed in a word (in sigma)",
130  getCCUtil()->params()),
131  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
132  getCCUtil()->params()),
134  "Make AcceptableChoice() always return false. Useful"
135  " when there is a need to explore all segmentations",
136  getCCUtil()->params()),
138  "Max words to keep in list", getCCUtil()->params()),
140  "Word for which stopper debug"
141  " information should be printed to stdout",
142  getCCUtil()->params()),
144  "Lengths of unichars in word_to_debug",
145  getCCUtil()->params()),
146  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
147  getCCUtil()->params()),
149  "Don't use any alphabetic-specific tricks."
150  " Set to true in the traineddata config file for"
151  " scripts that are cursive or inherently fixed-pitch",
152  getCCUtil()->params()),
153  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
154  getCCUtil()->params()),
156  "Worst certainty for using pending dictionary",
157  getCCUtil()->params()),
159  "Worst certainty for words that can be inserted into the"
160  " document dictionary",
161  getCCUtil()->params()),
163  "Maximum number of different"
164  " character choices to consider during permutation."
165  " This limit is especially useful when user patterns"
166  " are specified, since overly generic patterns can result in"
167  " dawg search exploring an overly large number of options.",
168  getCCUtil()->params()) {
169  dang_ambigs_table_ = nullptr;
170  replace_ambigs_table_ = nullptr;
171  reject_offset_ = 0.0;
172  go_deeper_fxn_ = nullptr;
173  hyphen_word_ = nullptr;
174  last_word_on_line_ = false;
175  document_words_ = nullptr;
176  dawg_cache_ = nullptr;
177  dawg_cache_is_ours_ = false;
178  pending_words_ = nullptr;
179  bigram_dawg_ = nullptr;
180  freq_dawg_ = nullptr;
181  punc_dawg_ = nullptr;
182  unambig_dawg_ = nullptr;
183  wordseg_rating_adjust_factor_ = -1.0f;
184  output_ambig_words_file_ = nullptr;
185 }
double segment_penalty_dict_case_ok
Definition: dict.h:588
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:294
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:306
const CCUtil * getCCUtil() const
Definition: dict.h:92
int fragments_debug
Definition: dict.h:631
char * word_to_debug
Definition: dict.h:628
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:369
double certainty_scale
Definition: dict.h:611
double segment_penalty_dict_frequent_word
Definition: dict.h:584
int tessedit_truncate_wordchoice_log
Definition: dict.h:626
double doc_dict_pending_threshold
Definition: dict.h:638
int max_viterbi_list_size
Definition: dict.h:607
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:303
double segment_penalty_dict_case_bad
Definition: dict.h:592
bool stopper_no_acceptable_choices
Definition: dict.h:625
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
double segment_penalty_dict_nonword
Definition: dict.h:596
bool load_number_dawg
Definition: dict.h:573
int hyphen_debug_level
Definition: dict.h:606
bool load_punc_dawg
Definition: dict.h:572
bool load_freq_dawg
Definition: dict.h:569
int stopper_debug_level
Definition: dict.h:622
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
char * user_words_file
Definition: dict.h:561
char * output_ambig_words_file
Definition: dict.h:603
double segment_penalty_garbage
Definition: dict.h:601
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:403
bool load_unambig_dawg
Definition: dict.h:570
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
char * user_words_suffix
Definition: dict.h:563
double stopper_allowable_character_badness
Definition: dict.h:621
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
double stopper_nondict_certainty_base
Definition: dict.h:613
bool load_bigram_dawg
Definition: dict.h:575
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:367
double doc_dict_certainty_threshold
Definition: dict.h:640
bool use_only_first_uft8_step
Definition: dict.h:610
double xheight_penalty_subscripts
Definition: dict.h:578
bool load_system_dawg
Definition: dict.h:568
double stopper_certainty_per_char
Definition: dict.h:619
int dawg_debug_level
Definition: dict.h:605
int max_permuter_attempts
Definition: dict.h:645
char * word_to_debug_lengths
Definition: dict.h:630
int stopper_smallword_size
Definition: dict.h:617
char * user_patterns_file
Definition: dict.h:565
char * user_patterns_suffix
Definition: dict.h:567
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:386
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:615
double xheight_penalty_inconsistent
Definition: dict.h:581
bool segment_nonalphabetic_script
Definition: dict.h:635
CCUtil ccutil
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357
bool save_doc_words
Definition: dict.h:636

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 187 of file dict.cpp.

187  {
188  End();
189  delete hyphen_word_;
190  if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
191 }
void End()
Definition: dict.cpp:343

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 70 of file context.cpp.

71  {
72  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
73  int num_alphanum = 0;
74  for (int x = 0; x < word.length(); ++x) {
75  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
76  unicharset.get_isdigit(word.unichar_id(x)));
77  }
78  return (static_cast<float>(num_alphanum) /
79  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
80 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 41 of file stopper.cpp.

42  {
43  float CertaintyThreshold = stopper_nondict_certainty_base;
44  int WordSize;
45 
46  if (stopper_no_acceptable_choices) return false;
47 
48  if (best_choice.length() == 0) return false;
49 
50  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
51  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
52  bool is_case_ok = case_ok(best_choice, getUnicharset());
53 
54  if (stopper_debug_level >= 1) {
55  const char *xht = "UNKNOWN";
56  switch (xheight_consistency) {
57  case XH_GOOD: xht = "NORMAL"; break;
58  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
59  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
60  default: xht = "UNKNOWN";
61  }
62  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
63  best_choice.unichar_string().string(),
64  (is_valid_word ? 'y' : 'n'),
65  (is_case_ok ? 'y' : 'n'),
66  xht,
67  best_choice.min_x_height(),
68  best_choice.max_x_height());
69  }
70  // Do not accept invalid words in PASS1.
71  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
72  if (is_valid_word && is_case_ok) {
73  WordSize = LengthOfShortestAlphaRun(best_choice);
74  WordSize -= stopper_smallword_size;
75  if (WordSize < 0)
76  WordSize = 0;
77  CertaintyThreshold += WordSize * stopper_certainty_per_char;
78  }
79 
80  if (stopper_debug_level >= 1)
81  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
82  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
83 
84  if (no_dang_ambigs &&
85  best_choice.certainty() > CertaintyThreshold &&
86  xheight_consistency < XH_INCONSISTENT &&
87  UniformCertainties(best_choice)) {
88  return true;
89  } else {
90  if (stopper_debug_level >= 1) {
91  tprintf("AcceptableChoice() returned false"
92  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
93  no_dang_ambigs, best_choice.certainty(),
94  CertaintyThreshold,
95  UniformCertainties(best_choice));
96  }
97  return false;
98  }
99 }
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:461
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
bool stopper_no_acceptable_choices
Definition: dict.h:625
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
int stopper_debug_level
Definition: dict.h:622
bool dangerous_ambig_found() const
Definition: ratngs.h:363
float max_x_height() const
Definition: ratngs.h:339
double stopper_nondict_certainty_base
Definition: dict.h:613
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
float min_x_height() const
Definition: ratngs.h:336
double stopper_certainty_per_char
Definition: dict.h:619
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
const STRING & unichar_string() const
Definition: ratngs.h:541
int stopper_smallword_size
Definition: dict.h:617
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:442

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 101 of file stopper.cpp.

101  {
102  if (word->best_choice == nullptr) return false;
103  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
104  int WordSize;
105 
106  if (stopper_debug_level >= 1) {
107  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
108  word->best_choice->debug_string().string(),
109  (valid_word(*word->best_choice) ? 'y' : 'n'),
110  (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
111  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
112  word->best_choices.singleton() ? 'n' : 'y');
113  }
114 
115  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
116  return false;
117  if (valid_word(*word->best_choice) &&
118  case_ok(*word->best_choice, getUnicharset())) {
119  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
120  WordSize -= stopper_smallword_size;
121  if (WordSize < 0)
122  WordSize = 0;
123  CertaintyThreshold += WordSize * stopper_certainty_per_char;
124  }
125 
126  if (stopper_debug_level >= 1)
127  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
128  word->best_choice->certainty(), CertaintyThreshold);
129 
130  if (word->best_choice->certainty() > CertaintyThreshold &&
131  !stopper_no_acceptable_choices) {
132  if (stopper_debug_level >= 1)
133  tprintf("ACCEPTED\n");
134  return true;
135  } else {
136  if (stopper_debug_level >= 1)
137  tprintf("REJECTED\n");
138  return false;
139  }
140 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
const char * string() const
Definition: strngs.cpp:196
bool stopper_no_acceptable_choices
Definition: dict.h:625
float certainty() const
Definition: ratngs.h:330
int stopper_debug_level
Definition: dict.h:622
bool dangerous_ambig_found() const
Definition: ratngs.h:363
double stopper_nondict_certainty_base
Definition: dict.h:613
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
double stopper_certainty_per_char
Definition: dict.h:619
const STRING debug_string() const
Definition: ratngs.h:505
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
int stopper_smallword_size
Definition: dict.h:617
WERD_CHOICE * best_choice
Definition: pageres.h:235
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:442

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 613 of file dict.cpp.

613  {
614  // Do not add hyphenated word parts to the document dawg.
615  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
616  // called when the first part of the hyphenated word is
617  // discovered and while the second part of the word is recognized.
618  // hyphen_word_ is cleared in cc_recg() before the next word on
619  // the line is recognized.
620  if (hyphen_word_) return;
621 
622  int stringlen = best_choice.length();
623 
624  if (valid_word(best_choice) || stringlen < 2)
625  return;
626 
627  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
628  if (best_choice.length() >= kDocDictMaxRepChars) {
629  int num_rep_chars = 1;
630  UNICHAR_ID uch_id = best_choice.unichar_id(0);
631  for (int i = 1; i < best_choice.length(); ++i) {
632  if (best_choice.unichar_id(i) != uch_id) {
633  num_rep_chars = 1;
634  uch_id = best_choice.unichar_id(i);
635  } else {
636  ++num_rep_chars;
637  if (num_rep_chars == kDocDictMaxRepChars) return;
638  }
639  }
640  }
641 
642  if (best_choice.certainty() < doc_dict_certainty_threshold ||
643  stringlen == 2) {
644  if (best_choice.certainty() < doc_dict_pending_threshold)
645  return;
646 
647  if (!pending_words_->word_in_dawg(best_choice)) {
648  if (stringlen > 2 ||
649  (stringlen == 2 &&
650  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
651  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
652  pending_words_->add_word_to_dawg(best_choice);
653  }
654  return;
655  }
656  }
657 
658  if (save_doc_words) {
659  STRING filename(getCCUtil()->imagefile);
660  filename += ".doc";
661  FILE *doc_word_file = fopen(filename.string(), "a");
662  if (doc_word_file == nullptr) {
663  tprintf("Error: Could not open file %s\n", filename.string());
664  ASSERT_HOST(doc_word_file);
665  }
666  fprintf(doc_word_file, "%s\n",
667  best_choice.debug_string().string());
668  fclose(doc_word_file);
669  }
670  document_words_->add_word_to_dawg(best_choice);
671 }
int UNICHAR_ID
Definition: unichar.h:35
const CCUtil * getCCUtil() const
Definition: dict.h:92
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
const char * string() const
Definition: strngs.cpp:196
double doc_dict_pending_threshold
Definition: dict.h:638
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
float certainty() const
Definition: ratngs.h:330
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:174
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
double doc_dict_certainty_threshold
Definition: dict.h:640
Definition: strngs.h:45
const STRING debug_string() const
Definition: ratngs.h:505
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
bool save_doc_words
Definition: dict.h:636
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 673 of file dict.cpp.

678  {
679  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
680  word->GetTopScriptID() == getUnicharset().han_sid());
681  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
682  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
683 
684  float adjust_factor = additional_adjust;
685  float new_rating = word->rating();
686  new_rating += kRatingPad;
687  const char *xheight_triggered = "";
688  if (word->length() > 1) {
689  // Calculate x-height and y-offset consistency penalties.
690  switch (xheight_consistency) {
691  case XH_INCONSISTENT:
692  adjust_factor += xheight_penalty_inconsistent;
693  xheight_triggered = ", xhtBAD";
694  break;
695  case XH_SUBNORMAL:
696  adjust_factor += xheight_penalty_subscripts;
697  xheight_triggered = ", xhtSUB";
698  break;
699  case XH_GOOD:
700  // leave the factor alone - all good!
701  break;
702  }
703  // TODO(eger): if nonword is true, but there is a "core" that is a dict
704  // word, negate nonword status.
705  } else {
706  if (debug) {
707  tprintf("Consistency could not be calculated.\n");
708  }
709  }
710  if (debug) {
711  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
712  word->unichar_string().string(), word->rating(),
713  xheight_triggered);
714  }
715 
716  if (nonword) { // non-dictionary word
717  if (case_is_ok && punc_is_ok) {
718  adjust_factor += segment_penalty_dict_nonword;
719  new_rating *= adjust_factor;
720  if (debug) tprintf(", W");
721  } else {
722  adjust_factor += segment_penalty_garbage;
723  new_rating *= adjust_factor;
724  if (debug) {
725  if (!case_is_ok) tprintf(", C");
726  if (!punc_is_ok) tprintf(", P");
727  }
728  }
729  } else { // dictionary word
730  if (case_is_ok) {
731  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
732  word->set_permuter(FREQ_DAWG_PERM);
733  adjust_factor += segment_penalty_dict_frequent_word;
734  new_rating *= adjust_factor;
735  if (debug) tprintf(", F");
736  } else {
737  adjust_factor += segment_penalty_dict_case_ok;
738  new_rating *= adjust_factor;
739  if (debug) tprintf(", ");
740  }
741  } else {
742  adjust_factor += segment_penalty_dict_case_bad;
743  new_rating *= adjust_factor;
744  if (debug) tprintf(", C");
745  }
746  }
747  new_rating -= kRatingPad;
748  if (modify_rating) word->set_rating(new_rating);
749  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
750  word->set_adjust_factor(adjust_factor);
751 }
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
double segment_penalty_dict_case_ok
Definition: dict.h:588
double segment_penalty_dict_frequent_word
Definition: dict.h:584
const char * string() const
Definition: strngs.cpp:196
double segment_penalty_dict_case_bad
Definition: dict.h:592
double segment_penalty_dict_nonword
Definition: dict.h:596
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
float rating() const
Definition: ratngs.h:327
double segment_penalty_garbage
Definition: dict.h:601
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
int GetTopScriptID() const
Definition: ratngs.cpp:670
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:830
double xheight_penalty_subscripts
Definition: dict.h:578
int null_sid() const
Definition: unicharset.h:878
int han_sid() const
Definition: unicharset.h:883
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
const STRING & unichar_string() const
Definition: ratngs.h:541
void set_adjust_factor(float factor)
Definition: ratngs.h:309
double xheight_penalty_inconsistent
Definition: dict.h:581
void set_rating(float new_val)
Definition: ratngs.h:369
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
const BLOB_CHOICE blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so, keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 245 of file permdawg.cpp.

256  {
257  int word_ending = (char_choice_index == char_choices.length() - 1);
258 
259  // Deal with fragments.
260  CHAR_FRAGMENT_INFO char_frag_info;
261  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
262  blob_choice.certainty(), prev_char_frag_info, debug,
263  word_ending, &char_frag_info)) {
264  return; // blob_choice must be an invalid fragment
265  }
266  // Search the next letter if this character is a fragment.
267  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
268  permute_choices(debug, char_choices, char_choice_index + 1,
269  &char_frag_info, word, certainties, limit,
270  best_choice, attempts_left, more_args);
271  return;
272  }
273 
274  // Add the next unichar.
275  float old_rating = word->rating();
276  float old_certainty = word->certainty();
277  uint8_t old_permuter = word->permuter();
278  certainties[word->length()] = char_frag_info.certainty;
280  char_frag_info.unichar_id, char_frag_info.num_fragments,
281  char_frag_info.rating, char_frag_info.certainty);
282 
283  // Explore the next unichar.
284  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
285  &char_frag_info, word_ending, word, certainties,
286  limit, best_choice, attempts_left, more_args);
287 
288  // Remove the unichar we added to explore other choices in it's place.
289  word->remove_last_unichar_id();
290  word->set_rating(old_rating);
291  word->set_certainty(old_certainty);
292  word->set_permuter(old_permuter);
293 }
float certainty() const
Definition: ratngs.h:83
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:320
float certainty
Definition: dict.h:45
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
uint8_t permuter() const
Definition: ratngs.h:346
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id
Definition: dict.h:41
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
int length() const
Definition: genericvector.h:85
int length() const
Definition: ratngs.h:303
void remove_last_unichar_id()
Definition: ratngs.h:483
int num_fragments
Definition: dict.h:43
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void set_certainty(float new_val)
Definition: ratngs.h:372
void set_rating(float new_val)
Definition: ratngs.h:369
float rating
Definition: dict.h:44
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

◆ CallParamsModelClassify()

float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 406 of file dict.h.

406  {
407  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
408  return (this->*params_model_classify_)(
409  getCCUtil()->lang.string(), path);
410  }
const CCUtil * getCCUtil() const
Definition: dict.h:92
const char * string() const
Definition: strngs.cpp:196
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:403
STRING lang
Definition: ccutil.h:66
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE word,
const UNICHARSET unicharset 
) const

Check a string to see if it matches a set of lexical rules.

Definition at line 52 of file context.cpp.

52  {
53  int state = 0;
54  int x;
55  for (x = 0; x < word.length(); ++x) {
56  UNICHAR_ID ch_id = word.unichar_id(x);
57  if (unicharset.get_isupper(ch_id))
58  state = case_state_table[state][1];
59  else if (unicharset.get_islower(ch_id))
60  state = case_state_table[state][2];
61  else if (unicharset.get_isdigit(ch_id))
62  state = case_state_table[state][3];
63  else
64  state = case_state_table[state][0];
65  if (state == -1) return false;
66  }
67  return state != 5; // single lower is bad
68 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
const int case_state_table[6][4]
Definition: context.cpp:35
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET unicharset,
UNICHAR_ID  ch,
const Dawg dawg 
) const
inline

Definition at line 433 of file dict.h.

434  {
435  if (!dawg) return ch;
436  switch (dawg->type()) {
437  case DAWG_TYPE_NUMBER:
438  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
439  default:
440  return ch;
441  }
442  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 109 of file dict.h.

109  {
110  const GenericVector<UNICHAR_ID>& normed_ids =
111  getUnicharset().normed_ids(unichar_id);
112  return normed_ids.size() == 1 &&
113  (normed_ids[0] == hyphen_unichar_id_ ||
114  normed_ids[0] == slash_unichar_id_);
115  }
int size() const
Definition: genericvector.h:71
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE word) const
inline

If this word is hyphenated, copy the base word (the part on the line before) of the hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 137 of file dict.h.

137  {
138  if (this->hyphenated()) {
139  *word = *hyphen_word_;
140  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
141  }
142  }
void print() const
Definition: ratngs.h:580
int hyphen_debug_level
Definition: dict.h:606
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 174 of file permdawg.cpp.

175  {
176  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
177  best_choice->make_bad();
178  best_choice->set_rating(rating_limit);
179  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
180  return best_choice;
181  DawgPositionVector *active_dawgs =
182  new DawgPositionVector[char_choices.length() + 1];
183  init_active_dawgs(&(active_dawgs[0]), true);
184  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
186 
187  float certainties[MAX_WERD_LENGTH];
189  int attempts_left = max_permuter_attempts;
190  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
191  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
192  &attempts_left, &dawg_args);
193  delete[] active_dawgs;
194  return best_choice;
195 }
#define MAX_WERD_LENGTH
Definition: dict.h:36
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:50
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
int length() const
Definition: genericvector.h:85
int dawg_debug_level
Definition: dict.h:605
int max_permuter_attempts
Definition: dict.h:645
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void set_rating(float new_val)
Definition: ratngs.h:369

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if, in light of the current state, the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_; otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0, dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg not reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0, the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 367 of file dict.cpp.

370  {
371  DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
372 
373  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
374 
375  if (dawg_debug_level >= 3) {
376  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
377  " num active dawgs=%d\n",
378  getUnicharset().debug_str(unichar_id).string(), word_end,
379  dawg_args->active_dawgs->length());
380  }
381 
382  // Do not accept words that contain kPatternUnicharID.
383  // (otherwise pattern dawgs would not function correctly).
384  // Do not accept words containing INVALID_UNICHAR_IDs.
385  if (unichar_id == Dawg::kPatternUnicharID ||
386  unichar_id == INVALID_UNICHAR_ID) {
387  dawg_args->permuter = NO_PERM;
388  return NO_PERM;
389  }
390 
391  // Initialization.
392  PermuterType curr_perm = NO_PERM;
393  dawg_args->updated_dawgs->clear();
394  dawg_args->valid_end = false;
395 
396  // Go over the active_dawgs vector and insert DawgPosition records
397  // with the updated ref (an edge with the corresponding unichar id) into
398  // dawg_args->updated_pos.
399  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
400  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
401  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
402  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
403 
404  if (!dawg && !punc_dawg) {
405  // shouldn't happen.
406  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
407  continue;
408  }
409  if (!dawg) {
410  // We're in the punctuation dawg. A core dawg has not been chosen.
411  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
412  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
413  punc_node, Dawg::kPatternUnicharID, word_end);
414  if (punc_transition_edge != NO_EDGE) {
415  // Find all successors, and see which can transition.
416  const SuccessorList &slist = *(successors_[pos.punc_index]);
417  for (int s = 0; s < slist.length(); ++s) {
418  int sdawg_index = slist[s];
419  const Dawg *sdawg = dawgs_[sdawg_index];
420  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
421  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
422  if (dawg_edge != NO_EDGE) {
423  if (dawg_debug_level >=3) {
424  tprintf("Letter found in dawg %d\n", sdawg_index);
425  }
426  dawg_args->updated_dawgs->add_unique(
427  DawgPosition(sdawg_index, dawg_edge,
428  pos.punc_index, punc_transition_edge, false),
429  dawg_debug_level > 0,
430  "Append transition from punc dawg to current dawgs: ");
431  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
432  if (sdawg->end_of_word(dawg_edge) &&
433  punc_dawg->end_of_word(punc_transition_edge))
434  dawg_args->valid_end = true;
435  }
436  }
437  }
438  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
439  word_end);
440  if (punc_edge != NO_EDGE) {
441  if (dawg_debug_level >=3) {
442  tprintf("Letter found in punctuation dawg\n");
443  }
444  dawg_args->updated_dawgs->add_unique(
445  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
446  dawg_debug_level > 0,
447  "Extend punctuation dawg: ");
448  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
449  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
450  }
451  continue;
452  }
453 
454  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
455  // We can end the main word here.
456  // If we can continue on the punc ref, add that possibility.
457  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
458  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
459  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
460  if (punc_edge != NO_EDGE) {
461  dawg_args->updated_dawgs->add_unique(
462  DawgPosition(pos.dawg_index, pos.dawg_ref,
463  pos.punc_index, punc_edge, true),
464  dawg_debug_level > 0,
465  "Return to punctuation dawg: ");
466  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
467  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
468  }
469  }
470 
471  if (pos.back_to_punc) continue;
472 
473  // If we are dealing with the pattern dawg, look up all the
474  // possible edges, not only for the exact unichar_id, but also
475  // for all its character classes (alpha, digit, etc).
476  if (dawg->type() == DAWG_TYPE_PATTERN) {
477  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
478  &curr_perm);
479  // There can't be any successors to dawg that is of type
480  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
481  continue;
482  }
483 
484  // Find the edge out of the node for the unichar_id.
485  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
486  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
487  : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
488  word_end);
489 
490  if (dawg_debug_level >= 3) {
491  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
492  pos.dawg_index, node, edge);
493  }
494 
495  if (edge != NO_EDGE) { // the unichar was found in the current dawg
496  if (dawg_debug_level >=3) {
497  tprintf("Letter found in dawg %d\n", pos.dawg_index);
498  }
499  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
500  if (dawg_debug_level >= 3) {
501  tprintf("Punctuation constraint not satisfied at end of word.\n");
502  }
503  continue;
504  }
505  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
506  if (dawg->end_of_word(edge) &&
507  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
508  dawg_args->valid_end = true;
509  dawg_args->updated_dawgs->add_unique(
510  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
511  false),
512  dawg_debug_level > 0,
513  "Append current dawg to updated active dawgs: ");
514  }
515  } // end for
516  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
517  // or if we found the current letter in a non-punctuation dawg. This
518  // allows preserving information on which dawg the "core" word came from.
519  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
520  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
521  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
522  dawg_args->permuter = curr_perm;
523  }
524  if (dawg_debug_level >= 2) {
525  tprintf("Returning %d for permuter code for this character.\n",
526  dawg_args->permuter);
527  }
528  return dawg_args->permuter;
529 }
int UNICHAR_ID
Definition: unichar.h:35
#define REFFORMAT
Definition: dawg.h:93
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
int64_t EDGE_REF
Definition: dawg.h:55
int64_t NODE_REF
Definition: dawg.h:56
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:531
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279
GenericVector< int > SuccessorList
Definition: dawg.h:69
int dawg_debug_level
Definition: dict.h:605
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
PermuterType
Definition: ratngs.h:242
#define ASSERT_HOST(x)
Definition: errcode.h:84
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:433

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 386 of file dict.h.

388  {
389  (void)lang;
390  (void)context;
391  (void)context_bytes;
392  (void)character;
393  (void)character_bytes;
394  return 0.0;
395  }

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 586 of file dict.cpp.

587  {
588  bool punc_dawg_available =
589  (punc_dawg_ != nullptr) &&
590  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
591 
592  for (int i = 0; i < dawgs_.length(); i++) {
593  if (dawgs_[i] != nullptr &&
594  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
595  int dawg_ty = dawgs_[i]->type();
596  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
597  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
598  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
599  if (dawg_debug_level >= 3) {
600  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
601  NO_EDGE);
602  }
603  } else if (!punc_dawg_available || !subsumed_by_punc) {
604  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
605  if (dawg_debug_level >= 3) {
606  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
607  }
608  }
609  }
610  }
611 }
#define REFFORMAT
Definition: dawg.h:93
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int length() const
Definition: genericvector.h:85
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int dawg_debug_level
Definition: dict.h:605
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126

◆ End()

void tesseract::Dict::End ( )

Definition at line 343 of file dict.cpp.

343  {
344  if (dawgs_.length() == 0)
345  return; // Not safe to call twice.
346  for (int i = 0; i < dawgs_.size(); i++) {
347  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
348  delete dawgs_[i];
349  }
350  }
351  dawg_cache_->FreeDawg(bigram_dawg_);
352  if (dawg_cache_is_ours_) {
353  delete dawg_cache_;
354  dawg_cache_ = nullptr;
355  }
356  successors_.delete_data_pointers();
357  dawgs_.clear();
358  successors_.clear();
359  document_words_ = nullptr;
360  delete pending_words_;
361  pending_words_ = nullptr;
362 }
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
int size() const
Definition: genericvector.h:71
int length() const
Definition: genericvector.h:85
void delete_data_pointers()

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 358 of file stopper.cpp.

358 {}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 323 of file dict.cpp.

323  {
324  if (dawgs_.empty()) return false;
325  // Construct a list of corresponding successors for each dawg. Each entry, i,
326  // in the successors_ vector is a vector of integers that represent the
327  // indices into the dawgs_ vector of the successors for dawg i.
328  successors_.reserve(dawgs_.length());
329  for (int i = 0; i < dawgs_.length(); ++i) {
330  const Dawg *dawg = dawgs_[i];
331  SuccessorList *lst = new SuccessorList();
332  for (int j = 0; j < dawgs_.length(); ++j) {
333  const Dawg *other = dawgs_[j];
334  if (dawg != nullptr && other != nullptr &&
335  (dawg->lang() == other->lang()) &&
336  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
337  }
338  successors_ += lst;
339  }
340  return true;
341 }
void reserve(int size)
int length() const
Definition: genericvector.h:85
bool empty() const
Definition: genericvector.h:90
GenericVector< int > SuccessorList
Definition: dawg.h:69

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Definition at line 320 of file permdawg.cpp.

324  {
325  const CHAR_FRAGMENT *this_fragment =
326  getUnicharset().get_fragment(curr_unichar_id);
327  const CHAR_FRAGMENT *prev_fragment =
328  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
329 
330  // Print debug info for fragments.
331  if (debug && (prev_fragment || this_fragment)) {
332  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
333  getUnicharset().debug_str(curr_unichar_id).string(),
334  word_ending);
335  if (prev_fragment) {
336  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
337  }
338  if (this_fragment) {
339  tprintf("this_fragment %s\n", this_fragment->to_string().string());
340  }
341  }
342 
343  char_frag_info->unichar_id = curr_unichar_id;
344  char_frag_info->fragment = this_fragment;
345  char_frag_info->rating = curr_rating;
346  char_frag_info->certainty = curr_certainty;
347  char_frag_info->num_fragments = 1;
348  if (prev_fragment && !this_fragment) {
349  if (debug) tprintf("Skip choice with incomplete fragment\n");
350  return false;
351  }
352  if (this_fragment) {
353  // We are dealing with a fragment.
354  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
355  if (prev_fragment) {
356  if (!this_fragment->is_continuation_of(prev_fragment)) {
357  if (debug) tprintf("Non-matching fragment piece\n");
358  return false;
359  }
360  if (this_fragment->is_ending()) {
361  char_frag_info->unichar_id =
362  getUnicharset().unichar_to_id(this_fragment->get_unichar());
363  char_frag_info->fragment = nullptr;
364  if (debug) {
365  tprintf("Built character %s from fragments\n",
366  getUnicharset().debug_str(
367  char_frag_info->unichar_id).string());
368  }
369  } else {
370  if (debug) tprintf("Record fragment continuation\n");
371  char_frag_info->fragment = this_fragment;
372  }
373  // Update certainty and rating.
374  char_frag_info->rating =
375  prev_char_frag_info->rating + curr_rating;
376  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
377  char_frag_info->certainty =
378  std::min(curr_certainty, prev_char_frag_info->certainty);
379  } else {
380  if (this_fragment->is_beginning()) {
381  if (debug) tprintf("Record fragment beginning\n");
382  } else {
383  if (debug) {
384  tprintf("Non-starting fragment piece with no prev_fragment\n");
385  }
386  return false;
387  }
388  }
389  }
390  if (word_ending && char_frag_info->fragment) {
391  if (debug) tprintf("Word can not end with a fragment\n");
392  return false;
393  }
394  return true;
395 }
bool is_beginning() const
Definition: unicharset.h:106
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:99
float certainty
Definition: dict.h:45
const char * string() const
Definition: strngs.cpp:196
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:71
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
UNICHAR_ID unichar_id
Definition: dict.h:41
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool is_ending() const
Definition: unicharset.h:109
int num_fragments
Definition: dict.h:43
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
float rating
Definition: dict.h:44

◆ getCCUtil() [1/2]

const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 92 of file dict.h.

92  {
93  return ccutil_;
94  }

◆ getCCUtil() [2/2]

CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 95 of file dict.h.

95  {
96  return ccutil_;
97  }

◆ GetDawg()

const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 417 of file dict.h.

417 { return dawgs_[index]; }

◆ GetPuncDawg()

const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 419 of file dict.h.

419 { return punc_dawg_; }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 423 of file dict.h.

423  {
424  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
425  NODE_REF node = dawg->next_node(edge_ref);
426  if (node == 0) node = NO_EDGE; // end of word
427  return node;
428  }
int64_t NODE_REF
Definition: dawg.h:56

◆ GetUnambigDawg()

const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 421 of file dict.h.

421 { return unambig_dawg_; }

◆ getUnicharAmbigs()

const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 104 of file dict.h.

104  {
105  return getCCUtil()->unichar_ambigs;
106  }
const CCUtil * getCCUtil() const
Definition: dict.h:92
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69

◆ getUnicharset() [1/2]

const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 98 of file dict.h.

98  {
99  return getCCUtil()->unicharset;
100  }
const CCUtil * getCCUtil() const
Definition: dict.h:92
UNICHARSET unicharset
Definition: ccutil.h:68

◆ getUnicharset() [2/2]

UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 101 of file dict.h.

101  {
102  return getCCUtil()->unicharset;
103  }
const CCUtil * getCCUtil() const
Definition: dict.h:92
UNICHARSET unicharset
Definition: ccutil.h:68

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 193 of file dict.cpp.

193  {
194  // This global cache (a singleton) will outlive every Tesseract instance
195  // (even those that someone else might declare as global statics).
196  static DawgCache cache;
197  return &cache;
198 }

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
bool  word_ending,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word keep exploring the char_choices further.

Definition at line 50 of file permdawg.cpp.

54  {
55  DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args);
56  word_ending = (char_choice_index == char_choices.size()-1);
57  int word_index = word->length() - 1;
58  if (best_choice->rating() < *limit) return;
59  // Look up char in DAWG
60 
61  // If the current unichar is an ngram first try calling
62  // letter_is_okay() for each unigram it contains separately.
63  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
64  bool checked_unigrams = false;
65  if (getUnicharset().get_isngram(orig_uch_id)) {
66  if (dawg_debug_level) {
67  tprintf("checking unigrams in an ngram %s\n",
68  getUnicharset().debug_str(orig_uch_id).string());
69  }
70  int num_unigrams = 0;
71  word->remove_last_unichar_id();
73  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
74  // Since the string came out of the unicharset, failure is impossible.
75  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
76  nullptr));
77  bool unigrams_ok = true;
78  // Construct DawgArgs that reflect the current state.
79  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
80  DawgPositionVector unigram_updated_dawgs;
81  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
82  &unigram_updated_dawgs,
83  more_args->permuter);
84  // Check unigrams in the ngram with letter_is_okay().
85  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
86  UNICHAR_ID uch_id = encoding[i];
87  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
88  ++num_unigrams;
89  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
90  unigrams_ok = (this->*letter_is_okay_)(
91  &unigram_dawg_args, *word->unicharset(),
92  word->unichar_id(word_index+num_unigrams-1),
93  word_ending && i == encoding.size() - 1);
94  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
95  if (dawg_debug_level) {
96  tprintf("unigram %s is %s\n",
97  getUnicharset().debug_str(uch_id).string(),
98  unigrams_ok ? "OK" : "not OK");
99  }
100  }
101  // Restore the word and copy the updated dawg state if needed.
102  while (num_unigrams-- > 0) word->remove_last_unichar_id();
103  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
104  if (unigrams_ok) {
105  checked_unigrams = true;
106  more_args->permuter = unigram_dawg_args.permuter;
107  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
108  }
109  }
110 
111  // Check which dawgs from the dawgs_ vector contain the word
112  // up to and including the current unichar.
113  if (checked_unigrams || (this->*letter_is_okay_)(
114  more_args, *word->unicharset(), word->unichar_id(word_index),
115  word_ending)) {
116  // Add a new word choice
117  if (word_ending) {
118  if (dawg_debug_level) {
119  tprintf("found word = %s\n", word->debug_string().string());
120  }
121  if (strcmp(output_ambig_words_file.string(), "") != 0) {
122  if (output_ambig_words_file_ == nullptr) {
123  output_ambig_words_file_ =
124  fopen(output_ambig_words_file.string(), "wb+");
125  if (output_ambig_words_file_ == nullptr) {
126  tprintf("Failed to open output_ambig_words_file %s\n",
127  output_ambig_words_file.string());
128  exit(1);
129  }
130  STRING word_str;
131  word->string_and_lengths(&word_str, nullptr);
132  word_str += " ";
133  fprintf(output_ambig_words_file_, "%s", word_str.string());
134  }
135  STRING word_str;
136  word->string_and_lengths(&word_str, nullptr);
137  word_str += " ";
138  fprintf(output_ambig_words_file_, "%s", word_str.string());
139  }
140  WERD_CHOICE *adjusted_word = word;
141  adjusted_word->set_permuter(more_args->permuter);
142  update_best_choice(*adjusted_word, best_choice);
143  } else { // search the next letter
144  // Make updated_* point to the next entries in the DawgPositionVector
145  // arrays (that were originally created in dawg_permute_and_select)
146  ++(more_args->updated_dawgs);
147  // Make active_dawgs and constraints point to the updated ones.
148  ++(more_args->active_dawgs);
149  permute_choices(debug, char_choices, char_choice_index + 1,
150  prev_char_frag_info, word, certainties, limit,
151  best_choice, attempts_left, more_args);
152  // Restore previous state to explore another letter in this position.
153  --(more_args->updated_dawgs);
154  --(more_args->active_dawgs);
155  }
156  } else {
157  if (dawg_debug_level) {
158  tprintf("last unichar not OK at index %d in %s\n",
159  word_index, word->debug_string().string());
160  }
161  }
162 }
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
const char * string() const
Definition: strngs.cpp:196
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
float rating() const
Definition: ratngs.h:327
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:171
char * output_ambig_words_file
Definition: dict.h:603
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
void remove_last_unichar_id()
Definition: ratngs.h:483
int dawg_debug_level
Definition: dict.h:605
Definition: strngs.h:45
const STRING debug_string() const
Definition: ratngs.h:505
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 144 of file dict.h.

144  {
145  if (!last_word_on_line_ || first_pos)
146  return false;
147  const GenericVector<UNICHAR_ID>& normed_ids =
148  getUnicharset().normed_ids(unichar_id);
149  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
150  }
int size() const
Definition: genericvector.h:71
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE &  word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 152 of file dict.h.

152  {
153  int word_index = word.length() - 1;
154  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
155  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 131 of file dict.h.

131  {
132  return this->hyphenated() ? hyphen_word_->length() : 0;
133  }
int length() const
Definition: ratngs.h:303
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 127 of file dict.h.

127  { return
128  !last_word_on_line_ && hyphen_word_;
129  }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector *  active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 569 of file dict.cpp.

570  {
571  int i;
572  if (hyphenated()) {
573  *active_dawgs = hyphen_active_dawgs_;
574  if (dawg_debug_level >= 3) {
575  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
576  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
577  hyphen_active_dawgs_[i].dawg_index,
578  hyphen_active_dawgs_[i].dawg_ref);
579  }
580  }
581  } else {
582  default_dawgs(active_dawgs, ambigs_mode);
583  }
584 }
int size() const
Definition: genericvector.h:71
#define REFFORMAT
Definition: dawg.h:93
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int dawg_debug_level
Definition: dict.h:605
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:586

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 118 of file dict.h.

118  {
119  const GenericVector<UNICHAR_ID>& normed_ids =
120  getUnicharset().normed_ids(unichar_id);
121  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
122  }
int size() const
Definition: genericvector.h:71
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).

Definition at line 857 of file dict.cpp.

857  {
858  const UNICHARSET &u_set = getUnicharset();
859  if (u_set.han_sid() > 0) return false;
860  if (u_set.katakana_sid() > 0) return false;
861  if (u_set.thai_sid() > 0) return false;
862  return true;
863 }
int han_sid() const
Definition: unicharset.h:883
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
int katakana_sid() const
Definition: unicharset.h:885
int thai_sid() const
Definition: unicharset.h:886

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE &  WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 442 of file stopper.cpp.

442  {
443  int shortest = INT32_MAX;
444  int curr_len = 0;
445  for (int w = 0; w < WordChoice.length(); ++w) {
446  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
447  curr_len++;
448  } else if (curr_len > 0) {
449  if (curr_len < shortest) shortest = curr_len;
450  curr_len = 0;
451  }
452  }
453  if (curr_len > 0 && curr_len < shortest) {
454  shortest = curr_len;
455  } else if (shortest == INT32_MAX) {
456  shortest = 0;
457  }
458  return shortest;
459 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET &  unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 361 of file dict.h.

362  {
363  return (this->*letter_is_okay_)(void_dawg_args,
364  unicharset, unichar_id, word_end);
365  }
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357

◆ Load()

void tesseract::Dict::Load ( const STRING lang,
TessdataManager data_file 
)

Definition at line 219 of file dict.cpp.

219  {
220  // Load dawgs_.
221  if (load_punc_dawg) {
222  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
223  dawg_debug_level, data_file);
224  if (punc_dawg_) dawgs_ += punc_dawg_;
225  }
226  if (load_system_dawg) {
227  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
228  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
229  if (system_dawg) dawgs_ += system_dawg;
230  }
231  if (load_number_dawg) {
232  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
233  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
234  if (number_dawg) dawgs_ += number_dawg;
235  }
236  if (load_bigram_dawg) {
237  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
238  dawg_debug_level, data_file);
239  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
240  // dawgs_!!
241  }
242  if (load_freq_dawg) {
243  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
244  dawg_debug_level, data_file);
245  if (freq_dawg_) dawgs_ += freq_dawg_;
246  }
247  if (load_unambig_dawg) {
248  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
249  dawg_debug_level, data_file);
250  if (unambig_dawg_) dawgs_ += unambig_dawg_;
251  }
252 
253  STRING name;
254  if (((STRING &)user_words_suffix).length() > 0 ||
255  ((STRING &)user_words_file).length() > 0) {
256  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
257  getUnicharset().size(), dawg_debug_level);
258  if (((STRING &)user_words_file).length() > 0) {
259  name = user_words_file;
260  } else {
262  name += user_words_suffix;
263  }
264  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
266  tprintf("Error: failed to load %s\n", name.string());
267  delete trie_ptr;
268  } else {
269  dawgs_ += trie_ptr;
270  }
271  }
272 
273  if (((STRING &)user_patterns_suffix).length() > 0 ||
274  ((STRING &)user_patterns_file).length() > 0) {
275  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
276  getUnicharset().size(), dawg_debug_level);
277  trie_ptr->initialize_patterns(&(getUnicharset()));
278  if (((STRING &)user_patterns_file).length() > 0) {
279  name = user_patterns_file;
280  } else {
282  name += user_patterns_suffix;
283  }
284  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
285  tprintf("Error: failed to load %s\n", name.string());
286  delete trie_ptr;
287  } else {
288  dawgs_ += trie_ptr;
289  }
290  }
291 
292  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
293  getUnicharset().size(), dawg_debug_level);
294  dawgs_ += document_words_;
295 
296  // This dawg is temporary and should not be searched by letter_is_ok.
297  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
298  getUnicharset().size(), dawg_debug_level);
299 }
const CCUtil * getCCUtil() const
Definition: dict.h:92
const char * string() const
Definition: strngs.cpp:196
STRING language_data_path_prefix
Definition: ccutil.h:67
bool load_number_dawg
Definition: dict.h:573
bool load_punc_dawg
Definition: dict.h:572
bool load_freq_dawg
Definition: dict.h:569
char * user_words_file
Definition: dict.h:561
bool load_unambig_dawg
Definition: dict.h:570
char * user_words_suffix
Definition: dict.h:563
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
bool load_bigram_dawg
Definition: dict.h:575
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool load_system_dawg
Definition: dict.h:568
int dawg_debug_level
Definition: dict.h:605
Definition: strngs.h:45
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
char * user_patterns_file
Definition: dict.h:565
char * user_patterns_suffix
Definition: dict.h:567

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const STRING lang,
TessdataManager data_file 
)

Definition at line 302 of file dict.cpp.

302  {
303  // Load dawgs_.
304  if (load_punc_dawg) {
305  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
306  dawg_debug_level, data_file);
307  if (punc_dawg_) dawgs_ += punc_dawg_;
308  }
309  if (load_system_dawg) {
310  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
311  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
312  if (system_dawg) dawgs_ += system_dawg;
313  }
314  if (load_number_dawg) {
315  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
316  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
317  if (number_dawg) dawgs_ += number_dawg;
318  }
319 }
bool load_number_dawg
Definition: dict.h:573
bool load_punc_dawg
Definition: dict.h:572
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
bool load_system_dawg
Definition: dict.h:568
int dawg_debug_level
Definition: dict.h:605

◆ ngram_probability_in_context()

double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE *  BestChoice,
DANGERR *  fixpt,
bool  fix_replaceable,
MATRIX *  ratings 
)

Definition at line 142 of file stopper.cpp.

145  {
146  if (stopper_debug_level > 2) {
147  tprintf("\nRunning NoDangerousAmbig() for %s\n",
148  best_choice->debug_string().string());
149  }
150 
151  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
152  // for each unichar id in BestChoice.
153  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
154  int i;
155  bool ambigs_found = false;
156  // For each position in best_choice:
157  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
158  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
159  // -- look for ambiguities corresponding to wrong_ngram in the list while
160  // adding the following unichar_ids from best_choice to wrong_ngram
161  //
162  // Repeat the above procedure twice: first time look through
163  // ambigs to be replaced and replace all the ambiguities found;
164  // second time look through dangerous ambiguities and construct
165  // ambig_blob_choices with fake a blob choice for each ambiguity
166  // and pass them to dawg_permute_and_select() to search for
167  // ambiguous words in the dictionaries.
168  //
169  // Note that during the execution of the for loop (on the first pass)
170  // if replacements are made the length of best_choice might change.
171  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
172  bool replace = (fix_replaceable && pass == 0);
173  const UnicharAmbigsVector &table = replace ?
175  if (!replace) {
176  // Initialize ambig_blob_choices with lists containing a single
177  // unichar id for the corresponding position in best_choice.
178  // best_choice consisting from only the original letters will
179  // have a rating of 0.0.
180  for (i = 0; i < best_choice->length(); ++i) {
181  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
182  BLOB_CHOICE_IT lst_it(lst);
183  // TODO(rays/antonova) Put real xheights and y shifts here.
184  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
185  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
186  ambig_blob_choices.push_back(lst);
187  }
188  }
189  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
190  int wrong_ngram_index;
191  int next_index;
192  int blob_index = 0;
193  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
194  ++i) {
195  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
196  if (stopper_debug_level > 2) {
197  tprintf("Looking for %s ngrams starting with %s:\n",
198  replace ? "replaceable" : "ambiguous",
199  getUnicharset().debug_str(curr_unichar_id).string());
200  }
201  int num_wrong_blobs = best_choice->state(i);
202  wrong_ngram_index = 0;
203  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
204  if (curr_unichar_id == INVALID_UNICHAR_ID ||
205  curr_unichar_id >= table.size() ||
206  table[curr_unichar_id] == nullptr) {
207  continue; // there is no ambig spec for this unichar id
208  }
209  AmbigSpec_IT spec_it(table[curr_unichar_id]);
210  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
211  const AmbigSpec *ambig_spec = spec_it.data();
212  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
213  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
214  ambig_spec->wrong_ngram);
215  if (stopper_debug_level > 2) {
216  tprintf("candidate ngram: ");
218  tprintf("current ngram from spec: ");
219  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
220  tprintf("comparison result: %d\n", compare);
221  }
222  if (compare == 0) {
223  // Record the place where we found an ambiguity.
224  if (fixpt != nullptr) {
225  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
226  fixpt->push_back(DANGERR_INFO(
227  blob_index, blob_index + num_wrong_blobs, replace,
228  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
229  leftmost_id));
230  if (stopper_debug_level > 1) {
231  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
232  blob_index + num_wrong_blobs, false,
233  getUnicharset().get_isngram(
234  ambig_spec->correct_ngram_id),
235  getUnicharset().id_to_unichar(leftmost_id));
236  }
237  }
238 
239  if (replace) {
240  if (stopper_debug_level > 2) {
241  tprintf("replace ambiguity with %s : ",
242  getUnicharset().id_to_unichar(
243  ambig_spec->correct_ngram_id));
245  ambig_spec->correct_fragments, getUnicharset());
246  }
247  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
248  ambig_spec->correct_ngram_id,
249  best_choice, ratings);
250  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
251  // We found dang ambig - update ambig_blob_choices.
252  if (stopper_debug_level > 2) {
253  tprintf("found ambiguity: ");
255  ambig_spec->correct_fragments, getUnicharset());
256  }
257  ambigs_found = true;
258  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
259  ++tmp_index) {
260  // Add a blob choice for the corresponding fragment of the
261  // ambiguity. These fake blob choices are initialized with
262  // negative ratings (which are not possible for real blob
263  // choices), so that dawg_permute_and_select() considers any
264  // word not consisting of only the original letters a better
265  // choice and stops searching for alternatives once such a
266  // choice is found.
267  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
268  bc_it.add_to_end(new BLOB_CHOICE(
269  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
270  -1, 0, 1, 0, BCC_AMBIG));
271  }
272  }
273  spec_it.forward();
274  } else if (compare == -1) {
275  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
276  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
277  // Add the next unichar id to wrong_ngram and keep looking for
278  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
279  wrong_ngram[++wrong_ngram_index] =
280  best_choice->unichar_id(next_index);
281  num_wrong_blobs += best_choice->state(next_index);
282  } else {
283  break; // no more matching ambigs in this AMBIG_SPEC_LIST
284  }
285  } else {
286  spec_it.forward();
287  }
288  } // end searching AmbigSpec_LIST
289  } // end searching best_choice
290  } // end searching replace and dangerous ambigs
291 
292  // If any ambiguities were found permute the constructed ambig_blob_choices
293  // to see if an alternative dictionary word can be found.
294  if (ambigs_found) {
295  if (stopper_debug_level > 2) {
296  tprintf("\nResulting ambig_blob_choices:\n");
297  for (i = 0; i < ambig_blob_choices.length(); ++i) {
298  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
299  tprintf("\n");
300  }
301  }
302  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
303  ambigs_found = (alt_word->rating() < 0.0);
304  if (ambigs_found) {
305  if (stopper_debug_level >= 1) {
306  tprintf ("Stopper: Possible ambiguous word = %s\n",
307  alt_word->debug_string().string());
308  }
309  if (fixpt != nullptr) {
310  // Note: Currently character choices combined from fragments can only
311  // be generated by NoDangrousAmbigs(). This code should be updated if
312  // the capability to produce classifications combined from character
313  // fragments is added to other functions.
314  int orig_i = 0;
315  for (i = 0; i < alt_word->length(); ++i) {
316  const UNICHARSET &uchset = getUnicharset();
317  bool replacement_is_ngram =
318  uchset.get_isngram(alt_word->unichar_id(i));
319  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
320  if (replacement_is_ngram) {
321  // we have to extract the leftmost unichar from the ngram.
322  const char *str = uchset.id_to_unichar(leftmost_id);
323  int step = uchset.step(str);
324  if (step) leftmost_id = uchset.unichar_to_id(str, step);
325  }
326  int end_i = orig_i + alt_word->state(i);
327  if (alt_word->state(i) > 1 ||
328  (orig_i + 1 == end_i && replacement_is_ngram)) {
329  // Compute proper blob indices.
330  int blob_start = 0;
331  for (int j = 0; j < orig_i; ++j)
332  blob_start += best_choice->state(j);
333  int blob_end = blob_start;
334  for (int j = orig_i; j < end_i; ++j)
335  blob_end += best_choice->state(j);
336  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
337  replacement_is_ngram, leftmost_id));
338  if (stopper_debug_level > 1) {
339  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
340  true, replacement_is_ngram,
341  uchset.id_to_unichar(leftmost_id));
342  }
343  }
344  orig_i += alt_word->state(i);
345  }
346  }
347  }
348  delete alt_word;
349  }
350  if (output_ambig_words_file_ != nullptr) {
351  fprintf(output_ambig_words_file_, "\n");
352  }
353 
354  ambig_blob_choices.delete_data_pointers();
355  return !ambigs_found;
356 }
int UNICHAR_ID
Definition: unichar.h:35
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
const char * string() const
Definition: strngs.cpp:196
int state(int index) const
Definition: ratngs.h:319
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float rating() const
Definition: ratngs.h:327
int stopper_debug_level
Definition: dict.h:622
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:368
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:62
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:521
T & get(int index) const
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:98
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:152
int length() const
Definition: genericvector.h:85
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
int step(const char *str) const
Definition: unicharset.cpp:232
int push_back(T object)
const STRING debug_string() const
Definition: ratngs.h:505
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
void delete_data_pointers()
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:153

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 415 of file dict.h.

415 { return dawgs_.size(); }
int size() const
Definition: genericvector.h:71

◆ ParamsModelClassify()

float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 203 of file permdawg.cpp.

213  {
214  if (debug) {
215  tprintf("%s permute_choices: char_choice_index=%d"
216  " limit=%g rating=%g, certainty=%g word=%s\n",
217  debug, char_choice_index, *limit, word->rating(),
218  word->certainty(), word->debug_string().string());
219  }
220  if (char_choice_index < char_choices.length()) {
221  BLOB_CHOICE_IT blob_choice_it;
222  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
223  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
224  blob_choice_it.forward()) {
225  (*attempts_left)--;
226  append_choices(debug, char_choices, *(blob_choice_it.data()),
227  char_choice_index, prev_char_frag_info, word,
228  certainties, limit, best_choice, attempts_left, more_args);
229  if (*attempts_left <= 0) {
230  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
231  break;
232  }
233  }
234  }
235 }
const char * string() const
Definition: strngs.cpp:196
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
T & get(int index) const
int length() const
Definition: genericvector.h:85
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:245
const STRING debug_string() const
Definition: ratngs.h:505

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 375 of file dict.h.

378  {
379  return (this->*probability_in_context_)(
380  getCCUtil()->lang.string(),
381  context, context_bytes,
382  character, character_bytes);
383  }
const CCUtil * getCCUtil() const
Definition: dict.h:92
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:369
const char * string() const
Definition: strngs.cpp:196
STRING lang
Definition: ccutil.h:66

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg *  dawg,
const DawgPosition &  info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs *  dawg_args,
PermuterType *  current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 531 of file dict.cpp.

534  {
535  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
536  // Try to find the edge corresponding to the exact unichar_id and to all the
537  // edges corresponding to the character class of unichar_id.
538  GenericVector<UNICHAR_ID> unichar_id_patterns;
539  unichar_id_patterns.push_back(unichar_id);
540  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
541  &unichar_id_patterns);
542  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
543  // On the first iteration check all the outgoing edges.
544  // On the second iteration check all self-loops.
545  for (int k = 0; k < 2; ++k) {
546  EDGE_REF edge = (k == 0)
547  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
548  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
549  if (edge == NO_EDGE) continue;
550  if (dawg_debug_level >= 3) {
551  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
552  pos.dawg_index, node, edge);
553  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
554  }
555  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
556  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
557  dawg_args->updated_dawgs->add_unique(
558  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
559  pos.back_to_punc),
560  dawg_debug_level > 0,
561  "Append current dawg to updated active dawgs: ");
562  }
563  }
564 }
int size() const
Definition: genericvector.h:71
#define REFFORMAT
Definition: dawg.h:93
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
int64_t EDGE_REF
Definition: dawg.h:55
int64_t NODE_REF
Definition: dawg.h:56
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int push_back(T object)
int dawg_debug_level
Definition: dict.h:605
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE *  werd_choice,
MATRIX *  ratings 
)

Definition at line 368 of file stopper.cpp.

370  {
371  int num_blobs_to_replace = 0;
372  int begin_blob_index = 0;
373  int i;
374  // Rating and certainty for the new BLOB_CHOICE are derived from the
375  // replaced choices.
376  float new_rating = 0.0f;
377  float new_certainty = 0.0f;
378  BLOB_CHOICE* old_choice = nullptr;
379  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
380  if (i >= wrong_ngram_begin_index) {
381  int num_blobs = werd_choice->state(i);
382  int col = begin_blob_index + num_blobs_to_replace;
383  int row = col + num_blobs - 1;
384  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
385  ASSERT_HOST(choices != nullptr);
386  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
387  ASSERT_HOST(old_choice != nullptr);
388  new_rating += old_choice->rating();
389  new_certainty += old_choice->certainty();
390  num_blobs_to_replace += num_blobs;
391  } else {
392  begin_blob_index += werd_choice->state(i);
393  }
394  }
395  new_certainty /= wrong_ngram_size;
396  // If there is no entry in the ratings matrix, add it.
397  MATRIX_COORD coord(begin_blob_index,
398  begin_blob_index + num_blobs_to_replace - 1);
399  if (!coord.Valid(*ratings)) {
400  ratings->IncreaseBandSize(coord.row - coord.col + 1);
401  }
402  if (ratings->get(coord.col, coord.row) == nullptr)
403  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
404  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
405  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
406  if (choice != nullptr) {
407  // Already there. Upgrade if new rating better.
408  if (new_rating < choice->rating())
409  choice->set_rating(new_rating);
410  if (new_certainty < choice->certainty())
411  choice->set_certainty(new_certainty);
412  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
413  } else {
414  // Need a new choice with the correct_ngram_id.
415  choice = new BLOB_CHOICE(*old_choice);
416  choice->set_unichar_id(correct_ngram_id);
417  choice->set_rating(new_rating);
418  choice->set_certainty(new_certainty);
419  choice->set_classifier(BCC_AMBIG);
420  choice->set_matrix_cell(coord.col, coord.row);
421  BLOB_CHOICE_IT it (new_choices);
422  it.add_to_end(choice);
423  }
424  // Remove current unichar from werd_choice. On the last iteration
425  // set the correct replacement unichar instead of removing a unichar.
426  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
427  ++replaced_count) {
428  if (replaced_count + 1 == wrong_ngram_size) {
429  werd_choice->set_blob_choice(wrong_ngram_begin_index,
430  num_blobs_to_replace, choice);
431  } else {
432  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
433  }
434  }
435  if (stopper_debug_level >= 1) {
436  werd_choice->print("ReplaceAmbig() ");
437  tprintf("Modified blob_choices: ");
438  print_ratings_list("\n", new_choices, getUnicharset());
439  }
440 }
float certainty() const
Definition: ratngs.h:83
void remove_unichar_id(int index)
Definition: ratngs.h:484
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:167
void print() const
Definition: ratngs.h:580
int state(int index) const
Definition: ratngs.h:319
void set_matrix_cell(int col, int row)
Definition: ratngs.h:157
int stopper_debug_level
Definition: dict.h:622
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54
void set_certainty(float newrat)
Definition: ratngs.h:151
void set_rating(float newrat)
Definition: ratngs.h:148
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:312
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
float rating() const
Definition: ratngs.h:80
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180
T get(ICOORD pos) const
Definition: matrix.h:228
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (i.e. the current word is the first one on a new line), erases hyphen_word_ and clears hyphen_active_dawgs_. In every case, updates last_word_on_line_ to the given value.

Definition at line 28 of file hyphen.cpp.

28  {
29  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
30  if (hyphen_word_ != nullptr) {
31  delete hyphen_word_;
32  hyphen_word_ = nullptr;
33  hyphen_active_dawgs_.clear();
34  }
35  }
36  if (hyphen_debug_level) {
37  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
38  last_word_on_line_, last_word_on_line);
39  }
40  last_word_on_line_ = last_word_on_line;
41 }
int hyphen_debug_level
Definition: dict.h:606
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 311 of file dict.h.

311  {
312  if (pending_words_ != nullptr)
313  pending_words_->clear();
314  if (document_words_ != nullptr)
315  document_words_->clear();
316  }
void clear()
Definition: trie.cpp:62

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE &  word,
const DawgPositionVector &  active_dawgs 
)

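Updates hyphen_word_ (with its trailing hyphen removed) and copies the given DawgPositionVector into hyphen_active_dawgs_, but only if the given word has a lower rating than the current hyphen_word_.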

Definition at line 45 of file hyphen.cpp.

46  {
47  if (hyphen_word_ == nullptr) {
48  hyphen_word_ = new WERD_CHOICE(word.unicharset());
49  hyphen_word_->make_bad();
50  }
51  if (hyphen_word_->rating() > word.rating()) {
52  *hyphen_word_ = word;
53  // Remove the last unichar id as it is a hyphen, and remove
54  // any unichar_string/lengths that are present.
55  hyphen_word_->remove_last_unichar_id();
56  hyphen_active_dawgs_ = active_dawgs;
57  }
58  if (hyphen_debug_level) {
59  hyphen_word_->print("set_hyphen_word: ");
60  }
61 }
void print() const
Definition: ratngs.h:580
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
float rating() const
Definition: ratngs.h:327
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
int hyphen_debug_level
Definition: dict.h:606
void remove_last_unichar_id()
Definition: ratngs.h:483

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 360 of file stopper.cpp.

360  {
361  reject_offset_ = 0.0;
362 }

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 364 of file stopper.cpp.

364  {
365  reject_offset_ = stopper_phase2_certainty_rejection_offset;
366 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:615

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache *  dawg_cache)

Definition at line 201 of file dict.cpp.

201  {
202  if (dawgs_.length() != 0) this->End();
203 
204  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
205  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
206  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
207  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
208 
209  if (dawg_cache != nullptr) {
210  dawg_cache_ = dawg_cache;
211  dawg_cache_is_ours_ = false;
212  } else {
213  dawg_cache_ = new DawgCache();
214  dawg_cache_is_ours_ = true;
215  }
216 }
void End()
Definition: dict.cpp:343
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
int length() const
Definition: genericvector.h:85
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 412 of file dict.h.

412 { wildcard_unichar_id_ = id; }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 495 of file dict.h.

495  {
496  wordseg_rating_adjust_factor_ = f;
497  }

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE &  word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 461 of file stopper.cpp.

461  {
462  float Certainty;
463  float WorstCertainty = FLT_MAX;
464  float CertaintyThreshold;
465  double TotalCertainty;
466  double TotalCertaintySquared;
467  double Variance;
468  float Mean, StdDev;
469  int word_length = word.length();
470 
471  if (word_length < 3)
472  return true;
473 
474  TotalCertainty = TotalCertaintySquared = 0.0;
475  for (int i = 0; i < word_length; ++i) {
476  Certainty = word.certainty(i);
477  TotalCertainty += Certainty;
478  TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
479  if (Certainty < WorstCertainty)
480  WorstCertainty = Certainty;
481  }
482 
483  // Subtract off worst certainty from statistics.
484  word_length--;
485  TotalCertainty -= WorstCertainty;
486  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
487 
488  Mean = TotalCertainty / word_length;
489  Variance = ((word_length * TotalCertaintySquared -
490  TotalCertainty * TotalCertainty) /
491  (word_length * (word_length - 1)));
492  if (Variance < 0.0)
493  Variance = 0.0;
494  StdDev = sqrt(Variance);
495 
496  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
497  if (CertaintyThreshold > stopper_nondict_certainty_base)
498  CertaintyThreshold = stopper_nondict_certainty_base;
499 
500  if (word.certainty() < CertaintyThreshold) {
501  if (stopper_debug_level >= 1)
502  tprintf("Stopper: Non-uniform certainty = %4.1f"
503  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
504  word.certainty(), Mean, StdDev, CertaintyThreshold);
505  return false;
506  } else {
507  return true;
508  }
509 }
float certainty() const
Definition: ratngs.h:330
int stopper_debug_level
Definition: dict.h:622
double stopper_allowable_character_badness
Definition: dict.h:621
double stopper_nondict_certainty_base
Definition: dict.h:613
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:628

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE &  word,
WERD_CHOICE *  best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 171 of file dict.h.

172  {
173  if (word.rating() < best_choice->rating()) {
174  *best_choice = word;
175  }
176  }
float rating() const
Definition: ratngs.h:327

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE &  word1,
const WERD_CHOICE &  word2 
) const

Definition at line 787 of file dict.cpp.

788  {
789  if (bigram_dawg_ == nullptr) return false;
790 
791  // Extract the core word from the middle of each word with any digits
792  // replaced with question marks.
793  int w1start, w1end, w2start, w2end;
794  word1.punct_stripped(&w1start, &w1end);
795  word2.punct_stripped(&w2start, &w2end);
796 
797  // We don't want to penalize a single guillemet, hyphen, etc.
798  // But our bigram list doesn't have any information about punctuation.
799  if (w1start >= w1end) return word1.length() < 3;
800  if (w2start >= w2end) return word2.length() < 3;
801 
802  const UNICHARSET& uchset = getUnicharset();
803  GenericVector<UNICHAR_ID> bigram_string;
804  bigram_string.reserve(w1end + w2end + 1);
805  for (int i = w1start; i < w1end; i++) {
806  const GenericVector<UNICHAR_ID>& normed_ids =
807  getUnicharset().normed_ids(word1.unichar_id(i));
808  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
809  bigram_string.push_back(question_unichar_id_);
810  else
811  bigram_string += normed_ids;
812  }
813  bigram_string.push_back(UNICHAR_SPACE);
814  for (int i = w2start; i < w2end; i++) {
815  const GenericVector<UNICHAR_ID>& normed_ids =
816  getUnicharset().normed_ids(word2.unichar_id(i));
817  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
818  bigram_string.push_back(question_unichar_id_);
819  else
820  bigram_string += normed_ids;
821  }
822  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
823  for (int i = 0; i < bigram_string.size(); ++i) {
824  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
825  0.0f, 0.0f);
826  }
827  return bigram_dawg_->word_in_dawg(normalized_word);
828 }
int size() const
Definition: genericvector.h:71
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
void reserve(int size)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:383
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
int push_back(T object)
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE &  word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 830 of file dict.cpp.

830  {
831  if (word.length() == 0) return NO_PERM;
832  int i;
833  WERD_CHOICE new_word(word.unicharset());
834  int last_index = word.length() - 1;
835  int new_len = 0;
836  for (i = 0; i <= last_index; ++i) {
837  UNICHAR_ID unichar_id = (word.unichar_id(i));
838  if (getUnicharset().get_ispunctuation(unichar_id)) {
839  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
840  } else if (!getUnicharset().get_isalpha(unichar_id) &&
841  !getUnicharset().get_isdigit(unichar_id)) {
842  return false; // neither punc, nor alpha, nor digit
843  } else if ((new_len = new_word.length()) == 0 ||
844  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
845  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
846  }
847  }
848  for (i = 0; i < dawgs_.size(); ++i) {
849  if (dawgs_[i] != nullptr &&
850  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
851  dawgs_[i]->word_in_dawg(new_word)) return true;
852  }
853  return false;
854 }
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE &  word,
bool  numbers_ok 
) const

Definition at line 753 of file dict.cpp.

753  {
754  const WERD_CHOICE *word_ptr = &word;
755  WERD_CHOICE temp_word(word.unicharset());
756  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
757  copy_hyphen_info(&temp_word);
758  temp_word += word;
759  word_ptr = &temp_word;
760  }
761  if (word_ptr->length() == 0) return NO_PERM;
762  // Allocate vectors for holding current and updated
763  // active_dawgs and initialize them.
764  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
765  init_active_dawgs(&(active_dawgs[0]), false);
766  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
767  int last_index = word_ptr->length() - 1;
768  // Call letter_is_okay for each letter in the word.
769  for (int i = hyphen_base_size(); i <= last_index; ++i) {
770  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
771  word_ptr->unichar_id(i),
772  i == last_index))) break;
773  // Swap active_dawgs, constraints with the corresponding updated vector.
774  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
775  dawg_args.updated_dawgs = &(active_dawgs[0]);
776  ++(dawg_args.active_dawgs);
777  } else {
778  ++(dawg_args.updated_dawgs);
779  dawg_args.active_dawgs = &(active_dawgs[0]);
780  }
781  }
782  delete[] active_dawgs;
783  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
784  dawg_args.permuter : NO_PERM;
785 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:131
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:137
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE &  word) const
inline

Definition at line 466 of file dict.h.

466  {
467  return valid_word(word, false); // return NO_PERM for words with digits
468  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 473 of file dict.h.

473  {
474  WERD_CHOICE word(string, getUnicharset());
475  return valid_word(word);
476  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
const UNICHARSET & getUnicharset() const
Definition: dict.h:98

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE &  word) const
inline

Definition at line 469 of file dict.h.

469  {
470  return valid_word(word, true); // return NUMBER_PERM for valid numbers
471  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inlinestatic

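Returns true if perm is one of the dictionary permuter types (SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM, COMPOUND_PERM), or NUMBER_PERM when numbers_ok is true. This function does not itself check all the DAWGs to see if a word is in any of them; valid_word() performs that lookup and passes the resulting permuter to this function.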

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 459 of file dict.h.

459  {
460  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
461  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
462  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
463  (numbers_ok && perm == NUMBER_PERM));
464  }

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 413 of file dict.h.

413 { return wildcard_unichar_id_; }

Member Data Documentation

◆ certainty_scale

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 611 of file dict.h.

◆ dawg_debug_level

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"

Definition at line 605 of file dict.h.

◆ doc_dict_certainty_threshold

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty" " for words that can be inserted into the document dictionary"

Definition at line 640 of file dict.h.

◆ doc_dict_pending_threshold

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 638 of file dict.h.

◆ fragments_debug

int tesseract::Dict::fragments_debug = 0

"Debug character fragments"

Definition at line 631 of file dict.h.

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 205 of file dict.h.

◆ hyphen_debug_level

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 606 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 357 of file dict.h.

◆ load_bigram_dawg

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 575 of file dict.h.

◆ load_freq_dawg

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 569 of file dict.h.

◆ load_number_dawg

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 573 of file dict.h.

◆ load_punc_dawg

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 572 of file dict.h.

◆ load_system_dawg

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 568 of file dict.h.

◆ load_unambig_dawg

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 570 of file dict.h.

◆ max_permuter_attempts

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."

Definition at line 645 of file dict.h.

◆ max_viterbi_list_size

int tesseract::Dict::max_viterbi_list_size = 10

"Maximum size of viterbi list."

Definition at line 607 of file dict.h.

◆ output_ambig_words_file

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 603 of file dict.h.

◆ params_model_classify_

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

Definition at line 403 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 369 of file dict.h.

◆ save_doc_words

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 636 of file dict.h.

◆ segment_nonalphabetic_script

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"

Definition at line 635 of file dict.h.

◆ segment_penalty_dict_case_bad

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have " "case issues (lower is better)."

Definition at line 592 of file dict.h.

◆ segment_penalty_dict_case_ok

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case " "(lower is better)."

Definition at line 588 of file dict.h.

◆ segment_penalty_dict_frequent_word

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."

Definition at line 584 of file dict.h.

◆ segment_penalty_dict_nonword

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."

Definition at line 596 of file dict.h.

◆ segment_penalty_garbage

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."

Definition at line 601 of file dict.h.

◆ stopper_allowable_character_badness

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certaintly variation allowed in a word (in sigma)"

Definition at line 621 of file dict.h.

◆ stopper_certainty_per_char

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 619 of file dict.h.

◆ stopper_debug_level

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 622 of file dict.h.

◆ stopper_no_acceptable_choices

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"

Definition at line 625 of file dict.h.

◆ stopper_nondict_certainty_base

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 613 of file dict.h.

◆ stopper_phase2_certainty_rejection_offset

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 615 of file dict.h.

◆ stopper_smallword_size

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 617 of file dict.h.

◆ tessedit_truncate_wordchoice_log

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 626 of file dict.h.

◆ use_only_first_uft8_step

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities."

Definition at line 610 of file dict.h.

◆ user_patterns_file

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 565 of file dict.h.

◆ user_patterns_suffix

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 567 of file dict.h.

◆ user_words_file

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."

Definition at line 561 of file dict.h.

◆ user_words_suffix

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 563 of file dict.h.

◆ word_to_debug

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information" " should be printed to stdout"

Definition at line 628 of file dict.h.

◆ word_to_debug_lengths

char* tesseract::Dict::word_to_debug_lengths = ""

"Lengths of unichars in word_to_debug"

Definition at line 630 of file dict.h.

◆ xheight_penalty_inconsistent

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."

Definition at line 581 of file dict.h.

◆ xheight_penalty_subscripts

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."

Definition at line 578 of file dict.h.


The documentation for this class was generated from the following files: