tesseract  5.0.0-alpha-619-ge9db
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const STRING &lang, TessdataManager *data_file)
 
void LoadLSTM (const STRING &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (not CJ, or T). More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static TESS_API DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 91 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 30 of file dict.cpp.

33  params_model_classify_(nullptr),
34  ccutil_(ccutil),
35  wildcard_unichar_id_(INVALID_UNICHAR_ID),
36  apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37  question_unichar_id_(INVALID_UNICHAR_ID),
38  slash_unichar_id_(INVALID_UNICHAR_ID),
39  hyphen_unichar_id_(INVALID_UNICHAR_ID),
40  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41  getCCUtil()->params()),
43  "A suffix of user-provided words located in tessdata.",
44  getCCUtil()->params()),
46  "A filename of user-provided patterns.",
47  getCCUtil()->params()),
49  "A suffix of user-provided patterns located in "
50  "tessdata.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57  getCCUtil()->params()),
59  "Load dawg with punctuation"
60  " patterns.",
61  getCCUtil()->params()),
63  "Load dawg with number"
64  " patterns.",
65  getCCUtil()->params()),
67  "Load dawg with special word "
68  "bigrams.",
69  getCCUtil()->params()),
71  "Score penalty (0.1 = 10%) added if there are subscripts "
72  "or superscripts in a word, but it is otherwise OK.",
73  getCCUtil()->params()),
75  "Score penalty (0.1 = 10%) added if an xheight is "
76  "inconsistent.",
77  getCCUtil()->params()),
79  "Score multiplier for word matches which have good case and"
80  " are frequent in the given language (lower is better).",
81  getCCUtil()->params()),
83  "Score multiplier for word matches that have good case "
84  "(lower is better).",
85  getCCUtil()->params()),
87  "Default score multiplier for word matches, which may have "
88  "case issues (lower is better).",
89  getCCUtil()->params()),
91  "Score multiplier for glyph fragment segmentations which "
92  "do not match a dictionary word (lower is better).",
93  getCCUtil()->params()),
95  "Score multiplier for poorly cased strings that are not in"
96  " the dictionary and generally look like garbage (lower is"
97  " better).",
98  getCCUtil()->params()),
100  "Output file for ambiguities found in the dictionary",
101  getCCUtil()->params()),
103  "Set to 1 for general debug info"
104  ", to 2 for more details, to 3 to see all the debug messages",
105  getCCUtil()->params()),
106  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107  getCCUtil()->params()),
109  "Use only the first UTF8 step of the given string"
110  " when computing log probabilities.",
111  getCCUtil()->params()),
112  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
113  getCCUtil()->params()),
115  "Certainty threshold for non-dict words",
116  getCCUtil()->params()),
118  "Reject certainty offset", getCCUtil()->params()),
120  "Size of dict word to be treated as non-dict word",
121  getCCUtil()->params()),
123  "Certainty to add"
124  " for each dict char above small word size.",
125  getCCUtil()->params()),
127  "Max certaintly variation allowed in a word (in sigma)",
128  getCCUtil()->params()),
129  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
130  getCCUtil()->params()),
132  "Make AcceptableChoice() always return false. Useful"
133  " when there is a need to explore all segmentations",
134  getCCUtil()->params()),
136  "Max words to keep in list", getCCUtil()->params()),
138  "Word for which stopper debug"
139  " information should be printed to stdout",
140  getCCUtil()->params()),
142  "Don't use any alphabetic-specific tricks."
143  " Set to true in the traineddata config file for"
144  " scripts that are cursive or inherently fixed-pitch",
145  getCCUtil()->params()),
146  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
147  getCCUtil()->params()),
149  "Worst certainty for using pending dictionary",
150  getCCUtil()->params()),
152  "Worst certainty for words that can be inserted into the"
153  " document dictionary",
154  getCCUtil()->params()),
156  "Maximum number of different"
157  " character choices to consider during permutation."
158  " This limit is especially useful when user patterns"
159  " are specified, since overly generic patterns can result in"
160  " dawg search exploring an overly large number of options.",
161  getCCUtil()->params()) {
162  reject_offset_ = 0.0;
163  go_deeper_fxn_ = nullptr;
164  hyphen_word_ = nullptr;
165  last_word_on_line_ = false;
166  document_words_ = nullptr;
167  dawg_cache_ = nullptr;
168  dawg_cache_is_ours_ = false;
169  pending_words_ = nullptr;
170  bigram_dawg_ = nullptr;
171  freq_dawg_ = nullptr;
172  punc_dawg_ = nullptr;
173  unambig_dawg_ = nullptr;
174  wordseg_rating_adjust_factor_ = -1.0f;
175  output_ambig_words_file_ = nullptr;
176 }

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 178 of file dict.cpp.

178  {
179  End();
180  delete hyphen_word_;
181  if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
182 }

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 80 of file context.cpp.

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 56 of file stopper.cpp.

57  {
58  case XH_GOOD: xht = "NORMAL"; break;
59  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
60  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
61  default: xht = "UNKNOWN";
62  }
63  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
64  best_choice.unichar_string().c_str(),
65  (is_valid_word ? 'y' : 'n'),
66  (is_case_ok ? 'y' : 'n'),
67  xht,
68  best_choice.min_x_height(),
69  best_choice.max_x_height());
70  }
71  // Do not accept invalid words in PASS1.
72  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
73  if (is_valid_word && is_case_ok) {
74  WordSize = LengthOfShortestAlphaRun(best_choice);
75  WordSize -= stopper_smallword_size;
76  if (WordSize < 0)
77  WordSize = 0;
78  CertaintyThreshold += WordSize * stopper_certainty_per_char;
79  }
80 
81  if (stopper_debug_level >= 1)
82  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
83  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
84 
85  if (no_dang_ambigs &&
86  best_choice.certainty() > CertaintyThreshold &&
87  xheight_consistency < XH_INCONSISTENT &&
88  UniformCertainties(best_choice)) {
89  return true;
90  } else {
91  if (stopper_debug_level >= 1) {
92  tprintf("AcceptableChoice() returned false"
93  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
94  no_dang_ambigs, best_choice.certainty(),
95  CertaintyThreshold,
96  UniformCertainties(best_choice));
97  }
98  return false;
99  }
100 }
101 
102 bool Dict::AcceptableResult(WERD_RES *word) const {
103  if (word->best_choice == nullptr) return false;
104  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
105  int WordSize;
106 
107  if (stopper_debug_level >= 1) {
108  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
109  word->best_choice->debug_string().c_str(),
110  (valid_word(*word->best_choice) ? 'y' : 'n'),
111  (case_ok(*word->best_choice) ? 'y' : 'n'),
112  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
113  word->best_choices.singleton() ? 'n' : 'y');
114  }

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 116 of file stopper.cpp.

118  {
119  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
120  WordSize -= stopper_smallword_size;
121  if (WordSize < 0)
122  WordSize = 0;
123  CertaintyThreshold += WordSize * stopper_certainty_per_char;
124  }
125 
126  if (stopper_debug_level >= 1)
127  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
128  word->best_choice->certainty(), CertaintyThreshold);
129 
130  if (word->best_choice->certainty() > CertaintyThreshold &&
132  if (stopper_debug_level >= 1)
133  tprintf("ACCEPTED\n");
134  return true;
135  } else {
136  if (stopper_debug_level >= 1)
137  tprintf("REJECTED\n");
138  return false;
139  }
140 }
141 
142 #if !defined(DISABLED_LEGACY_ENGINE)
143 
144 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
145  DANGERR *fixpt,
146  bool fix_replaceable,
147  MATRIX *ratings) {
148  if (stopper_debug_level > 2) {
149  tprintf("\nRunning NoDangerousAmbig() for %s\n",
150  best_choice->debug_string().c_str());
151  }
152 
153  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
154  // for each unichar id in BestChoice.

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 644 of file dict.cpp.

644  {
645  // Do not add hyphenated word parts to the document dawg.
646  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
647  // called when the first part of the hyphenated word is
648  // discovered and while the second part of the word is recognized.
649  // hyphen_word_ is cleared in cc_recg() before the next word on
650  // the line is recognized.
651  if (hyphen_word_) return;
652 
653  int stringlen = best_choice.length();
654 
655  if (valid_word(best_choice) || stringlen < 2) return;
656 
657  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
658  if (best_choice.length() >= kDocDictMaxRepChars) {
659  int num_rep_chars = 1;
660  UNICHAR_ID uch_id = best_choice.unichar_id(0);
661  for (int i = 1; i < best_choice.length(); ++i) {
662  if (best_choice.unichar_id(i) != uch_id) {
663  num_rep_chars = 1;
664  uch_id = best_choice.unichar_id(i);
665  } else {
666  ++num_rep_chars;
667  if (num_rep_chars == kDocDictMaxRepChars) return;
668  }
669  }
670  }
671 
672  if (best_choice.certainty() < doc_dict_certainty_threshold ||
673  stringlen == 2) {
674  if (best_choice.certainty() < doc_dict_pending_threshold) return;
675 
676  if (!pending_words_->word_in_dawg(best_choice)) {
677  if (stringlen > 2 ||
678  (stringlen == 2 &&
679  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
680  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
681  pending_words_->add_word_to_dawg(best_choice);
682  }
683  return;
684  }
685  }
686 
687  if (save_doc_words) {
688  STRING filename(getCCUtil()->imagefile);
689  filename += ".doc";
690  FILE* doc_word_file = fopen(filename.c_str(), "a");
691  if (doc_word_file == nullptr) {
692  tprintf("Error: Could not open file %s\n", filename.c_str());
693  ASSERT_HOST(doc_word_file);
694  }
695  fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
696  fclose(doc_word_file);
697  }
698  document_words_->add_word_to_dawg(best_choice);
699 }

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 701 of file dict.cpp.

704  {
705  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
706  word->GetTopScriptID() == getUnicharset().han_sid());
707  bool case_is_ok = (is_han || case_ok(*word));
708  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
709 
710  float adjust_factor = additional_adjust;
711  float new_rating = word->rating();
712  new_rating += kRatingPad;
713  const char* xheight_triggered = "";
714  if (word->length() > 1) {
715  // Calculate x-height and y-offset consistency penalties.
716  switch (xheight_consistency) {
717  case XH_INCONSISTENT:
718  adjust_factor += xheight_penalty_inconsistent;
719  xheight_triggered = ", xhtBAD";
720  break;
721  case XH_SUBNORMAL:
722  adjust_factor += xheight_penalty_subscripts;
723  xheight_triggered = ", xhtSUB";
724  break;
725  case XH_GOOD:
726  // leave the factor alone - all good!
727  break;
728  }
729  // TODO(eger): if nonword is true, but there is a "core" that is a dict
730  // word, negate nonword status.
731  } else {
732  if (debug) {
733  tprintf("Consistency could not be calculated.\n");
734  }
735  }
736  if (debug) {
737  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
738  word->unichar_string().c_str(), word->rating(), xheight_triggered);
739  }
740 
741  if (nonword) { // non-dictionary word
742  if (case_is_ok && punc_is_ok) {
743  adjust_factor += segment_penalty_dict_nonword;
744  new_rating *= adjust_factor;
745  if (debug) tprintf(", W");
746  } else {
747  adjust_factor += segment_penalty_garbage;
748  new_rating *= adjust_factor;
749  if (debug) {
750  if (!case_is_ok) tprintf(", C");
751  if (!punc_is_ok) tprintf(", P");
752  }
753  }
754  } else { // dictionary word
755  if (case_is_ok) {
756  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
758  adjust_factor += segment_penalty_dict_frequent_word;
759  new_rating *= adjust_factor;
760  if (debug) tprintf(", F");
761  } else {
762  adjust_factor += segment_penalty_dict_case_ok;
763  new_rating *= adjust_factor;
764  if (debug) tprintf(", ");
765  }
766  } else {
767  adjust_factor += segment_penalty_dict_case_bad;
768  new_rating *= adjust_factor;
769  if (debug) tprintf(", C");
770  }
771  }
772  new_rating -= kRatingPad;
773  if (modify_rating) word->set_rating(new_rating);
774  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
775  word->set_adjust_factor(adjust_factor);
776 }

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
const BLOB_CHOICE & blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 253 of file permdawg.cpp.

256  {
257  return; // blob_choice must be an invalid fragment
258  }
259  // Search the next letter if this character is a fragment.
260  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
261  permute_choices(debug, char_choices, char_choice_index + 1,
262  &char_frag_info, word, certainties, limit,
263  best_choice, attempts_left, more_args);
264  return;
265  }
266 
267  // Add the next unichar.
268  float old_rating = word->rating();
269  float old_certainty = word->certainty();
270  uint8_t old_permuter = word->permuter();
271  certainties[word->length()] = char_frag_info.certainty;
273  char_frag_info.unichar_id, char_frag_info.num_fragments,
274  char_frag_info.rating, char_frag_info.certainty);
275 
276  // Explore the next unichar.
277  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
278  &char_frag_info, word_ending, word, certainties,
279  limit, best_choice, attempts_left, more_args);
280 
281  // Remove the unichar we added to explore other choices in its place.
282  word->remove_last_unichar_id();
283  word->set_rating(old_rating);
284  word->set_certainty(old_certainty);
285  word->set_permuter(old_permuter);
286 }
287 

◆ CallParamsModelClassify()

float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 421 of file dict.h.

421  {
422  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
423  return (this->*params_model_classify_)(
424  getCCUtil()->lang.c_str(), path);
425  }

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE & word) const

Check a string to see if it matches a set of lexical rules.

Definition at line 61 of file context.cpp.

65  {
66  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
67  int num_alphanum = 0;
68  for (int x = 0; x < word.length(); ++x) {
69  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
70  unicharset.get_isdigit(word.unichar_id(x)));
71  }
72  return (static_cast<float>(num_alphanum) /
73  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
74 }
75 
76 } // namespace tesseract

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET & unicharset,
UNICHAR_ID  ch,
const Dawg * dawg 
) const
inline

Definition at line 448 of file dict.h.

449  {
450  if (!dawg) return ch;
451  switch (dawg->type()) {
452  case DAWG_TYPE_NUMBER:
453  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
454  default:
455  return ch;
456  }
457  }

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 113 of file dict.h.

113  {
114  const UNICHARSET& unicharset = getUnicharset();
115  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
116  const GenericVector<UNICHAR_ID>& normed_ids =
117  unicharset.normed_ids(unichar_id);
118  return normed_ids.size() == 1 &&
119  (normed_ids[0] == hyphen_unichar_id_ ||
120  normed_ids[0] == slash_unichar_id_);
121  }

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE * word) const
inline

If this word is hyphenated, copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 145 of file dict.h.

145  {
146  if (this->hyphenated()) {
147  *word = *hyphen_word_;
148  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
149  }
150  }

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR & char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 182 of file permdawg.cpp.

183  : nullptr,
184  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
185  &attempts_left, &dawg_args);
186  delete[] active_dawgs;
187  return best_choice;
188 }
189 
197  const char *debug,
198  const BLOB_CHOICE_LIST_VECTOR &char_choices,
199  int char_choice_index,
200  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
201  WERD_CHOICE *word,
202  float certainties[],
203  float *limit,

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET & unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that active_dawgs, and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 395 of file dict.cpp.

396  {
397  auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
398 
399  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
400 
401  if (dawg_debug_level >= 3) {
402  tprintf(
403  "def_letter_is_okay: current unichar=%s word_end=%d"
404  " num active dawgs=%d\n",
405  getUnicharset().debug_str(unichar_id).c_str(), word_end,
406  dawg_args->active_dawgs->size());
407  }
408 
409  // Do not accept words that contain kPatternUnicharID.
410  // (otherwise pattern dawgs would not function correctly).
411  // Do not accept words containing INVALID_UNICHAR_IDs.
412  if (unichar_id == Dawg::kPatternUnicharID ||
413  unichar_id == INVALID_UNICHAR_ID) {
414  dawg_args->permuter = NO_PERM;
415  return NO_PERM;
416  }
417 
418  // Initialization.
419  PermuterType curr_perm = NO_PERM;
420  dawg_args->updated_dawgs->clear();
421  dawg_args->valid_end = false;
422 
423  // Go over the active_dawgs vector and insert DawgPosition records
424  // with the updated ref (an edge with the corresponding unichar id) into
425  // dawg_args->updated_pos.
426  for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
427  const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428  const Dawg* punc_dawg =
429  pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
430  const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
431 
432  if (!dawg && !punc_dawg) {
433  // shouldn't happen.
434  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
435  continue;
436  }
437  if (!dawg) {
438  // We're in the punctuation dawg. A core dawg has not been chosen.
439  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
440  EDGE_REF punc_transition_edge =
441  punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
442  if (punc_transition_edge != NO_EDGE) {
443  // Find all successors, and see which can transition.
444  const SuccessorList& slist = *(successors_[pos.punc_index]);
445  for (int s = 0; s < slist.size(); ++s) {
446  int sdawg_index = slist[s];
447  const Dawg* sdawg = dawgs_[sdawg_index];
448  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
449  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
450  if (dawg_edge != NO_EDGE) {
451  if (dawg_debug_level >= 3) {
452  tprintf("Letter found in dawg %d\n", sdawg_index);
453  }
454  dawg_args->updated_dawgs->add_unique(
455  DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
456  punc_transition_edge, false),
457  dawg_debug_level > 0,
458  "Append transition from punc dawg to current dawgs: ");
459  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
460  if (sdawg->end_of_word(dawg_edge) &&
461  punc_dawg->end_of_word(punc_transition_edge))
462  dawg_args->valid_end = true;
463  }
464  }
465  }
466  EDGE_REF punc_edge =
467  punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
468  if (punc_edge != NO_EDGE) {
469  if (dawg_debug_level >= 3) {
470  tprintf("Letter found in punctuation dawg\n");
471  }
472  dawg_args->updated_dawgs->add_unique(
473  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
474  dawg_debug_level > 0, "Extend punctuation dawg: ");
475  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
476  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
477  }
478  continue;
479  }
480 
481  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
482  // We can end the main word here.
483  // If we can continue on the punc ref, add that possibility.
484  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
485  EDGE_REF punc_edge =
486  punc_node == NO_EDGE
487  ? NO_EDGE
488  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
489  if (punc_edge != NO_EDGE) {
490  dawg_args->updated_dawgs->add_unique(
491  DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
492  punc_edge, true),
493  dawg_debug_level > 0, "Return to punctuation dawg: ");
494  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
495  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
496  }
497  }
498 
499  if (pos.back_to_punc) continue;
500 
501  // If we are dealing with the pattern dawg, look up all the
502  // possible edges, not only for the exact unichar_id, but also
503  // for all its character classes (alpha, digit, etc).
504  if (dawg->type() == DAWG_TYPE_PATTERN) {
505  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
506  &curr_perm);
507  // There can't be any successors to dawg that is of type
508  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
509  continue;
510  }
511 
512  // Find the edge out of the node for the unichar_id.
513  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
514  EDGE_REF edge =
515  (node == NO_EDGE)
516  ? NO_EDGE
517  : dawg->edge_char_of(
518  node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
519 
520  if (dawg_debug_level >= 3) {
521  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
522  pos.dawg_index, node, edge);
523  }
524 
525  if (edge != NO_EDGE) { // the unichar was found in the current dawg
526  if (dawg_debug_level >= 3) {
527  tprintf("Letter found in dawg %d\n", pos.dawg_index);
528  }
529  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
530  if (dawg_debug_level >= 3) {
531  tprintf("Punctuation constraint not satisfied at end of word.\n");
532  }
533  continue;
534  }
535  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
536  if (dawg->end_of_word(edge) &&
537  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
538  dawg_args->valid_end = true;
539  dawg_args->updated_dawgs->add_unique(
540  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
541  false),
542  dawg_debug_level > 0,
543  "Append current dawg to updated active dawgs: ");
544  }
545  } // end for
546  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
547  // or if we found the current letter in a non-punctuation dawg. This
548  // allows preserving information on which dawg the "core" word came from.
549  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
550  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
551  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
552  dawg_args->permuter = curr_perm;
553  }
554  if (dawg_debug_level >= 2) {
555  tprintf("Returning %d for permuter code for this character.\n",
556  dawg_args->permuter);
557  }
558  return dawg_args->permuter;
559 }

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 401 of file dict.h.

403  {
404  (void)lang;
405  (void)context;
406  (void)context_bytes;
407  (void)character;
408  (void)character_bytes;
409  return 0.0;
410  }

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector * anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 617 of file dict.cpp.

618  {
619  bool punc_dawg_available =
620  (punc_dawg_ != nullptr) &&
621  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
622 
623  for (int i = 0; i < dawgs_.size(); i++) {
624  if (dawgs_[i] != nullptr &&
625  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
626  int dawg_ty = dawgs_[i]->type();
627  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
628  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
629  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
630  if (dawg_debug_level >= 3) {
631  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
632  NO_EDGE);
633  }
634  } else if (!punc_dawg_available || !subsumed_by_punc) {
635  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
636  if (dawg_debug_level >= 3) {
637  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
638  }
639  }
640  }
641  }
642 }

◆ End()

void tesseract::Dict::End ( )

Definition at line 372 of file dict.cpp.

372  {
373  if (dawgs_.size() == 0) return; // Not safe to call twice.
374  for (int i = 0; i < dawgs_.size(); i++) {
375  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
376  delete dawgs_[i];
377  }
378  }
379  dawg_cache_->FreeDawg(bigram_dawg_);
380  if (dawg_cache_is_ours_) {
381  delete dawg_cache_;
382  dawg_cache_ = nullptr;
383  }
384  successors_.delete_data_pointers();
385  dawgs_.clear();
386  successors_.clear();
387  document_words_ = nullptr;
388  delete pending_words_;
389  pending_words_ = nullptr;
390 }

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 374 of file stopper.cpp.

374 {

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 351 of file dict.cpp.

351  {
352  if (dawgs_.empty()) return false;
353  // Construct a list of corresponding successors for each dawg. Each entry, i,
354  // in the successors_ vector is a vector of integers that represent the
355  // indices into the dawgs_ vector of the successors for dawg i.
356  successors_.reserve(dawgs_.size());
357  for (int i = 0; i < dawgs_.size(); ++i) {
358  const Dawg* dawg = dawgs_[i];
359  auto* lst = new SuccessorList();
360  for (int j = 0; j < dawgs_.size(); ++j) {
361  const Dawg* other = dawgs_[j];
362  if (dawg != nullptr && other != nullptr &&
363  (dawg->lang() == other->lang()) &&
364  kDawgSuccessors[dawg->type()][other->type()])
365  *lst += j;
366  }
367  successors_ += lst;
368  }
369  return true;
370 }

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO * char_frag_info 
)

Definition at line 328 of file permdawg.cpp.

328  {
329  tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
330  }
331  if (this_fragment) {
332  tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
333  }
334  }
335 
336  char_frag_info->unichar_id = curr_unichar_id;
337  char_frag_info->fragment = this_fragment;
338  char_frag_info->rating = curr_rating;
339  char_frag_info->certainty = curr_certainty;
340  char_frag_info->num_fragments = 1;
341  if (prev_fragment && !this_fragment) {
342  if (debug) tprintf("Skip choice with incomplete fragment\n");
343  return false;
344  }
345  if (this_fragment) {
346  // We are dealing with a fragment.
347  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
348  if (prev_fragment) {
349  if (!this_fragment->is_continuation_of(prev_fragment)) {
350  if (debug) tprintf("Non-matching fragment piece\n");
351  return false;
352  }
353  if (this_fragment->is_ending()) {
354  char_frag_info->unichar_id =
355  getUnicharset().unichar_to_id(this_fragment->get_unichar());
356  char_frag_info->fragment = nullptr;
357  if (debug) {
358  tprintf("Built character %s from fragments\n",
359  getUnicharset().debug_str(
360  char_frag_info->unichar_id).c_str());
361  }
362  } else {
363  if (debug) tprintf("Record fragment continuation\n");
364  char_frag_info->fragment = this_fragment;
365  }
366  // Update certainty and rating.
367  char_frag_info->rating =
368  prev_char_frag_info->rating + curr_rating;
369  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
370  char_frag_info->certainty =
371  std::min(curr_certainty, prev_char_frag_info->certainty);
372  } else {
373  if (this_fragment->is_beginning()) {
374  if (debug) tprintf("Record fragment beginning\n");
375  } else {
376  if (debug) {
377  tprintf("Non-starting fragment piece with no prev_fragment\n");
378  }
379  return false;
380  }
381  }
382  }
383  if (word_ending && char_frag_info->fragment) {
384  if (debug) tprintf("Word can not end with a fragment\n");
385  return false;
386  }
387  return true;
388 }
389 
390 } // namespace tesseract

◆ getCCUtil() [1/2]

CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 98 of file dict.h.

98  {
99  return ccutil_;
100  }

◆ getCCUtil() [2/2]

const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 95 of file dict.h.

95  {
96  return ccutil_;
97  }

◆ GetDawg()

const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 432 of file dict.h.

432 { return dawgs_[index]; }

◆ GetPuncDawg()

const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 434 of file dict.h.

434 { return punc_dawg_; }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg * dawg,
EDGE_REF  edge_ref 
)
inline static

Returns the appropriate next node given the EDGE_REF.

Definition at line 438 of file dict.h.

438  {
439  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
440  NODE_REF node = dawg->next_node(edge_ref);
441  if (node == 0) node = NO_EDGE; // end of word
442  return node;
443  }

◆ GetUnambigDawg()

const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the points to the unambiguous words dawg.

Definition at line 436 of file dict.h.

436 { return unambig_dawg_; }

◆ getUnicharAmbigs()

const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 108 of file dict.h.

108  {
109  return getCCUtil()->unichar_ambigs;
110  }

◆ getUnicharset() [1/2]

UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 104 of file dict.h.

104  {
105  return getCCUtil()->unicharset;
106  }

◆ getUnicharset() [2/2]

const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 101 of file dict.h.

101  {
102  return getCCUtil()->unicharset;
103  }

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 184 of file dict.cpp.

184  {
185  // This global cache (a singleton) will outlive every Tesseract instance
186  // (even those that someone else might declare as global statics).
187  static DawgCache cache;
188  return &cache;
189 }

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
bool  word_ending,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 58 of file permdawg.cpp.

58  {
59  if (dawg_debug_level) {
60  tprintf("checking unigrams in an ngram %s\n",
61  getUnicharset().debug_str(orig_uch_id).c_str());
62  }
63  int num_unigrams = 0;
64  word->remove_last_unichar_id();
66  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
67  // Since the string came out of the unicharset, failure is impossible.
68  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
69  nullptr));
70  bool unigrams_ok = true;
71  // Construct DawgArgs that reflect the current state.
72  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73  DawgPositionVector unigram_updated_dawgs;
74  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
75  &unigram_updated_dawgs,
76  more_args->permuter);
77  // Check unigrams in the ngram with letter_is_okay().
78  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
79  UNICHAR_ID uch_id = encoding[i];
80  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
81  ++num_unigrams;
82  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
83  unigrams_ok = (this->*letter_is_okay_)(
84  &unigram_dawg_args, *word->unicharset(),
85  word->unichar_id(word_index+num_unigrams-1),
86  word_ending && i == encoding.size() - 1);
87  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
88  if (dawg_debug_level) {
89  tprintf("unigram %s is %s\n",
90  getUnicharset().debug_str(uch_id).c_str(),
91  unigrams_ok ? "OK" : "not OK");
92  }
93  }
94  // Restore the word and copy the updated dawg state if needed.
95  while (num_unigrams-- > 0) word->remove_last_unichar_id();
96  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
97  if (unigrams_ok) {
98  checked_unigrams = true;
99  more_args->permuter = unigram_dawg_args.permuter;
100  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
101  }
102  }
103 
104  // Check which dawgs from the dawgs_ vector contain the word
105  // up to and including the current unichar.
106  if (checked_unigrams || (this->*letter_is_okay_)(
107  more_args, *word->unicharset(), word->unichar_id(word_index),
108  word_ending)) {
109  // Add a new word choice
110  if (word_ending) {
111  if (dawg_debug_level) {
112  tprintf("found word = %s\n", word->debug_string().c_str());
113  }
114  if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
115  if (output_ambig_words_file_ == nullptr) {
116  output_ambig_words_file_ =
117  fopen(output_ambig_words_file.c_str(), "wb+");
118  if (output_ambig_words_file_ == nullptr) {
119  tprintf("Failed to open output_ambig_words_file %s\n",
120  output_ambig_words_file.c_str());
121  exit(1);
122  }
123  STRING word_str;
124  word->string_and_lengths(&word_str, nullptr);
125  word_str += " ";
126  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127  }
128  STRING word_str;
129  word->string_and_lengths(&word_str, nullptr);
130  word_str += " ";
131  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
132  }
133  WERD_CHOICE *adjusted_word = word;
134  adjusted_word->set_permuter(more_args->permuter);
135  update_best_choice(*adjusted_word, best_choice);
136  } else { // search the next letter
137  // Make updated_* point to the next entries in the DawgPositionVector
138  // arrays (that were originally created in dawg_permute_and_select)
139  ++(more_args->updated_dawgs);
140  // Make active_dawgs and constraints point to the updated ones.
141  ++(more_args->active_dawgs);
142  permute_choices(debug, char_choices, char_choice_index + 1,
143  prev_char_frag_info, word, certainties, limit,
144  best_choice, attempts_left, more_args);
145  // Restore previous state to explore another letter in this position.
146  --(more_args->updated_dawgs);
147  --(more_args->active_dawgs);
148  }
149  } else {
150  if (dawg_debug_level) {
151  tprintf("last unichar not OK at index %d in %s\n",
152  word_index, word->debug_string().c_str());
153  }
154  }
155 }
156 
157 
168  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
169  auto *best_choice = new WERD_CHOICE(&getUnicharset());
170  best_choice->make_bad();

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( const UNICHARSET * unicharset,
UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 152 of file dict.h.

153  {
154  if (!last_word_on_line_ || first_pos)
155  return false;
156  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
157  const GenericVector<UNICHAR_ID>& normed_ids =
158  unicharset->normed_ids(unichar_id);
159  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
160  }

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE & word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 162 of file dict.h.

162  {
163  int word_index = word.length() - 1;
164  return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
165  word_index == 0);
166  }

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 139 of file dict.h.

139  {
140  return this->hyphenated() ? hyphen_word_->length() : 0;
141  }

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 135 of file dict.h.

135  { return
136  !last_word_on_line_ && hyphen_word_;
137  }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector * active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 600 of file dict.cpp.

601  {
602  int i;
603  if (hyphenated()) {
604  *active_dawgs = hyphen_active_dawgs_;
605  if (dawg_debug_level >= 3) {
606  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
607  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
608  hyphen_active_dawgs_[i].dawg_index,
609  hyphen_active_dawgs_[i].dawg_ref);
610  }
611  }
612  } else {
613  default_dawgs(active_dawgs, ambigs_mode);
614  }
615 }

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 124 of file dict.h.

124  {
125  const UNICHARSET& unicharset = getUnicharset();
126  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
127  const GenericVector<UNICHAR_ID>& normed_ids =
128  unicharset.normed_ids(unichar_id);
129  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130  }

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).

Definition at line 883 of file dict.cpp.

883  {
884  const UNICHARSET& u_set = getUnicharset();
885  if (u_set.han_sid() > 0) return false;
886  if (u_set.katakana_sid() > 0) return false;
887  if (u_set.thai_sid() > 0) return false;
888  return true;
889 }

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE & WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 460 of file stopper.cpp.

465  {
466  float Certainty;
467  float WorstCertainty = FLT_MAX;
468  float CertaintyThreshold;
469  double TotalCertainty;
470  double TotalCertaintySquared;
471  double Variance;
472  float Mean, StdDev;
473  int word_length = word.length();
474 
475  if (word_length < 3)
476  return true;
477 

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET & unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 376 of file dict.h.

377  {
378  return (this->*letter_is_okay_)(void_dawg_args,
379  unicharset, unichar_id, word_end);
380  }

◆ Load()

void tesseract::Dict::Load ( const STRING lang,
TessdataManager data_file 
)

Definition at line 210 of file dict.cpp.

210  {
211  // Load dawgs_.
212  if (load_punc_dawg) {
213  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
214  dawg_debug_level, data_file);
215  if (punc_dawg_) dawgs_ += punc_dawg_;
216  }
217  if (load_system_dawg) {
218  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
219  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
220  if (system_dawg) dawgs_ += system_dawg;
221  }
222  if (load_number_dawg) {
223  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
224  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
225  if (number_dawg) dawgs_ += number_dawg;
226  }
227  if (load_bigram_dawg) {
228  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
229  dawg_debug_level, data_file);
230  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
231  // dawgs_!!
232  }
233  if (load_freq_dawg) {
234  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
235  dawg_debug_level, data_file);
236  if (freq_dawg_) dawgs_ += freq_dawg_;
237  }
238  if (load_unambig_dawg) {
239  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
240  dawg_debug_level, data_file);
241  if (unambig_dawg_) dawgs_ += unambig_dawg_;
242  }
243 
244  STRING name;
245  if (!user_words_suffix.empty() || !user_words_file.empty()) {
246  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
247  getUnicharset().size(), dawg_debug_level);
248  if (!user_words_file.empty()) {
249  name = user_words_file;
250  } else {
252  name += user_words_suffix;
253  }
254  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
256  tprintf("Error: failed to load %s\n", name.c_str());
257  delete trie_ptr;
258  } else {
259  dawgs_ += trie_ptr;
260  }
261  }
262 
263  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
265  getUnicharset().size(), dawg_debug_level);
266  trie_ptr->initialize_patterns(&(getUnicharset()));
267  if (!user_patterns_file.empty()) {
268  name = user_patterns_file;
269  } else {
271  name += user_patterns_suffix;
272  }
273  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
274  tprintf("Error: failed to load %s\n", name.c_str());
275  delete trie_ptr;
276  } else {
277  dawgs_ += trie_ptr;
278  }
279  }
280 
281  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
282  getUnicharset().size(), dawg_debug_level);
283  dawgs_ += document_words_;
284 
285  // This dawg is temporary and should not be searched by letter_is_ok.
286  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
287  getUnicharset().size(), dawg_debug_level);
288 }

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const STRING lang,
TessdataManager data_file 
)

Definition at line 291 of file dict.cpp.

291  {
292  // Load dawgs_.
293  if (load_punc_dawg) {
294  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
295  dawg_debug_level, data_file);
296  if (punc_dawg_) dawgs_ += punc_dawg_;
297  }
298  if (load_system_dawg) {
299  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
300  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
301  if (system_dawg) dawgs_ += system_dawg;
302  }
303  if (load_number_dawg) {
304  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
305  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
306  if (number_dawg) dawgs_ += number_dawg;
307  }
308 
309  // stolen from Dict::Load (but needs params_ from Tesseract
310  // langdata/config/api):
311  STRING name;
312  if (!user_words_suffix.empty() || !user_words_file.empty()) {
313  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
314  getUnicharset().size(), dawg_debug_level);
315  if (!user_words_file.empty()) {
316  name = user_words_file;
317  } else {
319  name += user_words_suffix;
320  }
321  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
323  tprintf("Error: failed to load %s\n", name.c_str());
324  delete trie_ptr;
325  } else {
326  dawgs_ += trie_ptr;
327  }
328  }
329 
330  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
331  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
332  getUnicharset().size(), dawg_debug_level);
333  trie_ptr->initialize_patterns(&(getUnicharset()));
334  if (!user_patterns_file.empty()) {
335  name = user_patterns_file;
336  } else {
338  name += user_patterns_suffix;
339  }
340  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
341  tprintf("Error: failed to load %s\n", name.c_str());
342  delete trie_ptr;
343  } else {
344  dawgs_ += trie_ptr;
345  }
346  }
347 }

◆ ngram_probability_in_context()

double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 158 of file stopper.cpp.

173  : 1); ++pass) {
174  bool replace = (fix_replaceable && pass == 0);
175  const UnicharAmbigsVector &table = replace ?
177  if (!replace) {
178  // Initialize ambig_blob_choices with lists containing a single
179  // unichar id for the corresponding position in best_choice.
180  // best_choice consisting from only the original letters will
181  // have a rating of 0.0.
182  for (i = 0; i < best_choice->length(); ++i) {
183  auto *lst = new BLOB_CHOICE_LIST();
184  BLOB_CHOICE_IT lst_it(lst);
185  // TODO(rays/antonova) Put real xheights and y shifts here.
186  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
187  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
188  ambig_blob_choices.push_back(lst);
189  }
190  }
191  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
192  int wrong_ngram_index;
193  int next_index;
194  int blob_index = 0;
195  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
196  ++i) {
197  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
198  if (stopper_debug_level > 2) {
199  tprintf("Looking for %s ngrams starting with %s:\n",
200  replace ? "replaceable" : "ambiguous",
201  getUnicharset().debug_str(curr_unichar_id).c_str());
202  }
203  int num_wrong_blobs = best_choice->state(i);
204  wrong_ngram_index = 0;
205  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
206  if (curr_unichar_id == INVALID_UNICHAR_ID ||
207  curr_unichar_id >= table.size() ||
208  table[curr_unichar_id] == nullptr) {
209  continue; // there is no ambig spec for this unichar id
210  }
211  AmbigSpec_IT spec_it(table[curr_unichar_id]);
212  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
213  const AmbigSpec *ambig_spec = spec_it.data();
214  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
215  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
216  ambig_spec->wrong_ngram);
217  if (stopper_debug_level > 2) {
218  tprintf("candidate ngram: ");
220  tprintf("current ngram from spec: ");
221  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
222  tprintf("comparison result: %d\n", compare);
223  }
224  if (compare == 0) {
225  // Record the place where we found an ambiguity.
226  if (fixpt != nullptr) {
227  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
228  fixpt->push_back(DANGERR_INFO(
229  blob_index, blob_index + num_wrong_blobs, replace,
230  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
231  leftmost_id));
232  if (stopper_debug_level > 1) {
233  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
234  blob_index + num_wrong_blobs, false,
235  getUnicharset().get_isngram(
236  ambig_spec->correct_ngram_id),
237  getUnicharset().id_to_unichar(leftmost_id));
238  }
239  }
240 
241  if (replace) {
242  if (stopper_debug_level > 2) {
243  tprintf("replace ambiguity with %s : ",
244  getUnicharset().id_to_unichar(
245  ambig_spec->correct_ngram_id));
247  ambig_spec->correct_fragments, getUnicharset());
248  }
249  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
250  ambig_spec->correct_ngram_id,
251  best_choice, ratings);
252  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
253  // We found dang ambig - update ambig_blob_choices.
254  if (stopper_debug_level > 2) {
255  tprintf("found ambiguity: ");
257  ambig_spec->correct_fragments, getUnicharset());
258  }
259  ambigs_found = true;
260  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
261  ++tmp_index) {
262  // Add a blob choice for the corresponding fragment of the
263  // ambiguity. These fake blob choices are initialized with
264  // negative ratings (which are not possible for real blob
265  // choices), so that dawg_permute_and_select() considers any
266  // word not consisting of only the original letters a better
267  // choice and stops searching for alternatives once such a
268  // choice is found.
269  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
270  bc_it.add_to_end(new BLOB_CHOICE(
271  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
272  -1, 0, 1, 0, BCC_AMBIG));
273  }
274  }
275  spec_it.forward();
276  } else if (compare == -1) {
277  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
278  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
279  // Add the next unichar id to wrong_ngram and keep looking for
280  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
281  wrong_ngram[++wrong_ngram_index] =
282  best_choice->unichar_id(next_index);
283  num_wrong_blobs += best_choice->state(next_index);
284  } else {
285  break; // no more matching ambigs in this AMBIG_SPEC_LIST
286  }
287  } else {
288  spec_it.forward();
289  }
290  } // end searching AmbigSpec_LIST
291  } // end searching best_choice
292  } // end searching replace and dangerous ambigs
293 
294  // If any ambiguities were found permute the constructed ambig_blob_choices
295  // to see if an alternative dictionary word can be found.
296  if (ambigs_found) {
297  if (stopper_debug_level > 2) {
298  tprintf("\nResulting ambig_blob_choices:\n");
299  for (i = 0; i < ambig_blob_choices.size(); ++i) {
300  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
301  tprintf("\n");
302  }
303  }
304  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
305  ambigs_found = (alt_word->rating() < 0.0);
306  if (ambigs_found) {
307  if (stopper_debug_level >= 1) {
308  tprintf ("Stopper: Possible ambiguous word = %s\n",
309  alt_word->debug_string().c_str());
310  }
311  if (fixpt != nullptr) {
312  // Note: Currently character choices combined from fragments can only
313  // be generated by NoDangrousAmbigs(). This code should be updated if
314  // the capability to produce classifications combined from character
315  // fragments is added to other functions.
316  int orig_i = 0;
317  for (i = 0; i < alt_word->length(); ++i) {
318  const UNICHARSET &uchset = getUnicharset();
319  bool replacement_is_ngram =
320  uchset.get_isngram(alt_word->unichar_id(i));
321  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
322  if (replacement_is_ngram) {
323  // we have to extract the leftmost unichar from the ngram.
324  const char *str = uchset.id_to_unichar(leftmost_id);
325  int step = uchset.step(str);
326  if (step) leftmost_id = uchset.unichar_to_id(str, step);
327  }
328  int end_i = orig_i + alt_word->state(i);
329  if (alt_word->state(i) > 1 ||
330  (orig_i + 1 == end_i && replacement_is_ngram)) {
331  // Compute proper blob indices.
332  int blob_start = 0;
333  for (int j = 0; j < orig_i; ++j)
334  blob_start += best_choice->state(j);
335  int blob_end = blob_start;
336  for (int j = orig_i; j < end_i; ++j)
337  blob_end += best_choice->state(j);
338  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
339  replacement_is_ngram, leftmost_id));
340  if (stopper_debug_level > 1) {
341  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
342  true, replacement_is_ngram,
343  uchset.id_to_unichar(leftmost_id));
344  }
345  }
346  orig_i += alt_word->state(i);
347  }
348  }
349  }
350  delete alt_word;
351  }
352  if (output_ambig_words_file_ != nullptr) {
353  fprintf(output_ambig_words_file_, "\n");
354  }
355 
356  ambig_blob_choices.delete_data_pointers();
357  return !ambigs_found;
358 }
359 
360 void Dict::EndDangerousAmbigs() {}
361 
362 #endif // !defined(DISABLED_LEGACY_ENGINE)
363 
365  reject_offset_ = 0.0;
366 }
367 
370 }
371 
372 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 430 of file dict.h.

430 { return dawgs_.size(); }

◆ ParamsModelClassify()

float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 211 of file permdawg.cpp.

213  {
214  BLOB_CHOICE_IT blob_choice_it;
215  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
216  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
217  blob_choice_it.forward()) {
218  (*attempts_left)--;
219  append_choices(debug, char_choices, *(blob_choice_it.data()),
220  char_choice_index, prev_char_frag_info, word,
221  certainties, limit, best_choice, attempts_left, more_args);
222  if (*attempts_left <= 0) {
223  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
224  break;
225  }
226  }
227  }
228 }
229 
239  const char *debug,
240  const BLOB_CHOICE_LIST_VECTOR &char_choices,
241  const BLOB_CHOICE &blob_choice,
242  int char_choice_index,
243  const CHAR_FRAGMENT_INFO *prev_char_frag_info,

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 390 of file dict.h.

393  {
394  return (this->*probability_in_context_)(
395  getCCUtil()->lang.c_str(),
396  context, context_bytes,
397  character, character_bytes);
398  }

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgPosition info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 561 of file dict.cpp.

564  {
565  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
566  // Try to find the edge corresponding to the exact unichar_id and to all the
567  // edges corresponding to the character class of unichar_id.
568  GenericVector<UNICHAR_ID> unichar_id_patterns;
569  unichar_id_patterns.push_back(unichar_id);
570  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
571  &unichar_id_patterns);
572  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
573  // On the first iteration check all the outgoing edges.
574  // On the second iteration check all self-loops.
575  for (int k = 0; k < 2; ++k) {
576  EDGE_REF edge =
577  (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
578  : dawg->pattern_loop_edge(pos.dawg_ref,
579  unichar_id_patterns[i], word_end);
580  if (edge == NO_EDGE) continue;
581  if (dawg_debug_level >= 3) {
582  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
583  pos.dawg_index, node, edge);
584  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
585  }
586  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
587  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
588  dawg_args->updated_dawgs->add_unique(
589  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
590  pos.back_to_punc),
591  dawg_debug_level > 0,
592  "Append current dawg to updated active dawgs: ");
593  }
594  }
595 }

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 386 of file stopper.cpp.

395  {
396  begin_blob_index += werd_choice->state(i);
397  }
398  }
399  new_certainty /= wrong_ngram_size;
400  // If there is no entry in the ratings matrix, add it.
401  MATRIX_COORD coord(begin_blob_index,
402  begin_blob_index + num_blobs_to_replace - 1);
403  if (!coord.Valid(*ratings)) {
404  ratings->IncreaseBandSize(coord.row - coord.col + 1);
405  }
406  if (ratings->get(coord.col, coord.row) == nullptr)
407  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
408  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
409  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
410  if (choice != nullptr) {
411  // Already there. Upgrade if new rating better.
412  if (new_rating < choice->rating())
413  choice->set_rating(new_rating);
414  if (new_certainty < choice->certainty())
415  choice->set_certainty(new_certainty);
416  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
417  } else {
418  // Need a new choice with the correct_ngram_id.
419  choice = new BLOB_CHOICE(*old_choice);
420  choice->set_unichar_id(correct_ngram_id);
421  choice->set_rating(new_rating);
422  choice->set_certainty(new_certainty);
423  choice->set_classifier(BCC_AMBIG);
424  choice->set_matrix_cell(coord.col, coord.row);
425  BLOB_CHOICE_IT it (new_choices);
426  it.add_to_end(choice);
427  }
428  // Remove current unichar from werd_choice. On the last iteration
429  // set the correct replacement unichar instead of removing a unichar.
430  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
431  ++replaced_count) {
432  if (replaced_count + 1 == wrong_ngram_size) {
433  werd_choice->set_blob_choice(wrong_ngram_begin_index,
434  num_blobs_to_replace, choice);
435  } else {
436  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
437  }
438  }
439  if (stopper_debug_level >= 1) {
440  werd_choice->print("ReplaceAmbig() ");
441  tprintf("Modified blob_choices: ");
442  print_ratings_list("\n", new_choices, getUnicharset());
443  }
444 }
445 
446 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
447  int shortest = INT32_MAX;
448  int curr_len = 0;
449  for (int w = 0; w < WordChoice.length(); ++w) {
450  if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
451  curr_len++;
452  } else if (curr_len > 0) {
453  if (curr_len < shortest) shortest = curr_len;
454  curr_len = 0;
455  }
456  }
457  if (curr_len > 0 && curr_len < shortest) {
458  shortest = curr_len;

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line, and the current one is not (thus it is the first one on the line), erase hyphen_word_, clear hyphen_active_dawgs_, update last_word_on_line_.

Definition at line 42 of file hyphen.cpp.

45  {
46  if (hyphen_word_ == nullptr) {
47  hyphen_word_ = new WERD_CHOICE(word.unicharset());
48  hyphen_word_->make_bad();
49  }
50  if (hyphen_word_->rating() > word.rating()) {
51  *hyphen_word_ = word;
52  // Remove the last unichar id as it is a hyphen, and remove
53  // any unichar_string/lengths that are present.
54  hyphen_word_->remove_last_unichar_id();
55  hyphen_active_dawgs_ = active_dawgs;

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 326 of file dict.h.

326  {
327  if (pending_words_ != nullptr)
328  pending_words_->clear();
329  if (document_words_ != nullptr)
330  document_words_->clear();
331  }

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

Update hyphen_word_, and copy the given DawgPositionVector into hyphen_active_dawgs_.

Definition at line 59 of file hyphen.cpp.

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 378 of file stopper.cpp.

383  {

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 382 of file stopper.cpp.

383  {
384  if (i >= wrong_ngram_begin_index) {

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache dawg_cache)

Definition at line 192 of file dict.cpp.

192  {
193  if (dawgs_.size() != 0) this->End();
194 
195  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
196  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
197  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
198  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
199 
200  if (dawg_cache != nullptr) {
201  dawg_cache_ = dawg_cache;
202  dawg_cache_is_ours_ = false;
203  } else {
204  dawg_cache_ = new DawgCache();
205  dawg_cache_is_ours_ = true;
206  }
207 }

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 427 of file dict.h.

427 { wildcard_unichar_id_ = id; }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 510 of file dict.h.

510  {
511  wordseg_rating_adjust_factor_ = f;
512  }

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 479 of file stopper.cpp.

479  {
480  Certainty = word.certainty(i);
481  TotalCertainty += Certainty;
482  TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483  if (Certainty < WorstCertainty)
484  WorstCertainty = Certainty;
485  }
486 
487  // Subtract off worst certainty from statistics.
488  word_length--;
489  TotalCertainty -= WorstCertainty;
490  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
491 
492  Mean = TotalCertainty / word_length;
493  Variance = ((word_length * TotalCertaintySquared -
494  TotalCertainty * TotalCertainty) /
495  (word_length * (word_length - 1)));
496  if (Variance < 0.0)
497  Variance = 0.0;
498  StdDev = sqrt(Variance);
499 
500  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
501  if (CertaintyThreshold > stopper_nondict_certainty_base)
502  CertaintyThreshold = stopper_nondict_certainty_base;
503 
504  if (word.certainty() < CertaintyThreshold) {
505  if (stopper_debug_level >= 1)
506  tprintf("Stopper: Non-uniform certainty = %4.1f"
507  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
508  word.certainty(), Mean, StdDev, CertaintyThreshold);
509  return false;
510  } else {
511  return true;
512  }
513 }
514 
515 } // namespace tesseract

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE word,
WERD_CHOICE best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 182 of file dict.h.

183  {
184  if (word.rating() < best_choice->rating()) {
185  *best_choice = word;
186  }
187  }

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 813 of file dict.cpp.

814  {
815  if (bigram_dawg_ == nullptr) return false;
816 
817  // Extract the core word from the middle of each word with any digits
818  // replaced with question marks.
819  int w1start, w1end, w2start, w2end;
820  word1.punct_stripped(&w1start, &w1end);
821  word2.punct_stripped(&w2start, &w2end);
822 
823  // We don't want to penalize a single guillemet, hyphen, etc.
824  // But our bigram list doesn't have any information about punctuation.
825  if (w1start >= w1end) return word1.length() < 3;
826  if (w2start >= w2end) return word2.length() < 3;
827 
828  const UNICHARSET& uchset = getUnicharset();
829  GenericVector<UNICHAR_ID> bigram_string;
830  bigram_string.reserve(w1end + w2end + 1);
831  for (int i = w1start; i < w1end; i++) {
832  const GenericVector<UNICHAR_ID>& normed_ids =
833  getUnicharset().normed_ids(word1.unichar_id(i));
834  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
835  bigram_string.push_back(question_unichar_id_);
836  else
837  bigram_string += normed_ids;
838  }
839  bigram_string.push_back(UNICHAR_SPACE);
840  for (int i = w2start; i < w2end; i++) {
841  const GenericVector<UNICHAR_ID>& normed_ids =
842  getUnicharset().normed_ids(word2.unichar_id(i));
843  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
844  bigram_string.push_back(question_unichar_id_);
845  else
846  bigram_string += normed_ids;
847  }
848  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
849  for (int i = 0; i < bigram_string.size(); ++i) {
850  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
851  0.0f);
852  }
853  return bigram_dawg_->word_in_dawg(normalized_word);
854 }

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 856 of file dict.cpp.

856  {
857  if (word.length() == 0) return NO_PERM;
858  int i;
859  WERD_CHOICE new_word(word.unicharset());
860  int last_index = word.length() - 1;
861  int new_len = 0;
862  for (i = 0; i <= last_index; ++i) {
863  UNICHAR_ID unichar_id = (word.unichar_id(i));
864  if (getUnicharset().get_ispunctuation(unichar_id)) {
865  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
866  } else if (!getUnicharset().get_isalpha(unichar_id) &&
867  !getUnicharset().get_isdigit(unichar_id)) {
868  return false; // neither punc, nor alpha, nor digit
869  } else if ((new_len = new_word.length()) == 0 ||
870  new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
871  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
872  }
873  }
874  for (i = 0; i < dawgs_.size(); ++i) {
875  if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
876  dawgs_[i]->word_in_dawg(new_word))
877  return true;
878  }
879  return false;
880 }

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 488 of file dict.h.

488  {
489  WERD_CHOICE word(string, getUnicharset());
490  return valid_word(word);
491  }

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word) const
inline

Definition at line 481 of file dict.h.

481  {
482  return valid_word(word, false); // return NO_PERM for words with digits
483  }

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word,
bool  numbers_ok 
) const

Definition at line 778 of file dict.cpp.

778  {
779  const WERD_CHOICE* word_ptr = &word;
780  WERD_CHOICE temp_word(word.unicharset());
781  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
782  copy_hyphen_info(&temp_word);
783  temp_word += word;
784  word_ptr = &temp_word;
785  }
786  if (word_ptr->length() == 0) return NO_PERM;
787  // Allocate vectors for holding current and updated
788  // active_dawgs and initialize them.
789  auto* active_dawgs = new DawgPositionVector[2];
790  init_active_dawgs(&(active_dawgs[0]), false);
791  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
792  int last_index = word_ptr->length() - 1;
793  // Call letter_is_okay for each letter in the word.
794  for (int i = hyphen_base_size(); i <= last_index; ++i) {
795  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
796  word_ptr->unichar_id(i), i == last_index)))
797  break;
798  // Swap active_dawgs, constraints with the corresponding updated vector.
799  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
800  dawg_args.updated_dawgs = &(active_dawgs[0]);
801  ++(dawg_args.active_dawgs);
802  } else {
803  ++(dawg_args.updated_dawgs);
804  dawg_args.active_dawgs = &(active_dawgs[0]);
805  }
806  }
807  delete[] active_dawgs;
808  return valid_word_permuter(dawg_args.permuter, numbers_ok)
809  ? dawg_args.permuter
810  : NO_PERM;
811 }

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE word) const
inline

Definition at line 484 of file dict.h.

484  {
485  return valid_word(word, true); // return NUMBER_PERM for valid numbers
486  }

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inlinestatic

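Returns true if perm is one of the permuter types that mark a dictionary word (SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM or COMPOUND_PERM), or NUMBER_PERM when numbers_ok is true. valid_word() applies it to the permuter obtained after checking all the DAWGs to see if the word is in any of them.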

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 474 of file dict.h.

474  {
475  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
476  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
477  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
478  (numbers_ok && perm == NUMBER_PERM));
479  }

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 428 of file dict.h.

428 { return wildcard_unichar_id_; }

Member Data Documentation

◆ certainty_scale

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 627 of file dict.h.

◆ dawg_debug_level

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info, to 2 for more details, to 3 to see all the debug messages"

Definition at line 622 of file dict.h.

◆ doc_dict_certainty_threshold

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty for words that can be inserted into the document dictionary"

Definition at line 653 of file dict.h.

◆ doc_dict_pending_threshold

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 651 of file dict.h.

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 216 of file dict.h.

◆ hyphen_debug_level

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 623 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 372 of file dict.h.

◆ load_bigram_dawg

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 592 of file dict.h.

◆ load_freq_dawg

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 586 of file dict.h.

◆ load_number_dawg

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 590 of file dict.h.

◆ load_punc_dawg

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 589 of file dict.h.

◆ load_system_dawg

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 585 of file dict.h.

◆ load_unambig_dawg

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 587 of file dict.h.

◆ max_permuter_attempts

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different character choices to consider during permutation. This limit is especially useful when user patterns are specified, since overly generic patterns can result in dawg search exploring an overly large number of options."

Definition at line 658 of file dict.h.

◆ output_ambig_words_file

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 620 of file dict.h.

◆ params_model_classify_

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

Definition at line 418 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 384 of file dict.h.

◆ save_doc_words

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 649 of file dict.h.

◆ segment_nonalphabetic_script

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks. Set to true in the traineddata config file for scripts that are cursive or inherently fixed-pitch"

Definition at line 648 of file dict.h.

◆ segment_penalty_dict_case_bad

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have case issues (lower is better)."

Definition at line 609 of file dict.h.

◆ segment_penalty_dict_case_ok

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case (lower is better)."

Definition at line 605 of file dict.h.

◆ segment_penalty_dict_frequent_word

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and are frequent in the given language (lower is better)."

Definition at line 601 of file dict.h.

◆ segment_penalty_dict_nonword

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which do not match a dictionary word (lower is better)."

Definition at line 613 of file dict.h.

◆ segment_penalty_garbage

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in the dictionary and generally look like garbage (lower is better)."

Definition at line 618 of file dict.h.

◆ stopper_allowable_character_badness

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 637 of file dict.h.

◆ stopper_certainty_per_char

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 635 of file dict.h.

◆ stopper_debug_level

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 638 of file dict.h.

◆ stopper_no_acceptable_choices

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful when there is a need to explore all segmentations"

Definition at line 641 of file dict.h.

◆ stopper_nondict_certainty_base

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 629 of file dict.h.

◆ stopper_phase2_certainty_rejection_offset

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 631 of file dict.h.

◆ stopper_smallword_size

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 633 of file dict.h.

◆ tessedit_truncate_wordchoice_log

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 642 of file dict.h.

◆ use_only_first_uft8_step

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string when computing log probabilities."

Definition at line 626 of file dict.h.

◆ user_patterns_file

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 582 of file dict.h.

◆ user_patterns_suffix

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 584 of file dict.h.

◆ user_words_file

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class.

"A filename of user-provided words."

Definition at line 578 of file dict.h.

◆ user_words_suffix

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 580 of file dict.h.

◆ word_to_debug

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information should be printed to stdout"

Definition at line 644 of file dict.h.

◆ xheight_penalty_inconsistent

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is inconsistent."

Definition at line 598 of file dict.h.

◆ xheight_penalty_subscripts

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts or superscripts in a word, but it is otherwise OK."

Definition at line 595 of file dict.h.


The documentation for this class was generated from the following files:
tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:95
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
tesseract::Dict::ProcessPatternEdges
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:561
tesseract::DAWG_TYPE_PUNCTUATION
Definition: dawg.h:67
CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:46
tesseract::Dict::max_permuter_attempts
int max_permuter_attempts
Definition: dict.h:658
tesseract::XH_SUBNORMAL
Definition: dict.h:78
WERD_CHOICE::set_adjust_factor
void set_adjust_factor(float factor)
Definition: ratngs.h:297
Mean
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:602
tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
tesseract::Dict::segment_penalty_dict_case_bad
double segment_penalty_dict_case_bad
Definition: dict.h:609
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:45
tesseract::CASE_AMBIG
Definition: ambigs.h:42
tesseract::Dict::probability_in_context_
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:384
tesseract::Dict::params_model_classify_
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:418
tesseract::Dict::output_ambig_words_file
char * output_ambig_words_file
Definition: dict.h:620
tesseract::Dict::UniformCertainties
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:479
tesseract::TESSDATA_SYSTEM_DAWG
Definition: tessdatamanager.h:64
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
tesseract::Dict::xheight_penalty_subscripts
double xheight_penalty_subscripts
Definition: dict.h:595
tesseract::TESSDATA_BIGRAM_DAWG
Definition: tessdatamanager.h:71
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dict::stopper_certainty_per_char
double stopper_certainty_per_char
Definition: dict.h:635
tesseract::Dict::dawg_permute_and_select
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:182
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
tesseract::Dict::segment_penalty_garbage
double segment_penalty_garbage
Definition: dict.h:618
WERD_CHOICE::GetTopScriptID
int GetTopScriptID() const
Definition: ratngs.cpp:669
tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
tesseract::Dict::doc_dict_certainty_threshold
double doc_dict_certainty_threshold
Definition: dict.h:653
WERD_CHOICE::set_certainty
void set_certainty(float new_val)
Definition: ratngs.h:360
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::DAWG_TYPE_NUMBER
Definition: dawg.h:69
FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:182
tesseract::Dict::word_to_debug
char * word_to_debug
Definition: dict.h:644
tesseract::UnicharAmbigsVector
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:134
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:431
MATRIX
Definition: matrix.h:574
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
NO_PERM
Definition: ratngs.h:231
STRING
Definition: strngs.h:45
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
DANGERR_INFO
Definition: stopper.h:33
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:448
WERD_RES
Definition: pageres.h:160
tesseract::Dict::EndDangerousAmbigs
void EndDangerousAmbigs()
Definition: stopper.cpp:374
MAX_AMBIG_SIZE
#define MAX_AMBIG_SIZE
Definition: ambigs.h:31
tesseract::XH_GOOD
Definition: dict.h:78
UNICHARSET::get_isngram
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:516
COMPOUND_PERM
Definition: ratngs.h:243
MATRIX::IncreaseBandSize
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:47
tesseract::DawgCache::FreeDawg
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:440
tesseract::Dict::def_probability_in_context
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:401
tesseract::Dict::user_patterns_file
char * user_patterns_file
Definition: dict.h:582
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
tesseract::Dict::SettupStopperPass1
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:378
tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:307
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
tesseract::CCUtil::language_data_path_prefix
STRING language_data_path_prefix
Definition: ccutil.h:56
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
tesseract::Dict::segment_penalty_dict_nonword
double segment_penalty_dict_nonword
Definition: dict.h:613
tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
tesseract::TESSDATA_LSTM_SYSTEM_DAWG
Definition: tessdatamanager.h:76
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
tesseract::Dict::xheight_penalty_inconsistent
double xheight_penalty_inconsistent
Definition: dict.h:598
BCC_AMBIG
Definition: ratngs.h:45
BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
tesseract::TESSDATA_PUNC_DAWG
Definition: tessdatamanager.h:63
BLOB_CHOICE::set_classifier
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:155
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:641
tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:116
tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
tesseract::Dict::load_system_dawg
bool load_system_dawg
Definition: dict.h:585
BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:142
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
tesseract::SuccessorList
GenericVector< int > SuccessorList
Definition: dawg.h:63
tesseract::Dict::user_words_suffix
char * user_words_suffix
Definition: dict.h:580
WERD_CHOICE::punct_stripped
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:385
BLOB_CHOICE::set_matrix_cell
void set_matrix_cell(int col, int row)
Definition: ratngs.h:151
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
STRING_MEMBER
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:318
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Dict::End
void End()
Definition: dict.cpp:372
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:145
tesseract::Trie::clear
void clear()
Definition: trie.cpp:71
tesseract::DawgCache::GetSquishedDawg
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:44
tesseract::Dict::permute_choices
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:211
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
tesseract::Dict::stopper_allowable_character_badness
double stopper_allowable_character_badness
Definition: dict.h:637
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
tesseract::Dict::LengthOfShortestAlphaRun
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:460
UNICHARSET::thai_sid
int thai_sid() const
Definition: unicharset.h:882
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
tesseract::TESSDATA_LSTM_NUMBER_DAWG
Definition: tessdatamanager.h:77
UNICHAR_SPACE
Definition: unicharset.h:34
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
tesseract::XH_INCONSISTENT
Definition: dict.h:78
tesseract::Dict::segment_penalty_dict_case_ok
double segment_penalty_dict_case_ok
Definition: dict.h:605
tesseract::Dict::def_letter_is_okay
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:395
tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:627
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::Dict::load_punc_dawg
bool load_punc_dawg
Definition: dict.h:589
UNICHARSET
Definition: unicharset.h:145
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
tesseract::Dict::load_number_dawg
bool load_number_dawg
Definition: dict.h:590
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
tesseract::Dict::stopper_smallword_size
int stopper_smallword_size
Definition: dict.h:633
tesseract::Dict::valid_punctuation
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:856
tesseract::Trie::add_word_to_dawg
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:183
tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:55
tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:622
tesseract::Dict::segment_penalty_dict_frequent_word
double segment_penalty_dict_frequent_word
Definition: dict.h:601
character
Definition: mfoutline.h:62
WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:357
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
tesseract::Dict::ReplaceAmbig
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:386
tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
USER_PATTERN_PERM
Definition: ratngs.h:238
tesseract::UnicharIdArrayUtils::compare
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:55
GenericVector< DANGERR_INFO >
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
STRING_INIT_MEMBER
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:48
tesseract::UnicharIdArrayUtils::print
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:91
tesseract::CCUtil::unichar_ambigs
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:59
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
WERD_CHOICE::remove_last_unichar_id
void remove_last_unichar_id()
Definition: ratngs.h:471
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
tesseract::TESSDATA_UNAMBIG_DAWG
Definition: tessdatamanager.h:72
tesseract::TESSDATA_LSTM_PUNC_DAWG
Definition: tessdatamanager.h:75
tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
tesseract::Dict::use_only_first_uft8_step
bool use_only_first_uft8_step
Definition: dict.h:626
tesseract::Dict::doc_dict_pending_threshold
double doc_dict_pending_threshold
Definition: dict.h:651
CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:44
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
tesseract::Dict::stopper_phase2_certainty_rejection_offset
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:631
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152
BLOB_CHOICE
Definition: ratngs.h:49
MATRIX_COORD
Definition: matrix.h:604
CHAR_FRAGMENT_INFO
Definition: dict.h:43
GenericVector::get
T & get(int index) const
Definition: genericvector.h:716
tesseract::Dict::save_doc_words
bool save_doc_words
Definition: dict.h:649
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:61
print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:835
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
tesseract::Dict::load_bigram_dawg
bool load_bigram_dawg
Definition: dict.h:592
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
tesseract::UnicharAmbigs::replace_ambigs
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:146
tesseract::DAWG_TYPE_PATTERN
Definition: dawg.h:70
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::Dict::append_choices
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:253
tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778
tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:158
CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:47
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
WERD_CHOICE::remove_unichar_id
void remove_unichar_id(int index)
Definition: ratngs.h:472
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::Dict::hyphen_debug_level
int hyphen_debug_level
Definition: dict.h:623
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
DOC_DAWG_PERM
Definition: ratngs.h:240
PUNC_PERM
Definition: ratngs.h:232
tesseract::Dict::load_freq_dawg
bool load_freq_dawg
Definition: dict.h:586
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Dict::user_words_file
char * user_words_file
Definition: dict.h:578
tesseract::Dict::stopper_nondict_certainty_base
double stopper_nondict_certainty_base
Definition: dict.h:629
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
tesseract::Dict::SettupStopperPass2
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:382
BLOB_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:139
WERD_CHOICE::set_blob_choice
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:314
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::TESSDATA_NUMBER_DAWG
Definition: tessdatamanager.h:65
UNICHARSET::size
int size() const
Definition: unicharset.h:341
WERD_CHOICE::append_unichar_id
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:470
tesseract::Dict::segment_nonalphabetic_script
bool segment_nonalphabetic_script
Definition: dict.h:648
NUMBER_PERM
Definition: ratngs.h:237
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50
tesseract::Dict::user_patterns_suffix
char * user_patterns_suffix
Definition: dict.h:584
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Dict::load_unambig_dawg
bool load_unambig_dawg
Definition: dict.h:587
tesseract::TESSDATA_FREQ_DAWG
Definition: tessdatamanager.h:66
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327
tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372