tesseract  5.0.0-alpha-619-ge9db
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #ifndef DISABLED_LEGACY_ENGINE
23 #include "ambigs.h"
24 #endif
25 #include "dawg.h"
26 #include "dawg_cache.h"
27 #include "ratngs.h"
28 #include "stopper.h"
29 #include "trie.h"
30 #include "unicharset.h"
31 #ifndef DISABLED_LEGACY_ENGINE
33 #endif // ndef DISABLED_LEGACY_ENGINE
34 
35 class MATRIX;
36 class WERD_RES;
37 
// Legacy sizing constants and the "no rating" sentinel.
// NOTE(review): the meanings are inferred from the macro names only; confirm
// against the call sites before relying on them.
//
// MAX_WERD_LENGTH and NO_RATING are wrapped in parentheses so that each one
// expands to a single operand. The bare forms `(int64_t) 128` and `-1` can
// re-associate with neighbouring tokens (for example `sizeof MAX_WERD_LENGTH`
// does not even parse with the unparenthesized cast).
// CHARS_PER_LINE is a plain literal and is deliberately left token-identical.
#define CHARS_PER_LINE 500
#define MAX_WERD_LENGTH (static_cast<int64_t>(128))
#define NO_RATING (-1)
41 
47  float rating;
48  float certainty;
49 };
50 
51 namespace tesseract {
52 
54 
55 //
56 // Constants
57 //
// Dictionary-wide constants. They are declared constexpr so that every one of
// them (including the float values) is a genuine compile-time constant; the
// float literals carry an `f` suffix so no double-to-float conversion is
// involved in their initialization.
static constexpr int kRatingPad = 4;
static constexpr char kDictWildcard[] = "\u2606";  // WHITE STAR
static constexpr int kDictMaxWildcards = 2;  // max wildcards for a word
// TODO(daria): If hyphens are different in different languages and can be
// inferred from training data we should load their values dynamically.
static constexpr char kHyphenSymbol[] = "-";
static constexpr char kSlashSymbol[] = "/";
static constexpr char kQuestionSymbol[] = "?";
static constexpr char kApostropheSymbol[] = "'";
static constexpr float kSimCertaintyScale = -10.0f;   // similarity matcher scaling
static constexpr float kSimCertaintyOffset = -10.0f;  // similarity matcher offset
static constexpr float kSimilarityFloor = 100.0f;  // worst E*L product to stop on
static constexpr int kDocDictMaxRepChars = 4;
71 
72 // Enum for describing whether the x-height for the word is consistent:
73 // 0 - everything is good.
74 // 1 - there are one or two secondary (but consistent) baselines
75 // [think subscript and superscript], or there is an oversized
76 // first character.
77 // 2 - the word is inconsistent.
79 
80 struct DawgArgs {
82  : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
83 
87  // True if the current position is a valid word end.
88  bool valid_end;
89 };
90 
91 class Dict {
92  public:
93  Dict(CCUtil* image_ptr);
94  ~Dict();
95  const CCUtil* getCCUtil() const {
96  return ccutil_;
97  }
99  return ccutil_;
100  }
101  const UNICHARSET& getUnicharset() const {
102  return getCCUtil()->unicharset;
103  }
105  return getCCUtil()->unicharset;
106  }
107 #ifndef DISABLED_LEGACY_ENGINE
109  return getCCUtil()->unichar_ambigs;
110  }
111 #endif
112  // Returns true if unichar_id is a word compounding character like - or /.
113  inline bool compound_marker(UNICHAR_ID unichar_id) {
114  const UNICHARSET& unicharset = getUnicharset();
115  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
116  const GenericVector<UNICHAR_ID>& normed_ids =
117  unicharset.normed_ids(unichar_id);
118  return normed_ids.size() == 1 &&
119  (normed_ids[0] == hyphen_unichar_id_ ||
120  normed_ids[0] == slash_unichar_id_);
121  }
122  // Returns true if unichar_id is an apostrophe-like character that may
123  // separate prefix/suffix words from a main body word.
124  inline bool is_apostrophe(UNICHAR_ID unichar_id) {
125  const UNICHARSET& unicharset = getUnicharset();
126  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
127  const GenericVector<UNICHAR_ID>& normed_ids =
128  unicharset.normed_ids(unichar_id);
129  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130  }
131 
132  /* hyphen.cpp ************************************************************/
133 
135  inline bool hyphenated() const { return
136  !last_word_on_line_ && hyphen_word_;
137  }
139  inline int hyphen_base_size() const {
140  return this->hyphenated() ? hyphen_word_->length() : 0;
141  }
145  inline void copy_hyphen_info(WERD_CHOICE *word) const {
146  if (this->hyphenated()) {
147  *word = *hyphen_word_;
148  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
149  }
150  }
152  inline bool has_hyphen_end(const UNICHARSET* unicharset,
153  UNICHAR_ID unichar_id, bool first_pos) const {
154  if (!last_word_on_line_ || first_pos)
155  return false;
156  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
157  const GenericVector<UNICHAR_ID>& normed_ids =
158  unicharset->normed_ids(unichar_id);
159  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
160  }
162  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
163  int word_index = word.length() - 1;
164  return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
165  word_index == 0);
166  }
170  void reset_hyphen_vars(bool last_word_on_line);
173  void set_hyphen_word(const WERD_CHOICE &word,
174  const DawgPositionVector &active_dawgs);
175 
176  /* permdawg.cpp ************************************************************/
177  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
178  // When this function is refactored, permdawg.cpp can be removed.
179 
182  inline void update_best_choice(const WERD_CHOICE &word,
183  WERD_CHOICE *best_choice) {
184  if (word.rating() < best_choice->rating()) {
185  *best_choice = word;
186  }
187  }
191  void init_active_dawgs(DawgPositionVector *active_dawgs,
192  bool ambigs_mode) const;
193  // Fill the given vector with the default collection of any-length dawgs
194  void default_dawgs(DawgPositionVector *anylength_dawgs,
195  bool suppress_patterns) const;
196 
197 
204  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
208  void go_deeper_dawg_fxn(
209  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
210  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
211  bool word_ending, WERD_CHOICE *word, float certainties[],
212  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
213  void *void_more_args);
214 
216  void (Dict::*go_deeper_fxn_)(const char *debug,
217  const BLOB_CHOICE_LIST_VECTOR &char_choices,
218  int char_choice_index,
219  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
220  bool word_ending, WERD_CHOICE *word,
221  float certainties[], float *limit,
222  WERD_CHOICE *best_choice, int *attempts_left,
223  void *void_more_args);
224  //
225  // Helper functions for dawg_permute_and_select().
226  //
227  void permute_choices(
228  const char *debug,
229  const BLOB_CHOICE_LIST_VECTOR &char_choices,
230  int char_choice_index,
231  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
232  WERD_CHOICE *word,
233  float certainties[],
234  float *limit,
235  WERD_CHOICE *best_choice,
236  int *attempts_left,
237  void *more_args);
238 
239  void append_choices(
240  const char *debug,
241  const BLOB_CHOICE_LIST_VECTOR &char_choices,
242  const BLOB_CHOICE &blob_choice,
243  int char_choice_index,
244  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
245  WERD_CHOICE *word,
246  float certainties[],
247  float *limit,
248  WERD_CHOICE *best_choice,
249  int *attempts_left,
250  void *more_args);
251 
252  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
253  float curr_rating, float curr_certainty,
254  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
255  const char *debug, int word_ending,
256  CHAR_FRAGMENT_INFO *char_frag_info);
257 
258  /* stopper.cpp *************************************************************/
259 #if !defined(DISABLED_LEGACY_ENGINE)
260  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
261  DANGERR *fixpt,
262  bool fix_replaceable,
263  MATRIX* ratings);
264 #endif // !defined(DISABLED_LEGACY_ENGINE)
265  // Replaces the corresponding wrong ngram in werd_choice with the correct
266  // one. The whole correct n-gram is inserted into the ratings matrix and
267  // the werd_choice: no more fragments!. Rating and certainty of new entries
268  // in matrix and werd_choice are the sum and mean of the wrong ngram
269  // respectively.
270  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
271  // mystring", with a new entry in the ratings matrix for ".
272  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
273  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
274  MATRIX *ratings);
275 
277  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
285  int UniformCertainties(const WERD_CHOICE& word);
287  bool AcceptableChoice(const WERD_CHOICE& best_choice,
288  XHeightConsistencyEnum xheight_consistency);
292  bool AcceptableResult(WERD_RES *word) const;
293 #if !defined(DISABLED_LEGACY_ENGINE)
294  void EndDangerousAmbigs();
295 #endif // !defined(DISABLED_LEGACY_ENGINE)
296  void DebugWordChoices();
299  void SettupStopperPass1();
301  void SettupStopperPass2();
302  /* context.cpp *************************************************************/
304  int case_ok(const WERD_CHOICE& word) const;
307  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
308 
309  /* dict.cpp ****************************************************************/
310 
314  // Sets up ready for a Load or LoadLSTM.
315  void SetupForLoad(DawgCache *dawg_cache);
316  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
317  void Load(const STRING &lang, TessdataManager *data_file);
318  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
319  void LoadLSTM(const STRING &lang, TessdataManager *data_file);
320  // Completes the loading process after Load() and/or LoadLSTM().
321  // Returns false if no dictionaries were loaded.
322  bool FinishLoad();
323  void End();
324 
325  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
327  if (pending_words_ != nullptr)
328  pending_words_->clear();
329  if (document_words_ != nullptr)
330  document_words_->clear();
331  }
332 
368  //
369  int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
370  UNICHAR_ID unichar_id, bool word_end) const;
371 
372  int (Dict::*letter_is_okay_)(void* void_dawg_args,
373  const UNICHARSET& unicharset,
374  UNICHAR_ID unichar_id, bool word_end) const;
376  int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
377  UNICHAR_ID unichar_id, bool word_end) const {
378  return (this->*letter_is_okay_)(void_dawg_args,
379  unicharset, unichar_id, word_end);
380  }
381 
382 
384  double (Dict::*probability_in_context_)(const char* lang,
385  const char* context,
386  int context_bytes,
387  const char* character,
388  int character_bytes);
390  double ProbabilityInContext(const char* context,
391  int context_bytes,
392  const char* character,
393  int character_bytes) {
394  return (this->*probability_in_context_)(
395  getCCUtil()->lang.c_str(),
396  context, context_bytes,
397  character, character_bytes);
398  }
399 
402  const char* lang, const char* context, int context_bytes,
403  const char* character, int character_bytes) {
404  (void)lang;
405  (void)context;
406  (void)context_bytes;
407  (void)character;
408  (void)character_bytes;
409  return 0.0;
410  }
411  double ngram_probability_in_context(const char* lang,
412  const char* context,
413  int context_bytes,
414  const char* character,
415  int character_bytes);
416 
417  // Interface with params model.
418  float (Dict::*params_model_classify_)(const char *lang, void *path);
419  float ParamsModelClassify(const char *lang, void *path);
420  // Call params_model_classify_ member function.
421  float CallParamsModelClassify(void *path) {
422  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
423  return (this->*params_model_classify_)(
424  getCCUtil()->lang.c_str(), path);
425  }
426 
427  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
428  inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
430  inline int NumDawgs() const { return dawgs_.size(); }
432  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
434  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
436  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
438  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
439  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
440  NODE_REF node = dawg->next_node(edge_ref);
441  if (node == 0) node = NO_EDGE; // end of word
442  return node;
443  }
444 
445  // Given a unichar from a string and a given dawg, return the unichar
446  // we should use to match in that dawg type. (for example, in the number
447  // dawg, all numbers are transformed to kPatternUnicharId).
449  const Dawg *dawg) const {
450  if (!dawg) return ch;
451  switch (dawg->type()) {
452  case DAWG_TYPE_NUMBER:
453  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
454  default:
455  return ch;
456  }
457  }
458 
464  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
465  UNICHAR_ID unichar_id, bool word_end,
466  DawgArgs *dawg_args,
467  PermuterType *current_permuter) const;
468 
472 
474  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
475  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
476  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
477  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
478  (numbers_ok && perm == NUMBER_PERM));
479  }
480  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
481  int valid_word(const WERD_CHOICE &word) const {
482  return valid_word(word, false); // return NO_PERM for words with digits
483  }
484  int valid_word_or_number(const WERD_CHOICE &word) const {
485  return valid_word(word, true); // return NUMBER_PERM for valid numbers
486  }
488  int valid_word(const char *string) const {
489  WERD_CHOICE word(string, getUnicharset());
490  return valid_word(word);
491  }
492  // Do the two WERD_CHOICEs form a meaningful bigram?
493  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
498  bool valid_punctuation(const WERD_CHOICE &word);
500  int good_choice(const WERD_CHOICE &choice);
502  void add_document_word(const WERD_CHOICE &best_choice);
504  void adjust_word(WERD_CHOICE *word,
505  bool nonword, XHeightConsistencyEnum xheight_consistency,
506  float additional_adjust,
507  bool modify_rating,
508  bool debug);
510  inline void SetWordsegRatingAdjustFactor(float f) {
511  wordseg_rating_adjust_factor_ = f;
512  }
514  bool IsSpaceDelimitedLang() const;
515 
516  private:
518  CCUtil* ccutil_;
525 #ifndef DISABLED_LEGACY_ENGINE
526  UnicharAmbigs* dang_ambigs_table_ = nullptr;
528  UnicharAmbigs* replace_ambigs_table_ = nullptr;
529 #endif
530 
531  float reject_offset_;
532  // Cached UNICHAR_IDs:
533  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
534  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
535  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
536  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
537  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
538  // Hyphen-related variables.
539  WERD_CHOICE *hyphen_word_;
540  DawgPositionVector hyphen_active_dawgs_;
541  bool last_word_on_line_;
542  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
543  // matching. The first member of each list is taken as canonical. For
544  // example, the first list contains hyphens and dashes with the first symbol
545  // being the ASCII hyphen minus.
546  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
547  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
548  DawgCache *dawg_cache_;
549  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
550  // Dawgs.
551  DawgVector dawgs_;
552  SuccessorListsVector successors_;
553  Trie *pending_words_;
556  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
557  // any of them are present on the best choices list for a word pair.
558  // the bigrams are stored as space-separated words where:
559  // (1) leading and trailing punctuation has been removed from each word and
560  // (2) any digits have been replaced with '?' marks.
561  Dawg *bigram_dawg_;
562  // TODO(daria): need to support multiple languages in the future,
563  // so maybe will need to maintain a list of dawgs of each kind.
564  Dawg *freq_dawg_;
565  Dawg *unambig_dawg_;
566  Dawg *punc_dawg_;
567  Trie *document_words_;
570  float wordseg_rating_adjust_factor_;
571  // File for recording ambiguities discovered during dictionary search.
572  FILE *output_ambig_words_file_;
573 
574  public:
578  STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
580  "A suffix of user-provided words located in tessdata.");
582  "A filename of user-provided patterns.");
584  "A suffix of user-provided patterns located in tessdata.");
585  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
586  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
587  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
589  "Load dawg with punctuation patterns.");
590  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
592  "Load dawg with special word bigrams.");
594  "Score penalty (0.1 = 10%) added if there are subscripts "
595  "or superscripts in a word, but it is otherwise OK.");
597  "Score penalty (0.1 = 10%) added if an xheight is "
598  "inconsistent.");
600  "Score multiplier for word matches which have good case and"
601  "are frequent in the given language (lower is better).");
602 
604  "Score multiplier for word matches that have good case "
605  "(lower is better).");
606 
608  "Default score multiplier for word matches, which may have "
609  "case issues (lower is better).");
610 
612  "Score multiplier for glyph fragment segmentations which "
613  "do not match a dictionary word (lower is better).");
614 
616  "Score multiplier for poorly cased strings that are not in"
617  " the dictionary and generally look like garbage (lower is"
618  " better).");
620  "Output file for ambiguities found in the dictionary");
621  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
622  ", to 2 for more details, to 3 to see all the debug messages");
623  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
625  "Use only the first UTF8 step of the given string"
626  " when computing log probabilities.");
627  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
629  "Certainty threshold for non-dict words");
631  "Reject certainty offset");
633  "Size of dict word to be treated as non-dict word");
635  "Certainty to add for each dict char above small word size.");
637  "Max certaintly variation allowed in a word (in sigma)");
638  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
640  "Make AcceptableChoice() always return false. Useful"
641  " when there is a need to explore all segmentations");
642  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
643  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
644  " should be printed to stdout");
646  "Don't use any alphabetic-specific tricks."
647  "Set to true in the traineddata config file for"
648  " scripts that are cursive or inherently fixed-pitch");
649  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
651  "Worst certainty for using pending dictionary");
652  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
653  " for words that can be inserted into the document dictionary");
654  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
655  " character choices to consider during permutation."
656  " This limit is especially useful when user patterns"
657  " are specified, since overly generic patterns can result in"
658  " dawg search exploring an overly large number of options.");
659 };
660 } // namespace tesseract
661 
662 #endif // TESSERACT_DICT_DICT_H_
tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:95
tesseract::Dict::fragment_state_okay
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:328
tesseract::Dict::ProcessPatternEdges
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:561
tesseract::Dict::NumDawgs
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:430
CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:46
tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:434
tesseract::DawgArgs::DawgArgs
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:81
tesseract::Dict::max_permuter_attempts
int max_permuter_attempts
Definition: dict.h:658
tesseract::XH_SUBNORMAL
Definition: dict.h:78
tesseract::Dict::SetWordsegRatingAdjustFactor
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:510
tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
tesseract::Dict::segment_penalty_dict_case_bad
double segment_penalty_dict_case_bad
Definition: dict.h:609
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:45
tesseract::Dict::probability_in_context_
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:384
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::Dict::params_model_classify_
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:418
tesseract::Dict::output_ambig_words_file
char * output_ambig_words_file
Definition: dict.h:620
tesseract::Dict::UniformCertainties
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:479
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
tesseract::TessdataManager
Definition: tessdatamanager.h:126
tesseract::Dict::xheight_penalty_subscripts
double xheight_penalty_subscripts
Definition: dict.h:595
tesseract::DawgPosition
Definition: dawg.h:348
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
dawg_cache.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dict::stopper_certainty_per_char
double stopper_certainty_per_char
Definition: dict.h:635
tesseract::Dict::dawg_permute_and_select
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:182
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
tesseract::Trie
Definition: trie.h:54
tesseract::Dict::segment_penalty_garbage
double segment_penalty_garbage
Definition: dict.h:618
tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
tesseract::Dict::doc_dict_certainty_threshold
double doc_dict_certainty_threshold
Definition: dict.h:653
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::DAWG_TYPE_NUMBER
Definition: dawg.h:69
tesseract::Dict::word_to_debug
char * word_to_debug
Definition: dict.h:644
params_training_featdef.h
MATRIX
Definition: matrix.h:574
STRING
Definition: strngs.h:45
tesseract::Dict::valid_word
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:488
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:448
WERD_RES
Definition: pageres.h:160
tesseract::Dict::EndDangerousAmbigs
void EndDangerousAmbigs()
Definition: stopper.cpp:374
tesseract::Dict::Load
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210
tesseract::Dict::DebugWordChoices
void DebugWordChoices()
Prints the current choices for this word to stdout.
tesseract::Dict::absolute_garbage
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:80
tesseract::XH_GOOD
Definition: dict.h:78
tesseract::DawgArgs
Definition: dict.h:80
tesseract::Dict::add_document_word
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:644
COMPOUND_PERM
Definition: ratngs.h:243
stopper.h
tesseract::Dict::def_probability_in_context
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:401
tesseract::Dict::user_patterns_file
char * user_patterns_file
Definition: dict.h:582
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:56
tesseract::Dict::SettupStopperPass1
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:378
tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
tesseract::Dict::GlobalDawgCache
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
double_VAR_H
#define double_VAR_H(name, val, comment)
Definition: params.h:298
tesseract::Dict::segment_penalty_dict_nonword
double segment_penalty_dict_nonword
Definition: dict.h:613
tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:78
tesseract::DawgCache
Definition: dawg_cache.h:30
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
tesseract::Dict::xheight_penalty_inconsistent
double xheight_penalty_inconsistent
Definition: dict.h:598
ratngs.h
tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:641
tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:116
tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
tesseract::Dict::load_system_dawg
bool load_system_dawg
Definition: dict.h:585
tesseract::Dict::good_choice
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
tesseract::Dict::ngram_probability_in_context
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:376
tesseract::Dict::user_words_suffix
char * user_words_suffix
Definition: dict.h:580
tesseract::Dict::getCCUtil
CCUtil * getCCUtil()
Definition: dict.h:98
tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:86
tesseract::Dict::WildcardID
UNICHAR_ID WildcardID() const
Definition: dict.h:428
tesseract::Dict::go_deeper_dawg_fxn
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:58
tesseract::Dict::End
void End()
Definition: dict.cpp:372
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
dawg.h
tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
tesseract::Dict::set_hyphen_word
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:59
tesseract::Trie::clear
void clear()
Definition: trie.cpp:71
tesseract::Dict::permute_choices
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:211
tesseract::Dict::valid_word_or_number
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:484
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
tesseract::Dict::stopper_allowable_character_badness
double stopper_allowable_character_badness
Definition: dict.h:637
tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:85
tesseract::Dict::LengthOfShortestAlphaRun
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:460
tesseract::XH_INCONSISTENT
Definition: dict.h:78
tesseract::Dict::segment_penalty_dict_case_ok
double segment_penalty_dict_case_ok
Definition: dict.h:605
tesseract::Dict::def_letter_is_okay
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:395
tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:627
tesseract::UnicharAmbigs
Definition: ambigs.h:136
tesseract::Dict::load_punc_dawg
bool load_punc_dawg
Definition: dict.h:589
UNICHARSET
Definition: unicharset.h:145
tesseract::Dict::SetupForLoad
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
tesseract::Dict::load_number_dawg
bool load_number_dawg
Definition: dict.h:590
tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:162
tesseract::Dict::stopper_smallword_size
int stopper_smallword_size
Definition: dict.h:633
tesseract::Dict::valid_punctuation
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:856
tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:55
tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:622
tesseract::Dict::segment_penalty_dict_frequent_word
double segment_penalty_dict_frequent_word
Definition: dict.h:601
character
Definition: mfoutline.h:62
tesseract
Definition: baseapi.h:65
INT_VAR_H
#define INT_VAR_H(name, val, comment)
Definition: params.h:292
tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
tesseract::Dict::Dict
Dict(CCUtil *image_ptr)
Definition: dict.cpp:30
tesseract::Dict::ReplaceAmbig
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:386
tesseract::DawgPositionVector
Definition: dawg.h:373
tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:113
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
USER_PATTERN_PERM
Definition: ratngs.h:238
GenericVector< Dawg * >
CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:48
tesseract::Dict
Definition: dict.h:91
tesseract::CCUtil::unichar_ambigs
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:59
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
CHAR_FRAGMENT
Definition: unicharset.h:48
tesseract::DawgArgs::valid_end
bool valid_end
Definition: dict.h:88
tesseract::Dict::LoadLSTM
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:291
tesseract::Dict::FinishLoad
bool FinishLoad()
Definition: dict.cpp:351
tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
tesseract::Dict::use_only_first_uft8_step
bool use_only_first_uft8_step
Definition: dict.h:626
tesseract::Dict::doc_dict_pending_threshold
double doc_dict_pending_threshold
Definition: dict.h:651
CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:44
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
tesseract::Dict::stopper_phase2_certainty_rejection_offset
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:631
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
tesseract::Dawg
Definition: dawg.h:113
tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::Dict::reset_hyphen_vars
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:42
BOOL_VAR_H
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:294
tesseract::Dict::ParamsModelClassify
float ParamsModelClassify(const char *lang, void *path)
CHAR_FRAGMENT_INFO
Definition: dict.h:43
tesseract::Dict::save_doc_words
bool save_doc_words
Definition: dict.h:649
tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:61
tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:390
tesseract::Dict::CallParamsModelClassify
float CallParamsModelClassify(void *path)
Definition: dict.h:421
TESS_API
#define TESS_API
Definition: platform.h:54
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
tesseract::Dict::load_bigram_dawg
bool load_bigram_dawg
Definition: dict.h:592
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:432
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::Dict::append_choices
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:253
tesseract::Dict::getUnicharset
UNICHARSET & getUnicharset()
Definition: dict.h:104
tesseract::Dict::~Dict
~Dict()
Definition: dict.cpp:178
tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778
tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:84
tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:158
tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:124
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:47
ambigs.h
tesseract::Dict::IsSpaceDelimitedLang
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. not Chinese/Japanese or Thai).
Definition: dict.cpp:883
tesseract::Dict::GetUnambigDawg
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:436
tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:481
tesseract::Dict::hyphen_debug_level
int hyphen_debug_level
Definition: dict.h:623
DOC_DAWG_PERM
Definition: ratngs.h:240
tesseract::Dict::load_freq_dawg
bool load_freq_dawg
Definition: dict.h:586
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Dict::ResetDocumentDictionary
void ResetDocumentDictionary()
Definition: dict.h:326
tesseract::Dict::user_words_file
char * user_words_file
Definition: dict.h:578
tesseract::Dict::stopper_nondict_certainty_base
double stopper_nondict_certainty_base
Definition: dict.h:629
tesseract::CCUtil
Definition: ccutil.h:40
tesseract::Dict::SettupStopperPass2
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:382
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Dict::SetWildcardID
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:427
STRING_VAR_H
#define STRING_VAR_H(name, val, comment)
Definition: params.h:296
tesseract::Dict::segment_nonalphabetic_script
bool segment_nonalphabetic_script
Definition: dict.h:648
NUMBER_PERM
Definition: ratngs.h:237
tesseract::Dict::user_patterns_suffix
char * user_patterns_suffix
Definition: dict.h:584
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50
trie.h
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Dict::load_unambig_dawg
bool load_unambig_dawg
Definition: dict.h:587
tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372