tesseract  4.0.0-1-g2a2b
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "dawg.h"
24 #include "dawg_cache.h"
25 #include "host.h"
26 #include "ratngs.h"
27 #include "stopper.h"
28 #include "trie.h"
29 #include "unicharset.h"
31 
32 class MATRIX;
33 class WERD_RES;
34 
35 #define CHARS_PER_LINE 500
36 #define MAX_WERD_LENGTH (int64_t) 128
37 #define NO_RATING -1
38 
44  float rating;
45  float certainty;
46 };
47 
48 namespace tesseract {
49 
51 
52 //
53 // Constants
54 //
55 static const int kRatingPad = 4;
56 static const char kDictWildcard[] = "\u2606"; // WHITE STAR
57 static const int kDictMaxWildcards = 2; // max wildcards for a word
58 // TODO(daria): If hyphens are different in different languages and can be
59 // inferred from training data we should load their values dynamically.
60 static const char kHyphenSymbol[] = "-";
61 static const char kSlashSymbol[] = "/";
62 static const char kQuestionSymbol[] = "?";
63 static const char kApostropheSymbol[] = "'";
64 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
65 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
66 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
67 static const int kDocDictMaxRepChars = 4;
68 
69 // Enum for describing whether the x-height for the word is consistent:
70 // 0 - everything is good.
71 // 1 - there are one or two secondary (but consistent) baselines
72 // [think subscript and superscript], or there is an oversized
73 // first character.
74 // 2 - the word is inconsistent.
76 
77 struct DawgArgs {
79  : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
80 
84  // True if the current position is a valid word end.
85  bool valid_end;
86 };
87 
88 class Dict {
89  public:
90  Dict(CCUtil* image_ptr);
91  ~Dict();
// Read-only accessor: returns the CCUtil pointer stored in ccutil_.
92  const CCUtil* getCCUtil() const {
93  return ccutil_;
94  }
96  return ccutil_;
97  }
// Read-only accessor: returns the unicharset member of the CCUtil object
// obtained from getCCUtil().
98  const UNICHARSET& getUnicharset() const {
99  return getCCUtil()->unicharset;
100  }
102  return getCCUtil()->unicharset;
103  }
105  return getCCUtil()->unichar_ambigs;
106  }
107 
108  // Returns true if unichar_id is a word compounding character like - or /.
// The id is examined through its normalized form: the result is true only
// when it normalizes to exactly one id and that id equals
// hyphen_unichar_id_ or slash_unichar_id_.
// Const-qualified because the query modifies no Dict state; this makes it
// callable on const Dict objects as well.
109  inline bool compound_marker(UNICHAR_ID unichar_id) const {
110  const GenericVector<UNICHAR_ID>& normed_ids =
111  getUnicharset().normed_ids(unichar_id);
112  return normed_ids.size() == 1 &&
113  (normed_ids[0] == hyphen_unichar_id_ ||
114  normed_ids[0] == slash_unichar_id_);
115  }
116  // Returns true if unichar_id is an apostrophe-like character that may
117  // separate prefix/suffix words from a main body word.
// The result is true only when unichar_id normalizes to exactly one id and
// that id equals apostrophe_unichar_id_.
// Const-qualified because the query modifies no Dict state; this makes it
// callable on const Dict objects as well.
118  inline bool is_apostrophe(UNICHAR_ID unichar_id) const {
119  const GenericVector<UNICHAR_ID>& normed_ids =
120  getUnicharset().normed_ids(unichar_id);
121  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
122  }
123 
124  /* hyphen.cpp ************************************************************/
125 
// Returns true if we've recorded the beginning of a hyphenated word, i.e.
// a hyphen word is stored and the current word is not the last on its line.
127  inline bool hyphenated() const {
128  return hyphen_word_ != nullptr && !last_word_on_line_;
129  }
// Size of the base word (the part on the line before) of a hyphenated word.
// Returns hyphen_word_->length() when hyphenated() is true, otherwise 0.
131  inline int hyphen_base_size() const {
132  return this->hyphenated() ? hyphen_word_->length() : 0;
133  }
// If hyphenated() is true, overwrites *word with a copy of *hyphen_word_;
// otherwise leaves *word untouched. When hyphen_debug_level is non-zero the
// copied word is also printed with the "copy_hyphen_info: " prefix.
137  inline void copy_hyphen_info(WERD_CHOICE *word) const {
138  if (this->hyphenated()) {
139  *word = *hyphen_word_;
140  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
141  }
142  }
// Check whether the word has a hyphen at the end.
// Returns false immediately unless last_word_on_line_ is set and first_pos
// is false. Otherwise returns true when unichar_id normalizes to exactly one
// id and that id equals hyphen_unichar_id_.
144  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
145  if (!last_word_on_line_ || first_pos)
146  return false;
147  const GenericVector<UNICHAR_ID>& normed_ids =
148  getUnicharset().normed_ids(unichar_id);
149  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
150  }
// Checks whether the last unichar of word is a line-ending hyphen by
// delegating to the (unichar_id, first_pos) overload; first_pos is true when
// the word has a single unichar.
// An empty word has no last unichar, so it is reported as not hyphen-ended
// instead of calling word.unichar_id(-1).
152  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
153  const int last = word.length() - 1;  // -1 for an empty word
154  return last >= 0 && has_hyphen_end(word.unichar_id(last), last == 0);
155  }
159  void reset_hyphen_vars(bool last_word_on_line);
162  void set_hyphen_word(const WERD_CHOICE &word,
163  const DawgPositionVector &active_dawgs);
164 
165  /* permdawg.cpp ************************************************************/
166  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
167  // When this function is refactored, permdawg.cpp can be removed.
168 
// Copies word into *best_choice when word has a strictly lower rating than
// the current *best_choice; otherwise *best_choice is left unchanged.
171  inline void update_best_choice(const WERD_CHOICE &word,
172  WERD_CHOICE *best_choice) {
173  const bool improves = word.rating() < best_choice->rating();
174  if (!improves) return;
175  *best_choice = word;
176  }
180  void init_active_dawgs(DawgPositionVector *active_dawgs,
181  bool ambigs_mode) const;
182  // Fill the given vector with the default collection of any-length dawgs
183  void default_dawgs(DawgPositionVector *anylength_dawgs,
184  bool suppress_patterns) const;
185 
186 
193  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
197  void go_deeper_dawg_fxn(
198  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
199  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
200  bool word_ending, WERD_CHOICE *word, float certainties[],
201  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
202  void *void_more_args);
203 
205  void (Dict::*go_deeper_fxn_)(const char *debug,
206  const BLOB_CHOICE_LIST_VECTOR &char_choices,
207  int char_choice_index,
208  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
209  bool word_ending, WERD_CHOICE *word,
210  float certainties[], float *limit,
211  WERD_CHOICE *best_choice, int *attempts_left,
212  void *void_more_args);
213  //
214  // Helper functions for dawg_permute_and_select().
215  //
216  void permute_choices(
217  const char *debug,
218  const BLOB_CHOICE_LIST_VECTOR &char_choices,
219  int char_choice_index,
220  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
221  WERD_CHOICE *word,
222  float certainties[],
223  float *limit,
224  WERD_CHOICE *best_choice,
225  int *attempts_left,
226  void *more_args);
227 
228  void append_choices(
229  const char *debug,
230  const BLOB_CHOICE_LIST_VECTOR &char_choices,
231  const BLOB_CHOICE &blob_choice,
232  int char_choice_index,
233  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
234  WERD_CHOICE *word,
235  float certainties[],
236  float *limit,
237  WERD_CHOICE *best_choice,
238  int *attempts_left,
239  void *more_args);
240 
241  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
242  float curr_rating, float curr_certainty,
243  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
244  const char *debug, int word_ending,
245  CHAR_FRAGMENT_INFO *char_frag_info);
246 
247  /* stopper.cpp *************************************************************/
248  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
249  DANGERR *fixpt,
250  bool fix_replaceable,
251  MATRIX* ratings);
252  // Replaces the corresponding wrong ngram in werd_choice with the correct
253  // one. The whole correct n-gram is inserted into the ratings matrix and
254  // the werd_choice: no more fragments! Rating and certainty of new entries
255  // in matrix and werd_choice are the sum and mean of the wrong ngram
256  // respectively.
257  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
258  // mystring", with a new entry in the ratings matrix for ".
259  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
260  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
261  MATRIX *ratings);
262 
// Returns the length of the shortest alpha run in WordChoice.
264  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
272  int UniformCertainties(const WERD_CHOICE& word);
// Returns true if the given best_choice is good enough to stop.
274  bool AcceptableChoice(const WERD_CHOICE& best_choice,
275  XHeightConsistencyEnum xheight_consistency);
279  bool AcceptableResult(WERD_RES *word) const;
280  void EndDangerousAmbigs();
// Prints the current choices for this word to stdout.
282  void DebugWordChoices();
// Sets up stopper variables in preparation for the first pass.
284  void SettupStopperPass1();
// Sets up stopper variables in preparation for the second pass.
286  void SettupStopperPass2();
287  /* context.cpp *************************************************************/
// Check a string to see if it matches a set of lexical rules.
289  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const;
// NOTE(review): judging by the name, reports whether word is certainly
// garbage; the criteria live in context.cpp and are not visible here.
292  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
293 
294  /* dict.cpp ****************************************************************/
295 
298  static DawgCache *GlobalDawgCache();
299  // Sets up ready for a Load or LoadLSTM.
300  void SetupForLoad(DawgCache *dawg_cache);
301  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
302  void Load(const STRING &lang, TessdataManager *data_file);
303  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
304  void LoadLSTM(const STRING &lang, TessdataManager *data_file);
305  // Completes the loading process after Load() and/or LoadLSTM().
306  // Returns false if no dictionaries were loaded.
307  bool FinishLoad();
308  void End();
309 
310  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
312  if (pending_words_ != nullptr)
313  pending_words_->clear();
314  if (document_words_ != nullptr)
315  document_words_->clear();
316  }
317 
353  //
354  int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
355  UNICHAR_ID unichar_id, bool word_end) const;
356 
357  int (Dict::*letter_is_okay_)(void* void_dawg_args,
358  const UNICHARSET& unicharset,
359  UNICHAR_ID unichar_id, bool word_end) const;
// Calls letter_is_okay_ member function.
// All four arguments are forwarded unchanged to the member function that
// letter_is_okay_ currently points to, and its result is returned.
361  int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
362  UNICHAR_ID unichar_id, bool word_end) const {
363  return (this->*letter_is_okay_)(void_dawg_args,
364  unicharset, unichar_id, word_end);
365  }
366 
367 
369  double (Dict::*probability_in_context_)(const char* lang,
370  const char* context,
371  int context_bytes,
372  const char* character,
373  int character_bytes);
// Calls probability_in_context_ member function.
// The language argument is taken from getCCUtil()->lang; context,
// context_bytes, character and character_bytes are forwarded unchanged.
375  double ProbabilityInContext(const char* context,
376  int context_bytes,
377  const char* character,
378  int character_bytes) {
379  return (this->*probability_in_context_)(
380  getCCUtil()->lang.string(),
381  context, context_bytes,
382  character, character_bytes);
383  }
384 
387  const char* lang, const char* context, int context_bytes,
388  const char* character, int character_bytes) {
389  (void)lang;
390  (void)context;
391  (void)context_bytes;
392  (void)character;
393  (void)character_bytes;
394  return 0.0;
395  }
396  double ngram_probability_in_context(const char* lang,
397  const char* context,
398  int context_bytes,
399  const char* character,
400  int character_bytes);
401 
402  // Interface with params model.
403  float (Dict::*params_model_classify_)(const char *lang, void *path);
404  float ParamsModelClassify(const char *lang, void *path);
405  // Call params_model_classify_ member function.
// The member pointer must be non-null (checked with ASSERT_HOST). The
// language argument is taken from getCCUtil()->lang; path is forwarded
// unchanged and the callee's result is returned.
406  float CallParamsModelClassify(void *path) {
407  ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
408  return (this->*params_model_classify_)(
409  getCCUtil()->lang.string(), path);
410  }
411 
// Setter and getter for the cached wildcard id (wildcard_unichar_id_).
412  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
413  inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
// Return the number of dawgs in the dawgs_ vector.
415  inline int NumDawgs() const { return dawgs_.size(); }
// Return i-th dawg pointer recorded in the dawgs_ vector.
417  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
// Return the pointer to the punctuation dawg.
419  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
// Return the pointer to the unambiguous words dawg.
421  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
// Returns the appropriate next node given the EDGE_REF.
// NO_EDGE as input yields node 0; a next node of 0 is reported as NO_EDGE.
423  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
424  const bool at_start = (edge_ref == NO_EDGE);
425  if (at_start) return 0;  // beginning to explore the dawg
426  const NODE_REF next = dawg->next_node(edge_ref);
427  return next == 0 ? NO_EDGE : next;  // next == 0: end of word
428  }
429 
430  // Given a unichar from a string and a given dawg, return the unichar
431  // we should use to match in that dawg type. (for example, in the number
432  // dawg, all numbers are transformed to kPatternUnicharId).
434  const Dawg *dawg) const {
435  if (!dawg) return ch;
436  switch (dawg->type()) {
437  case DAWG_TYPE_NUMBER:
438  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
439  default:
440  return ch;
441  }
442  }
443 
449  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
450  UNICHAR_ID unichar_id, bool word_end,
451  DawgArgs *dawg_args,
452  PermuterType *current_permuter) const;
453 
457 
// Returns true if perm is one of the permuter codes compared against below
// (system, frequent, document, user, user-pattern or compound). NUMBER_PERM
// is accepted only when numbers_ok is true.
459  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
460  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
461  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
462  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
463  (numbers_ok && perm == NUMBER_PERM));
464  }
465  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
// Convenience overload: valid_word(word, numbers_ok) with numbers_ok = false.
466  int valid_word(const WERD_CHOICE &word) const {
467  return valid_word(word, false); // return NO_PERM for words with digits
468  }
// Convenience wrapper: valid_word(word, numbers_ok) with numbers_ok = true.
469  int valid_word_or_number(const WERD_CHOICE &word) const {
470  return valid_word(word, true); // return NUMBER_PERM for valid numbers
471  }
// This function is used by api/tesseract_cube_combiner.cpp.
// Builds a WERD_CHOICE from string using getUnicharset() and validates it
// with the single-argument overload (numbers not accepted).
473  int valid_word(const char *string) const {
474  WERD_CHOICE word(string, getUnicharset());
475  return valid_word(word);
476  }
477  // Do the two WERD_CHOICEs form a meaningful bigram?
478  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
483  bool valid_punctuation(const WERD_CHOICE &word);
// Returns true if a good answer is found for the unknown blob rating.
485  int good_choice(const WERD_CHOICE &choice);
// Adds a word found on this document to the document specific dictionary.
487  void add_document_word(const WERD_CHOICE &best_choice);
// Adjusts the rating of the given word.
489  void adjust_word(WERD_CHOICE *word,
490  bool nonword, XHeightConsistencyEnum xheight_consistency,
491  float additional_adjust,
492  bool modify_rating,
493  bool debug);
// Set wordseg_rating_adjust_factor_ to the given value.
495  inline void SetWordsegRatingAdjustFactor(float f) {
496  wordseg_rating_adjust_factor_ = f;
497  }
499  bool IsSpaceDelimitedLang() const;
500 
501  private:
503  CCUtil* ccutil_;
510  UnicharAmbigs *dang_ambigs_table_;
512  UnicharAmbigs *replace_ambigs_table_;
514  float reject_offset_;
515  // Cached UNICHAR_IDs:
516  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
517  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
518  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
519  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
520  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
521  // Hyphen-related variables.
522  WERD_CHOICE *hyphen_word_;
523  DawgPositionVector hyphen_active_dawgs_;
524  bool last_word_on_line_;
525  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
526  // matching. The first member of each list is taken as canonical. For
527  // example, the first list contains hyphens and dashes with the first symbol
528  // being the ASCII hyphen minus.
529  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
530  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
531  DawgCache *dawg_cache_;
532  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
533  // Dawgs.
534  DawgVector dawgs_;
535  SuccessorListsVector successors_;
536  Trie *pending_words_;
539  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
540  // any of them are present on the best choices list for a word pair.
541  // the bigrams are stored as space-separated words where:
542  // (1) leading and trailing punctuation has been removed from each word and
543  // (2) any digits have been replaced with '?' marks.
544  Dawg *bigram_dawg_;
545  // TODO(daria): need to support multiple languages in the future,
546  // so maybe will need to maintain a list of dawgs of each kind.
547  Dawg *freq_dawg_;
548  Dawg *unambig_dawg_;
549  Dawg *punc_dawg_;
550  Trie *document_words_;
553  float wordseg_rating_adjust_factor_;
554  // File for recording ambiguities discovered during dictionary search.
555  FILE *output_ambig_words_file_;
556 
557  public:
561  STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
563  "A suffix of user-provided words located in tessdata.");
565  "A filename of user-provided patterns.");
567  "A suffix of user-provided patterns located in tessdata.");
568  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
569  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
570  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
572  "Load dawg with punctuation patterns.");
573  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
575  "Load dawg with special word bigrams.");
577  "Score penalty (0.1 = 10%) added if there are subscripts "
578  "or superscripts in a word, but it is otherwise OK.");
580  "Score penalty (0.1 = 10%) added if an xheight is "
581  "inconsistent.");
583  "Score multiplier for word matches which have good case and"
584  "are frequent in the given language (lower is better).");
585 
587  "Score multiplier for word matches that have good case "
588  "(lower is better).");
589 
591  "Default score multiplier for word matches, which may have "
592  "case issues (lower is better).");
593 
595  "Score multiplier for glyph fragment segmentations which "
596  "do not match a dictionary word (lower is better).");
597 
599  "Score multiplier for poorly cased strings that are not in"
600  " the dictionary and generally look like garbage (lower is"
601  " better).");
603  "Output file for ambiguities found in the dictionary");
604  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
605  ", to 2 for more details, to 3 to see all the debug messages");
606  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
607  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
609  "Use only the first UTF8 step of the given string"
610  " when computing log probabilities.");
611  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
613  "Certainty threshold for non-dict words");
615  "Reject certainty offset");
617  "Size of dict word to be treated as non-dict word");
619  "Certainty to add for each dict char above small word size.");
621  "Max certaintly variation allowed in a word (in sigma)");
622  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
624  "Make AcceptableChoice() always return false. Useful"
625  " when there is a need to explore all segmentations");
626  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
627  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
628  " should be printed to stdout");
630  "Lengths of unichars in word_to_debug");
631  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
633  "Don't use any alphabetic-specific tricks."
634  "Set to true in the traineddata config file for"
635  " scripts that are cursive or inherently fixed-pitch");
636  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
638  "Worst certainty for using pending dictionary");
639  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
640  " for words that can be inserted into the document dictionary");
641  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
642  " character choices to consider during permutation."
643  " This limit is especially useful when user patterns"
644  " are specified, since overly generic patterns can result in"
645  " dawg search exploring an overly large number of options.");
646 };
647 } // namespace tesseract
648 
649 #endif // TESSERACT_DICT_DICT_H_
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
void End()
Definition: dict.cpp:343
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
#define INT_VAR_H(name, val, comment)
Definition: params.h:264
PermuterType permuter
Definition: dict.h:83
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:461
int UNICHAR_ID
Definition: unichar.h:35
UNICHAR_ID WildcardID() const
Definition: dict.h:413
int size() const
Definition: genericvector.h:71
float ParamsModelClassify(const char *lang, void *path)
double segment_penalty_dict_case_ok
Definition: dict.h:588
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:419
const CCUtil * getCCUtil() const
Definition: dict.h:92
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:466
void ResetDocumentDictionary()
Definition: dict.h:311
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
int fragments_debug
Definition: dict.h:631
bool valid_end
Definition: dict.h:85
char * word_to_debug
Definition: dict.h:628
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:369
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:360
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:320
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:673
float certainty
Definition: dict.h:45
double certainty_scale
Definition: dict.h:611
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
double segment_penalty_dict_frequent_word
Definition: dict.h:584
const char * string() const
Definition: strngs.cpp:196
int tessedit_truncate_wordchoice_log
Definition: dict.h:626
void print() const
Definition: ratngs.h:580
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
double doc_dict_pending_threshold
Definition: dict.h:638
int max_viterbi_list_size
Definition: dict.h:607
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
double segment_penalty_dict_case_bad
Definition: dict.h:592
#define double_VAR_H(name, val, comment)
Definition: params.h:273
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:41
bool stopper_no_acceptable_choices
Definition: dict.h:625
double segment_penalty_dict_nonword
Definition: dict.h:596
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
float rating() const
Definition: ratngs.h:327
bool load_number_dawg
Definition: dict.h:573
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:171
int hyphen_debug_level
Definition: dict.h:606
#define STRING_VAR_H(name, val, comment)
Definition: params.h:270
bool load_punc_dawg
Definition: dict.h:572
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:787
bool load_freq_dawg
Definition: dict.h:569
int stopper_debug_level
Definition: dict.h:622
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:368
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:131
char * user_words_file
Definition: dict.h:561
void EndDangerousAmbigs()
Definition: stopper.cpp:358
char * output_ambig_words_file
Definition: dict.h:603
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:415
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:469
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
double segment_penalty_garbage
Definition: dict.h:601
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:403
bool load_unambig_dawg
Definition: dict.h:570
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:421
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:473
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
int64_t EDGE_REF
Definition: dawg.h:55
int64_t NODE_REF
Definition: dawg.h:56
UNICHAR_ID unichar_id
Definition: dict.h:41
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:417
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:50
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:45
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:531
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
char * user_words_suffix
Definition: dict.h:563
UNICHARSET unicharset
Definition: ccutil.h:68
STRING lang
Definition: ccutil.h:66
double stopper_allowable_character_badness
Definition: dict.h:621
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
float CallParamsModelClassify(void *path)
Definition: dict.h:406
double stopper_nondict_certainty_base
Definition: dict.h:613
DawgPositionVector * updated_dawgs
Definition: dict.h:82
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:412
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai).
Definition: dict.cpp:857
bool load_bigram_dawg
Definition: dict.h:575
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
bool FinishLoad()
Definition: dict.cpp:323
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:70
XHeightConsistencyEnum
Definition: dict.h:75
int length() const
Definition: ratngs.h:303
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:302
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:367
CCUtil * getCCUtil()
Definition: dict.h:95
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
double doc_dict_certainty_threshold
Definition: dict.h:640
bool use_only_first_uft8_step
Definition: dict.h:610
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:830
double xheight_penalty_subscripts
Definition: dict.h:578
bool load_system_dawg
Definition: dict.h:568
int num_fragments
Definition: dict.h:43
double stopper_certainty_per_char
Definition: dict.h:619
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:245
void DebugWordChoices()
Prints the current choices for this word to stdout.
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:137
int dawg_debug_level
Definition: dict.h:605
Definition: strngs.h:45
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:375
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
int max_permuter_attempts
Definition: dict.h:645
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
DawgPositionVector * active_dawgs
Definition: dict.h:81
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
char * word_to_debug_lengths
Definition: dict.h:630
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:118
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:613
int stopper_smallword_size
Definition: dict.h:617
char * user_patterns_file
Definition: dict.h:565
char * user_patterns_suffix
Definition: dict.h:567
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:101
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:586
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:386
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:109
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:615
Definition: matrix.h:575
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:152
UNICHARSET & getUnicharset()
Definition: dict.h:101
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:364
DawgType type() const
Definition: dawg.h:128
Dict(CCUtil *image_ptr)
Definition: dict.cpp:30
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
double xheight_penalty_inconsistent
Definition: dict.h:581
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
bool segment_nonalphabetic_script
Definition: dict.h:635
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
PermuterType
Definition: ratngs.h:242
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:495
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:442
bool save_doc_words
Definition: dict.h:636
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:361
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:142
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
#define ASSERT_HOST(x)
Definition: errcode.h:84
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:433
float rating
Definition: dict.h:44
void clear()
Definition: trie.cpp:62