tessapi/3.x/a01064_source.html

 // File:        dict.h

 // Description: dict class.

 // Author:      Samuel Charron

 //

 // (C) Copyright 2006, Google Inc.

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 // http://www.apache.org/licenses/LICENSE-2.0

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.

 //


 #ifndef TESSERACT_DICT_DICT_H_

 #define TESSERACT_DICT_DICT_H_


 #include "ambigs.h"

 #include "dawg.h"

 #include "dawg_cache.h"

 #include "host.h"

 #include "oldlist.h"

 #include "ratngs.h"

 #include "stopper.h"

 #include "trie.h"

 #include "unicharset.h"

 #include "params_training_featdef.h"


 class MATRIX;

 class WERD_RES;


 // Word-length limit, typed as inT64 (presumably the longest word the
 // dictionary code handles - confirm against the users of this macro).
 // The replacement list is fully parenthesized so that the cast cannot bind
 // to neighbouring tokens at the point of expansion.
 #define MAX_WERD_LENGTH        ((inT64) 128)
 // Sentinel rating value (presumably "no rating available" - confirm against
 // callers). Parenthesized so the unary minus stays attached to the literal
 // in every expansion context.
 #define NO_RATING               (-1)


 // Plain aggregate used by the permuter declarations in this header
 // (passed as prev_char_frag_info / char_frag_info). It has no constructor,
 // so every field must be set by whoever creates an instance.
 struct CHAR_FRAGMENT_INFO {
   UNICHAR_ID unichar_id;          // unichar this record refers to
   const CHAR_FRAGMENT *fragment;  // fragment descriptor; the struct never
                                   // modifies it through this pointer
   int num_fragments;              // fragment count (presumably the number
                                   // accumulated so far - confirm in
                                   // permdawg.cpp)
   float rating;                   // rating associated with the record
   float certainty;                // certainty associated with the record
 };


 namespace tesseract {


 // Shorthand for a vector of Dawg pointers; Dict::dawgs_ has this type.
 typedef GenericVector<Dawg *> DawgVector;


 //

 // Constants

 //

 // Integer limits.
 static const int kRatingPad = 4;
 static const int kDictMaxWildcards = 2;    // max wildcards for a word
 static const int kDocDictMaxRepChars = 4;

 // Symbols whose unichar ids are cached as members of Dict.
 static const char kDictWildcard[] = "\u2606";  // WHITE STAR
 // TODO(daria): If hyphens are different in different languages and can be
 // inferred from training data we should load their values dynamically.
 static const char kHyphenSymbol[] = "-";
 static const char kSlashSymbol[] = "/";
 static const char kQuestionSymbol[] = "?";
 static const char kApostropheSymbol[] = "'";

 // Similarity matcher tuning values.
 static const float kSimCertaintyScale = -10.0f;   // similarity matcher scaling
 static const float kSimCertaintyOffset = -10.0f;  // similarity matcher offset
 static const float kSimilarityFloor = 100.0f;  // worst E*L product to stop on


 // Describes whether the x-height for a word is consistent. The enumerators
 // take the implicit values 0, 1 and 2 in declaration order.
 enum XHeightConsistencyEnum {
   // 0 - everything is good.
   XH_GOOD,
   // 1 - there are one or two secondary (but consistent) baselines
   //     [think subscript and superscript], or there is an oversized
   //     first character.
   XH_SUBNORMAL,
   // 2 - the word is inconsistent.
   XH_INCONSISTENT
 };


 // Bundle of the two dawg position vectors and the permuter type. The
 // struct only stores the pointers it is given; it neither allocates nor
 // frees them.
 struct DawgArgs {
   // Stores d, up and p in active_dawgs, updated_dawgs and permuter.
   DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
       : active_dawgs(d),
         updated_dawgs(up),
         permuter(p) {
   }

   DawgPositionVector *active_dawgs;   // set from constructor argument d
   DawgPositionVector *updated_dawgs;  // set from constructor argument up
   PermuterType permuter;              // set from constructor argument p
 };


 class Dict {

  public:

   // Constructs a Dict for the given CCUtil. Defined out of line in dict.cpp.
   Dict(CCUtil* image_ptr);

   // Destructor; defined out of line.
   ~Dict();

   // Read-only access to the CCUtil pointer held in ccutil_.
   const CCUtil* getCCUtil() const { return ccutil_; }
   // Mutable access to the same CCUtil pointer.
   CCUtil* getCCUtil() { return ccutil_; }

   // Read-only view of the unicharset member of the associated CCUtil.
   const UNICHARSET& getUnicharset() const { return ccutil_->unicharset; }
   // Mutable view of the unicharset member of the associated CCUtil.
   UNICHARSET& getUnicharset() { return ccutil_->unicharset; }

   // Read-only view of the unichar_ambigs member of the associated CCUtil.
   const UnicharAmbigs &getUnicharAmbigs() const {
     return ccutil_->unichar_ambigs;
   }


   // Reports whether unichar_id is a word compounding character like - or /.
   // The answer is true only when the unichar normalizes to exactly one id
   // and that id equals the cached hyphen or slash id.
   inline bool compound_marker(UNICHAR_ID unichar_id) {
     const GenericVector<UNICHAR_ID>& norm =
         getUnicharset().normed_ids(unichar_id);
     if (norm.size() != 1) return false;
     return norm[0] == hyphen_unichar_id_ || norm[0] == slash_unichar_id_;
   }

   // Reports whether unichar_id is an apostrophe-like character that may
   // separate prefix/suffix words from a main body word. True only when the
   // unichar normalizes to exactly one id equal to the cached apostrophe id.
   inline bool is_apostrophe(UNICHAR_ID unichar_id) {
     const GenericVector<UNICHAR_ID>& norm =
         getUnicharset().normed_ids(unichar_id);
     if (norm.size() != 1) return false;
     return norm[0] == apostrophe_unichar_id_;
   }


   /* hyphen.cpp ************************************************************/


   // True when last_word_on_line_ is clear and a hyphen word is stored
   // (hyphen_word_ is non-null).
   inline bool hyphenated() const {
     if (last_word_on_line_) return false;
     return hyphen_word_ ? true : false;
   }

   inline int hyphen_base_size() const {

     return this->hyphenated() ? hyphen_word_->length() : 0;

   }

   // When hyphenated() holds, overwrites *word with a copy of the stored
   // hyphen word and, if hyphen_debug_level is non-zero, prints the result.
   // Otherwise *word is left untouched.
   inline void copy_hyphen_info(WERD_CHOICE *word) const {
     if (!hyphenated()) return;
     *word = *hyphen_word_;
     if (hyphen_debug_level) {
       word->print("copy_hyphen_info: ");
     }
   }

   // Reports whether unichar_id normalizes to exactly the cached hyphen id.
   // Always false when last_word_on_line_ is not set or when first_pos is
   // true; in those cases the unicharset is not consulted at all.
   inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
     if (first_pos || !last_word_on_line_) return false;
     const GenericVector<UNICHAR_ID>& norm =
         getUnicharset().normed_ids(unichar_id);
     if (norm.size() != 1) return false;
     return norm[0] == hyphen_unichar_id_;
   }

   // Overload of has_hyphen_end(UNICHAR_ID, bool) that checks the unichar at
   // the end of the word. The last unichar counts as being in first position
   // when the word has length 1.
   // An empty word has no last unichar, so it is reported as not ending in a
   // hyphen; without this guard word.unichar_id(-1) would be evaluated as a
   // call argument before the callee could reject it.
   inline bool has_hyphen_end(const WERD_CHOICE &word) const {
     const int word_index = word.length() - 1;
     if (word_index < 0) return false;
     return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
   }

   // Hyphen state mutators, declared here and defined out of line
   // (reset_hyphen_vars lives in hyphen.cpp).
   void reset_hyphen_vars(bool last_word_on_line);
   void set_hyphen_word(const WERD_CHOICE &word,
                        const DawgPositionVector &active_dawgs);


   /* permdawg.cpp ************************************************************/

   // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().

   // When this function is refactored, permdawg.cpp can be removed.


   // Copies word into *best_choice when word's rating is strictly lower
   // than the current best rating; otherwise *best_choice is unchanged.
   inline void update_best_choice(const WERD_CHOICE &word,
                                  WERD_CHOICE *best_choice) {
     const bool word_is_better = word.rating() < best_choice->rating();
     if (word_is_better) *best_choice = word;
   }

   // Defined in dict.cpp. Writes into *active_dawgs.
   // NOTE(review): the effect of ambigs_mode is not visible from this header.
   void init_active_dawgs(DawgPositionVector *active_dawgs,
                          bool ambigs_mode) const;
   // Fill the given vector with the default collection of any-length dawgs
   void default_dawgs(DawgPositionVector *anylength_dawgs,
                      bool suppress_patterns) const;

   // Defined out of line; returns a WERD_CHOICE pointer.
   // NOTE(review): ownership of the returned object is not stated here.
   WERD_CHOICE *dawg_permute_and_select(
       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
   // Defined in permdawg.cpp. Its parameter list matches the go_deeper_fxn_
   // member-function pointer declared in this class.
   void go_deeper_dawg_fxn(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       bool word_ending,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       int *attempts_left,
       void *void_more_args);


   // Pointer to go_deeper function. Public data member; it can hold any
   // Dict member function with this exact parameter list, such as
   // go_deeper_dawg_fxn.
   void (Dict::*go_deeper_fxn_)(
       const char *debug,
       const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       bool word_ending,
       WERD_CHOICE *word,
       float certainties[],
       float *limit,
       WERD_CHOICE *best_choice,
       int *attempts_left,
       void *void_more_args);

   //

   // Helper functions for dawg_permute_and_select().

   //

   // Defined in permdawg.cpp.
   void permute_choices(
       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word, float certainties[], float *limit,
       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);

   // Takes the same arguments as permute_choices() plus one blob_choice.
   void append_choices(
       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
       const BLOB_CHOICE &blob_choice, int char_choice_index,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       WERD_CHOICE *word, float certainties[], float *limit,
       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);

   // Takes the previous fragment record as input and a writable
   // char_frag_info record. Note that word_ending is declared as int here.
   // NOTE(review): the meaning of the bool result is not visible from this
   // header.
   bool fragment_state_okay(
       UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
       const char *debug, int word_ending,
       CHAR_FRAGMENT_INFO *char_frag_info);


   /* stopper.cpp *************************************************************/

   // Defined in stopper.cpp.
   bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt,
                         bool fix_replaceable, MATRIX* ratings);
   // Replaces the corresponding wrong ngram in werd_choice with the correct
   // one. The whole correct n-gram is inserted into the ratings matrix and
   // the werd_choice: no more fragments!. Rating and certainty of new entries
   // in matrix and werd_choice are the sum and mean of the wrong ngram
   // respectively.
   // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
   // mystring", with a new entry in the ratings matrix for ".
   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                     MATRIX *ratings);

   // Returns the length of the shortest alpha run in WordChoice.
   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
   int UniformCertainties(const WERD_CHOICE& word);
   // Returns true if the given best_choice is good enough to stop.
   bool AcceptableChoice(const WERD_CHOICE& best_choice,
                         XHeightConsistencyEnum xheight_consistency);
   bool AcceptableResult(WERD_RES* word);
   void EndDangerousAmbigs();
   void DebugWordChoices();
   // Sets up stopper variables in preparation for the first pass.
   void SettupStopperPass1();
   // Counterpart of SettupStopperPass1() (presumably for the second pass -
   // confirm in stopper.cpp).
   void SettupStopperPass2();

   /* context.cpp *************************************************************/

   // Check a string to see if it matches a set of lexical rules.
   int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
   // Defined in context.cpp.
   bool absolute_garbage(const WERD_CHOICE &word,
                         const UNICHARSET &unicharset);

   /* dict.cpp ****************************************************************/

   // Static accessor returning a DawgCache pointer; defined in dict.cpp.
   static DawgCache *GlobalDawgCache();
   // Load() takes the DawgCache to use. Both functions are defined out of
   // line.
   void Load(DawgCache *dawg_cache);
   void End();


   // Resets the document dictionary analogous to ResetAdaptiveClassifier:
   // clears pending_words_ and then document_words_, skipping whichever of
   // the two is null.
   void ResetDocumentDictionary() {
     if (pending_words_) pending_words_->clear();
     if (document_words_) document_words_->clear();
   }


   //

   // Defined in dict.cpp. Its signature matches letter_is_okay_, so it can
   // be the target of that pointer (presumably the default - confirm in
   // dict.cpp).
   int def_letter_is_okay(void* void_dawg_args, UNICHAR_ID unichar_id,
                          bool word_end) const;

   // Member-function pointer that LetterIsOkay() dispatches through.
   int (Dict::*letter_is_okay_)(void* void_dawg_args, UNICHAR_ID unichar_id,
                                bool word_end) const;
   // Forwards all three arguments, unchanged, to the member function that
   // letter_is_okay_ currently points to and returns its result. The pointer
   // is not checked for null.
   int LetterIsOkay(void* void_dawg_args, UNICHAR_ID unichar_id,
                    bool word_end) const {
     return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
   }


   // Probability in context function used by the ngram permuter.
   double (Dict::*probability_in_context_)(
       const char* lang, const char* context, int context_bytes,
       const char* character, int character_bytes);
   // Calls through probability_in_context_, supplying the CCUtil's lang
   // string as the first argument and forwarding the rest unchanged. The
   // pointer is not checked for null.
   double ProbabilityInContext(const char* context, int context_bytes,
                               const char* character, int character_bytes) {
     const char* lang = getCCUtil()->lang.string();
     return (this->*probability_in_context_)(lang, context, context_bytes,
                                             character, character_bytes);
   }


   // Default (no-op) implementation of probability in context function.
   // Ignores every argument and always returns 0.0. Each parameter is
   // explicitly discarded so that none of them, lang included, triggers an
   // unused-parameter warning.
   double def_probability_in_context(
       const char* lang, const char* context, int context_bytes,
       const char* character, int character_bytes) {
     (void) lang;
     (void) context;
     (void) context_bytes;
     (void) character;
     (void) character_bytes;
     return 0.0;
   }

   // Declared here, defined out of line. Has the same parameter list as
   // probability_in_context_, so it can be assigned to that pointer.
   double ngram_probability_in_context(
       const char* lang, const char* context, int context_bytes,
       const char* character, int character_bytes);


   // Interface with params model.
   // params_model_classify_ is the member-function pointer used by
   // CallParamsModelClassify(); ParamsModelClassify is declared here and
   // defined out of line with a matching signature.
   float (Dict::*params_model_classify_)(const char *lang, void *path);
   float ParamsModelClassify(const char *lang, void *path);
   // Call params_model_classify_ member function, passing the CCUtil's lang
   // string and the given path. Asserts that the pointer has been set.
   float CallParamsModelClassify(void *path) {
     ASSERT_HOST(params_model_classify_ != NULL);  // ASSERT_HOST -> assert
     const char *lang = getCCUtil()->lang.string();
     return (this->*params_model_classify_)(lang, path);
   }


   // Stores id as the cached wildcard unichar id.
   inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
   // Returns the cached wildcard unichar id.
   inline const UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
   // Number of entries in dawgs_.
   inline const int NumDawgs() const { return dawgs_.size(); }
   // Entry of dawgs_ at the given index; no range check is made here.
   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
   // The punc_dawg_ and unambig_dawg_ pointers, returned as-is.
   inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
   inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }

   // Maps an edge reference to the node at which exploration continues:
   //  - NO_EDGE as input means exploration of the dawg is just beginning,
   //    so node 0 is returned and dawg is not consulted;
   //  - otherwise the result is dawg->next_node(edge_ref), except that a
   //    next node of 0 marks the end of a word and is reported as NO_EDGE.
   static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
     if (edge_ref == NO_EDGE) return 0;
     const NODE_REF next = dawg->next_node(edge_ref);
     if (next == 0) return NO_EDGE;
     return next;
   }


   // Given a unichar from a string and a given dawg, return the unichar
   // we should use to match in that dawg type. Only one substitution is
   // made: in a dawg of type DAWG_TYPE_NUMBER, a unichar that the unicharset
   // reports as a digit becomes Dawg::kPatternUnicharID. A null dawg, any
   // other dawg type, and any non-digit leave ch unchanged.
   inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
     if (!dawg) return ch;
     if (dawg->type() == DAWG_TYPE_NUMBER && getUnicharset().get_isdigit(ch)) {
       return Dawg::kPatternUnicharID;
     }
     return ch;
   }


   // Defined in dict.cpp. Takes writable updated_dawgs and current_permuter
   // pointers alongside the read-only dawg and position info.
   void ProcessPatternEdges(
       const Dawg *dawg, const DawgPosition &info,
       UNICHAR_ID unichar_id, bool word_end,
       DawgPositionVector *updated_dawgs,
       PermuterType *current_permuter) const;


   // Reports whether perm is one of the permuter codes accepted as a valid
   // word: the system, frequent, document, user, user-pattern and compound
   // codes always qualify; NUMBER_PERM qualifies only when numbers_ok is
   // true. Every other value yields false.
   inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
     switch (perm) {
       case SYSTEM_DAWG_PERM:
       case FREQ_DAWG_PERM:
       case DOC_DAWG_PERM:
       case USER_DAWG_PERM:
       case USER_PATTERN_PERM:
       case COMPOUND_PERM:
         return true;
       case NUMBER_PERM:
         return numbers_ok;
       default:
         return false;
     }
   }

   // Two-argument form that the overloads below delegate to; defined in
   // dict.cpp.
   int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
   // Calls the two-argument form with numbers_ok == false
   // (return NO_PERM for words with digits).
   int valid_word(const WERD_CHOICE &word) const {
     const bool numbers_ok = false;
     return valid_word(word, numbers_ok);
   }
   // Calls the two-argument form with numbers_ok == true
   // (return NUMBER_PERM for valid numbers).
   int valid_word_or_number(const WERD_CHOICE &word) const {
     const bool numbers_ok = true;
     return valid_word(word, numbers_ok);
   }
   // This function is used by api/tesseract_cube_combiner.cpp.
   // Builds a temporary WERD_CHOICE from string using this dictionary's
   // unicharset and checks it with numbers_ok == false.
   int valid_word(const char *string) const {
     const WERD_CHOICE word(string, getUnicharset());
     return valid_word(word);
   }

   // Do the two WERD_CHOICEs form a meaningful bigram?
   bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
   // Defined in dict.cpp.
   bool valid_punctuation(const WERD_CHOICE &word);
   // Returns true if a good answer is found for the unknown blob rating.
   int good_choice(const WERD_CHOICE &choice);
   // Defined out of line.
   void add_document_word(const WERD_CHOICE &best_choice);
   // Defined out of line; word is passed by writable pointer.
   // NOTE(review): how each flag influences the adjustment is not visible
   // from this header.
   void adjust_word(WERD_CHOICE *word, bool nonword,
                    XHeightConsistencyEnum xheight_consistency,
                    float additional_adjust, bool modify_rating, bool debug);
   // Set wordseg_rating_adjust_factor_ to the given value.
   inline void SetWordsegRatingAdjustFactor(float f) {
     wordseg_rating_adjust_factor_ = f;
   }


  private:

   CCUtil* ccutil_;  // what getCCUtil() returns
   UnicharAmbigs *dang_ambigs_table_;
   UnicharAmbigs *replace_ambigs_table_;
   FLOAT32 reject_offset_;

   // Cached UNICHAR_IDs:
   UNICHAR_ID wildcard_unichar_id_;    // kDictWildcard.
   UNICHAR_ID apostrophe_unichar_id_;  // kApostropheSymbol.
   UNICHAR_ID question_unichar_id_;    // kQuestionSymbol.
   UNICHAR_ID slash_unichar_id_;       // kSlashSymbol.
   UNICHAR_ID hyphen_unichar_id_;      // kHyphenSymbol.

   // Hyphen-related variables.
   WERD_CHOICE *hyphen_word_;  // null when no hyphen word is stored
   DawgPositionVector hyphen_active_dawgs_;
   bool last_word_on_line_;

   // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
   // matching.  The first member of each list is taken as canonical.  For
   // example, the first list contains hyphens and dashes with the first symbol
   // being the ASCII hyphen minus.
   GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;

   // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
   DawgCache *dawg_cache_;
   bool dawg_cache_is_ours_;  // we should delete our own dawg_cache_

   // Dawgs.
   DawgVector dawgs_;
   SuccessorListsVector successors_;
   Trie *pending_words_;  // cleared by ResetDocumentDictionary()
   // bigram_dawg_ points to a dawg of two-word bigrams which always supersede
   // if any of them are present on the best choices list for a word pair.
   // The bigrams are stored as space-separated words where:
   // (1) leading and trailing punctuation has been removed from each word and
   // (2) any digits have been replaced with '?' marks.
   Dawg *bigram_dawg_;
   // TODO(daria): need to support multiple languages in the future,
   // so maybe will need to maintain a list of dawgs of each kind.
   Dawg *freq_dawg_;
   Dawg *unambig_dawg_;    // returned by GetUnambigDawg()
   Dawg *punc_dawg_;       // returned by GetPuncDawg()
   Trie *document_words_;  // cleared by ResetDocumentDictionary()
   float wordseg_rating_adjust_factor_;  // see SetWordsegRatingAdjustFactor()

   // File for recording ambiguities discovered during dictionary search.
   FILE *output_ambig_words_file_;


  public:

   // User-supplied word and pattern lists.
   STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
   STRING_VAR_H(user_words_suffix, "",
                "A suffix of user-provided words located in tessdata.");
   STRING_VAR_H(user_patterns_file, "",
                "A filename of user-provided patterns.");
   STRING_VAR_H(user_patterns_suffix, "",
                "A suffix of user-provided patterns located in tessdata.");

   // Switches selecting which dawgs are loaded.
   BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
   BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
   BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
   BOOL_VAR_H(load_punc_dawg, true,
              "Load dawg with punctuation patterns.");
   BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
   BOOL_VAR_H(load_bigram_dawg, true,
              "Load dawg with special word bigrams.");

   // X-height consistency penalties.
   double_VAR_H(xheight_penalty_subscripts, 0.125,
                "Score penalty (0.1 = 10%) added if there are subscripts "
                "or superscripts in a word, but it is otherwise OK.");
   double_VAR_H(xheight_penalty_inconsistent, 0.25,
                "Score penalty (0.1 = 10%) added if an xheight is "
                "inconsistent.");

   // Score multipliers applied to segmentations (lower is better).
   // The help texts are built from adjacent string literals, so every piece
   // that continues a sentence must carry its own separating space.
   // NOTE(review): if the matching definitions in dict.cpp repeat these help
   // texts, keep them in sync with the wording here.
   double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
                "Score multiplier for word matches which have good case and "
                "are frequent in the given language (lower is better).");

   double_VAR_H(segment_penalty_dict_case_ok, 1.1,
                "Score multiplier for word matches that have good case "
                "(lower is better).");

   double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
                "Default score multiplier for word matches, which may have "
                "case issues (lower is better).");

   // TODO(daria): remove this param when ngram permuter is deprecated.
   double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
                "Multiplier for the best choice from the ngram model.");

   double_VAR_H(segment_penalty_dict_nonword, 1.25,
                "Score multiplier for glyph fragment segmentations which "
                "do not match a dictionary word (lower is better).");

   double_VAR_H(segment_penalty_garbage, 1.50,
                "Score multiplier for poorly cased strings that are not in"
                " the dictionary and generally look like garbage (lower is"
                " better).");

   // Debugging and ngram-related parameters.
   STRING_VAR_H(output_ambig_words_file, "",
                "Output file for ambiguities found in the dictionary");
   INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
             ", to 2 for more details, to 3 to see all the debug messages");
   INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
   INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
   BOOL_VAR_H(use_only_first_uft8_step, false,
              "Use only the first UTF8 step of the given string"
              " when computing log probabilities.");
   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");

   // Stopper parameters.
   double_VAR_H(stopper_nondict_certainty_base, -2.50,
                "Certainty threshold for non-dict words");
   double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
                "Reject certainty offset");
   INT_VAR_H(stopper_smallword_size, 2,
             "Size of dict word to be treated as non-dict word");
   double_VAR_H(stopper_certainty_per_char, -0.50,
                "Certainty to add for each dict char above small word size.");
   double_VAR_H(stopper_allowable_character_badness, 3.0,
                "Max certaintly variation allowed in a word (in sigma)");
   INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
   BOOL_VAR_H(stopper_no_acceptable_choices, false,
              "Make AcceptableChoice() always return false. Useful"
              " when there is a need to explore all segmentations");

   // Word-choice logging and targeted debugging.
   BOOL_VAR_H(save_raw_choices, false,
              "Deprecated- backward compatability only");
   INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
   STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
                " should be printed to stdout");
   STRING_VAR_H(word_to_debug_lengths, "",
                "Lengths of unichars in word_to_debug");
   INT_VAR_H(fragments_debug, 0, "Debug character fragments");

   // The help text is assembled from adjacent string literals; the second
   // piece starts with a space so the two sentences do not run together.
   // NOTE(review): if the matching definition in dict.cpp repeats this help
   // text, keep it in sync with the wording here.
   BOOL_VAR_H(segment_nonalphabetic_script, false,
              "Don't use any alphabetic-specific tricks."
              " Set to true in the traineddata config file for"
              " scripts that are cursive or inherently fixed-pitch");

   // Document dictionary parameters.
   BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
   double_VAR_H(doc_dict_pending_threshold, 0.0,
                "Worst certainty for using pending dictionary");
   double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
                " for words that can be inserted into the document dictionary");

   // Permuter search budget.
   INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
             " character choices to consider during permutation."
             " This limit is especially useful when user patterns"
             " are specified, since overly generic patterns can result in"
             " dawg search exploring an overly large number of options.");

 };

 }  // namespace tesseract


 #endif  // TESSERACT_DICT_DICT_H_

tesseract::Dict::Dict
Dict(CCUtil *image_ptr)
Definition: dict.cpp:33

tesseract::Dict::ReplaceAmbig
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:378

tesseract::Dict::stopper_phase2_certainty_rejection_offset
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:605

tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:81

COMPOUND_PERM
Definition: ratngs.h:253

character
Definition: mfoutline.h:54

tesseract::Dict::probability_in_context_
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:357

tesseract::Dict::hyphen_debug_level
int hyphen_debug_level
Definition: dict.h:596

GenericVector::size
int size() const
Definition: genericvector.h:72

ratngs.h

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

tesseract::Dict::go_deeper_dawg_fxn
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:52

CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:42

params_training_featdef.h

tesseract::Dict::def_probability_in_context
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:374

WERD_RES
Definition: pageres.h:155

FLOAT32
float FLOAT32
Definition: host.h:111

tesseract::Dict::max_permuter_attempts
int max_permuter_attempts
Definition: dict.h:637

tesseract::DawgCache
Definition: dawg_cache.h:30

tesseract::Dict::word_to_debug_lengths
char * word_to_debug_lengths
Definition: dict.h:622

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:150

tesseract::Dict::valid_word
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:461

tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:90

tesseract::Dict::EndDangerousAmbigs
void EndDangerousAmbigs()
Definition: stopper.cpp:368

trie.h

tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:169

CHAR_FRAGMENT
Definition: unicharset.h:42

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

tesseract::Dict::word_to_debug
char * word_to_debug
Definition: dict.h:620

USER_DAWG_PERM
Definition: ratngs.h:251

tesseract::Dict::load_bigram_dawg
bool load_bigram_dawg
Definition: dict.h:561

tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705

tesseract::DawgPositionVector
Definition: dawg.h:369

INT_VAR_H
#define INT_VAR_H(name, val, comment)
Definition: params.h:265

unicharset.h

tesseract::Dict::ProcessPatternEdges
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
Definition: dict.cpp:486

tesseract::Dict::absolute_garbage
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:76

tesseract::Dict::good_choice
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.

PermuterType
PermuterType
Definition: ratngs.h:240

tesseract::Dict::stopper_allowable_character_badness
double stopper_allowable_character_badness
Definition: dict.h:611

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:72

tesseract::Dict::segment_penalty_dict_case_ok
double segment_penalty_dict_case_ok
Definition: dict.h:574

tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:75

tesseract::Dict::reset_hyphen_vars
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32

tesseract::Dict::WildcardID
const UNICHAR_ID WildcardID() const
Definition: dict.h:400

tesseract::Dict
Definition: dict.h:86

tesseract::Dict::segment_nonalphabetic_script
bool segment_nonalphabetic_script
Definition: dict.h:627

tesseract::Dict::def_letter_is_okay
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:336

tesseract::Dict::xheight_penalty_inconsistent
double xheight_penalty_inconsistent
Definition: dict.h:567

tesseract::DawgVector
GenericVector< Dawg * > DawgVector
Definition: dict.h:50

tesseract::Trie
Definition: trie.h:62

tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523

CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:41

USER_PATTERN_PERM
Definition: ratngs.h:248

BLOB_CHOICE
Definition: ratngs.h:48

tesseract::Dict::segment_penalty_ngram_best_choice
double segment_penalty_ngram_best_choice
Definition: dict.h:582

tesseract::Dict::segment_penalty_dict_case_bad
double segment_penalty_dict_case_bad
Definition: dict.h:578

tesseract::DawgArgs::DawgArgs
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78

UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783

tesseract::CCUtil::unichar_ambigs
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73

WERD_CHOICE
Definition: ratngs.h:271

DOC_DAWG_PERM
Definition: ratngs.h:250

tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540

tesseract::Trie::clear
void clear()
Definition: trie.cpp:66

tesseract::DawgArgs
Definition: dict.h:77

tesseract::Dict::save_doc_words
bool save_doc_words
Definition: dict.h:628

tesseract::Dict::permute_choices
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204

tesseract::UnicharAmbigs
Definition: ambigs.h:144

tesseract::Dict::stopper_smallword_size
int stopper_smallword_size
Definition: dict.h:607

tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203

tesseract::Dict::fragments_debug
int fragments_debug
Definition: dict.h:623

tesseract::Dict::user_patterns_file
char * user_patterns_file
Definition: dict.h:551

dawg_cache.h

STRING_VAR_H
#define STRING_VAR_H(name, val, comment)
Definition: params.h:271

tesseract::Dict::SetWordsegRatingAdjustFactor
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:483

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:454

tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125

tesseract::Dict::stopper_certainty_per_char
double stopper_certainty_per_char
Definition: dict.h:609

tesseract::Dict::params_model_classify_
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:390

tesseract::Dict::End
void End()
Definition: dict.cpp:310

tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51

tesseract::Dict::SettupStopperPass1
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:370

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

oldlist.h

tesseract::XH_INCONSISTENT
Definition: dict.h:75

tesseract::Dict::getCCUtil
CCUtil * getCCUtil()
Definition: dict.h:93

tesseract::Dict::LengthOfShortestAlphaRun
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:452

tesseract::Dict::doc_dict_pending_threshold
double doc_dict_pending_threshold
Definition: dict.h:630

tesseract::Dict::GlobalDawgCache
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186

tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447

tesseract::Dict::valid_punctuation
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:781

tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58

tesseract::Dict::ParamsModelClassify
float ParamsModelClassify(const char *lang, void *path)

tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102

tesseract::Dict::xheight_penalty_subscripts
double xheight_penalty_subscripts
Definition: dict.h:564

tesseract::Dict::ngram_probability_in_context
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

tesseract::Dict::max_viterbi_list_size
int max_viterbi_list_size
Definition: dict.h:597

tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129

tesseract::DAWG_TYPE_NUMBER
Definition: dawg.h:74

tesseract::CCUtil
Definition: ccutil.h:54

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107

tesseract::Dict::valid_word_or_number
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:457

tesseract::DawgPosition
Definition: dawg.h:342

tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152

FREQ_DAWG_PERM
Definition: ratngs.h:252

NUMBER_PERM
Definition: ratngs.h:247

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

tesseract::XH_GOOD
Definition: dict.h:75

tesseract::Dict::use_only_first_uft8_step
bool use_only_first_uft8_step
Definition: dict.h:600

tesseract::Dict::stopper_nondict_certainty_base
double stopper_nondict_certainty_base
Definition: dict.h:603

tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422

tesseract::Dict::segment_penalty_dict_frequent_word
double segment_penalty_dict_frequent_word
Definition: dict.h:570

tesseract::Dict::getUnicharset
UNICHARSET & getUnicharset()
Definition: dict.h:99

tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:408

tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125

tesseract::Dict::append_choices
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:246

tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:612

tesseract::Dict::user_patterns_suffix
char * user_patterns_suffix
Definition: dict.h:553

tesseract::Dict::dawg_permute_and_select
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:175

tesseract::Dict::SettupStopperPass2
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:374

tesseract::Dict::load_system_dawg
bool load_system_dawg
Definition: dict.h:554

double_VAR_H
#define double_VAR_H(name, val, comment)
Definition: params.h:274

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738

tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:615

tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350

tesseract::Dict::~Dict
~Dict()
Definition: dict.cpp:181

tesseract::Dict::doc_dict_certainty_threshold
double doc_dict_certainty_threshold
Definition: dict.h:632

tesseract::Dict::GetUnambigDawg
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:410

tesseract::Dawg
Definition: dawg.h:118

tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:595

tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363

tesseract::Dict::load_unambig_dawg
bool load_unambig_dawg
Definition: dict.h:556

tesseract::Dict::load_freq_dawg
bool load_freq_dawg
Definition: dict.h:555

tesseract::Dict::NumDawgs
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:404

CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:43

tesseract::Dict::Load
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194

tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347

tesseract::Dict::user_words_suffix
char * user_words_suffix
Definition: dict.h:549

tesseract::Dict::DebugWordChoices
void DebugWordChoices()
Prints the current choices for this word to stdout.

tesseract
Definition: baseapi.cpp:83

tesseract::Dict::output_ambig_words_file
char * output_ambig_words_file
Definition: dict.h:593

tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:96

tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:127

tesseract::Dict::segment_penalty_dict_nonword
double segment_penalty_dict_nonword
Definition: dict.h:586

tesseract::Dict::fragment_state_okay
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:322

tesseract::Dict::CallParamsModelClassify
float CallParamsModelClassify(void *path)
Definition: dict.h:393

tesseract-c_api-demo.lang
string lang
Definition: tesseract-c_api-demo.py:28

CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:44

UNICHARSET
Definition: unicharset.h:139

tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word)
Definition: stopper.cpp:111

tesseract::Dict::segment_penalty_garbage
double segment_penalty_garbage
Definition: dict.h:591

tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:69

tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406

MATRIX
Definition: matrix.h:289

tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:82

WERD_CHOICE::print
void print() const
Definition: ratngs.h:563

EDGE_REF
inT64 EDGE_REF
Definition: dawg.h:54

stopper.h

tesseract::Dict::user_words_file
char * user_words_file
Definition: dict.h:547

tesseract::Dict::save_raw_choices
bool save_raw_choices
Definition: dict.h:617

NULL
#define NULL
Definition: host.h:144

tesseract::Dict::set_hyphen_word
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49

tesseract::Dict::tessedit_truncate_wordchoice_log
int tessedit_truncate_wordchoice_log
Definition: dict.h:618

NODE_REF
inT64 NODE_REF
Definition: dawg.h:55

host.h

ambigs.h

tesseract::Dict::add_document_word
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:567

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142

GenericVector< Dawg * >

tesseract::XH_SUBNORMAL
Definition: dict.h:75

STRING::string
const char * string() const
Definition: strngs.cpp:193

dawg.h

CHAR_FRAGMENT_INFO
Definition: dict.h:40

CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:45

tesseract::Dict::UniformCertainties
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:471

tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116

tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0

BOOL_VAR_H
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268

tesseract::Dict::load_number_dawg
bool load_number_dawg
Definition: dict.h:559

tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412

tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625

tesseract::Dict::ResetDocumentDictionary
void ResetDocumentDictionary()
Definition: dict.h:301

tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:83

tesseract::Dict::load_punc_dawg
bool load_punc_dawg
Definition: dict.h:558

tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:601

uinT8
unsigned char uinT8
Definition: host.h:99

tesseract::Dict::SetWildcardID
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:399