tesseract  4.0.0-1-g2a2b
tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

Public Member Functions

 Classify ()
 
virtual ~Classify ()
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
virtual ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
virtual ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Static Public Member Functions

static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 

Public Attributes

bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Protected Attributes

IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 

Detailed Description

Definition at line 103 of file classify.h.

Constructor & Destructor Documentation

◆ Classify()

tesseract::Classify::Classify ( )

Definition at line 60 of file classify.cpp.

61  : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
62  this->params()),
64  "Prioritize blob division over chopping", this->params()),
65  INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP",
66  this->params()),
67  BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
68  this->params()),
69  INT_MEMBER(classify_debug_level, 0, "Classify debug level",
70  this->params()),
71  INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
72  this->params()),
74  "Character Normalization Range ...", this->params()),
75  double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
76  this->params()), /* PREV DEFAULT 0.1 */
78  "Max char x-norm scale ...",
79  this->params()), /* PREV DEFAULT 0.3 */
80  double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
81  this->params()), /* PREV DEFAULT 0.1 */
83  "Max char y-norm scale ...",
84  this->params()), /* PREV DEFAULT 0.3 */
86  "Veto ratio between classifier ratings", this->params()),
88  "Veto difference between classifier certainties",
89  this->params()),
90  BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
91  this->params()),
92  BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
93  this->params()),
95  "Enable adaptive classifier", this->params()),
97  "Use pre-adapted classifier templates", this->params()),
99  "Save adapted templates to a file", this->params()),
100  BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
101  this->params()),
103  "Non-linear stroke-density normalization", this->params()),
104  INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
105  INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
106  INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
107  this->params()),
108  double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
109  this->params()),
110  double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
111  this->params()),
112  double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
113  this->params()),
114  double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
115  this->params()),
116  double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
117  this->params()),
118  double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
119  this->params()),
120  INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
121  this->params()),
123  "Reliable Config Threshold", this->params()),
125  "Enable adaption even if the ambiguities have not been seen",
126  this->params()),
128  "Maximum angle delta for prototype clustering",
129  this->params()),
131  "Penalty to apply when a non-alnum is vertically out of "
132  "its expected textline position",
133  this->params()),
134  double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
135  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
136  this->params()),
138  "Scale factor for features not used", this->params()),
141  "Prune poor adapted results this much worse than best result",
142  this->params()),
144  "Threshold at which classify_adapted_pruning_factor starts",
145  this->params()),
147  "Threshold for good protos during adaptive 0-255",
148  this->params()),
150  "Threshold for good features during adaptive 0-255",
151  this->params()),
153  "Do not include character fragments in the"
154  " results of the classifier",
155  this->params()),
157  -3.0,
158  "Exclude fragments that do not look like whole"
159  " characters from training and adaption",
160  this->params()),
162  "Bring up graphical debugging windows for fragments training",
163  this->params()),
165  "Use two different windows for debugging the matching: "
166  "One for the protos and one for the features.",
167  this->params()),
168  STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
169  this->params()),
171  "Class Pruner Threshold 0-255", this->params()),
173  "Class Pruner Multiplier 0-255: ", this->params()),
175  "Class Pruner CutoffStrength: ", this->params()),
177  "Integer Matcher Multiplier 0-255: ", this->params()),
178  EnableLearning(true),
180  "Don't adapt to i/I at beginning of word", this->params()),
182  "Assume the input is numbers [0-9].", this->params()),
183  double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
184  this->params()),
186  "Penalty to add to worst rating for noise", this->params()),
188  shape_table_(nullptr),
189  dict_(this),
190  static_classifier_(nullptr) {
191  fontinfo_table_.set_compare_callback(
193  fontinfo_table_.set_clear_callback(
195  fontset_table_.set_compare_callback(
197  fontset_table_.set_clear_callback(
199  AdaptedTemplates = nullptr;
200  BackupAdaptedTemplates = nullptr;
201  PreTrainedTemplates = nullptr;
202  AllProtosOn = nullptr;
203  AllConfigsOn = nullptr;
204  AllConfigsOff = nullptr;
205  TempProtoMask = nullptr;
206  NormProtos = nullptr;
207 
208  NumAdaptationsFailed = 0;
209 
210  learn_debug_win_ = nullptr;
211  learn_fragmented_word_debug_win_ = nullptr;
212  learn_fragments_debug_win_ = nullptr;
213 }
bool allow_blob_division
Definition: classify.h:423
double classify_max_certainty_margin
Definition: classify.h:445
double tessedit_class_miss_scale
Definition: classify.h:480
double classify_max_norm_scale_x
Definition: classify.h:439
double classify_min_norm_scale_y
Definition: classify.h:440
int matcher_permanent_classes_min
Definition: classify.h:467
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:294
#define TRUE
Definition: capi.h:51
bool classify_enable_adaptive_debugger
Definition: classify.h:455
NORM_PROTOS * NormProtos
Definition: classify.h:527
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool classify_bln_numeric_mode
Definition: classify.h:541
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
bool matcher_debug_separate_windows
Definition: classify.h:499
double matcher_rating_margin
Definition: classify.h:465
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:128
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
bool prioritize_division
Definition: classify.h:428
double classify_min_norm_scale_x
Definition: classify.h:438
bool classify_nonlinear_norm
Definition: classify.h:457
int classify_adapt_proto_threshold
Definition: classify.h:486
int classify_class_pruner_multiplier
Definition: classify.h:506
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:494
int classify_class_pruner_threshold
Definition: classify.h:504
bool classify_debug_character_fragments
Definition: classify.h:496
double classify_adapted_pruning_threshold
Definition: classify.h:484
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
int classify_learning_debug_level
Definition: classify.h:460
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
#define FALSE
Definition: capi.h:52
double matcher_good_threshold
Definition: classify.h:461
double speckle_large_max_size
Definition: classify.h:542
double matcher_reliable_adaptive_result
Definition: classify.h:462
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:146
double classify_adapted_pruning_factor
Definition: classify.h:482
double classify_max_rating_ratio
Definition: classify.h:443
double matcher_clustering_max_angle_delta
Definition: classify.h:473
double classify_misfit_junk_penalty
Definition: classify.h:476
ParamsVectors * params()
Definition: ccutil.h:62
double certainty_scale
Definition: classify.h:478
ShapeTable * shape_table_
Definition: classify.h:553
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
double matcher_perfect_threshold
Definition: classify.h:463
double matcher_avg_noise_size
Definition: classify.h:466
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:471
bool disable_character_fragments
Definition: classify.h:491
bool classify_enable_adaptive_matcher
Definition: classify.h:450
int classify_adapt_feature_threshold
Definition: classify.h:488
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:139
int classify_integer_matcher_multiplier
Definition: classify.h:510
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
int matcher_min_examples_for_prototyping
Definition: classify.h:469
double matcher_bad_match_pad
Definition: classify.h:464
BIT_VECTOR TempProtoMask
Definition: classify.h:524
char * classify_learn_debug_str
Definition: classify.h:500
double classify_max_norm_scale_y
Definition: classify.h:441
double classify_char_norm_range
Definition: classify.h:437
double speckle_rating_penalty
Definition: classify.h:544
int classify_cp_cutoff_strength
Definition: classify.h:508
bool classify_enable_learning
Definition: classify.h:430
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
IntegerMatcher im_
Definition: classify.h:544
bool classify_use_pre_adapted_templates
Definition: classify.h:452
bool classify_save_adapted_templates
Definition: classify.h:454

◆ ~Classify()

tesseract::Classify::~Classify ( )
virtual

Definition at line 215 of file classify.cpp.

215  {
217  delete learn_debug_win_;
218  delete learn_fragmented_word_debug_win_;
219  delete learn_fragments_debug_win_;
220 }
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460

Member Function Documentation

◆ AdaptableWord()

bool tesseract::Classify::AdaptableWord ( WERD_RES * word)

Return TRUE if the specified word is acceptable for adaptation.

Globals: none

Parameters
word: current word
Returns
true or false

Definition at line 823 of file adaptmatch.cpp.

823  {
824  if (word->best_choice == nullptr) return false;
825  int BestChoiceLength = word->best_choice->length();
826  float adaptable_score =
828  return // rules that apply in general - simplest to compute first
829  BestChoiceLength > 0 &&
830  BestChoiceLength == word->rebuild_word->NumBlobs() &&
831  BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
832  // This basically ensures that the word is at least a dictionary match
833  // (freq word, user word, system dawg word, etc).
834  // Since all the other adjustments will make adjust factor higher
835  // than higher than adaptable_score=1.1+0.05=1.15
836  // Since these are other flags that ensure that the word is dict word,
837  // this check could be at times redundant.
838  word->best_choice->adjust_factor() <= adaptable_score &&
839  // Make sure that alternative choices are not dictionary words.
840  word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
841 }
TWERD * rebuild_word
Definition: pageres.h:260
double segment_penalty_dict_case_ok
Definition: dict.h:588
int NumBlobs() const
Definition: blobs.h:432
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:445
float adjust_factor() const
Definition: ratngs.h:306
int length() const
Definition: ratngs.h:303
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:81
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:83
virtual Dict & getDict()
Definition: classify.h:107
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ AdaptiveClassifier()

void tesseract::Classify::AdaptiveClassifier ( TBLOB * Blob,
BLOB_CHOICE_LIST *  Choices 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Parameters
Blob: blob to be classified
[out] Choices: List of choices found by adaptive matcher. Filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 192 of file adaptmatch.cpp.

192  {
193  assert(Choices != nullptr);
194  ADAPT_RESULTS *Results = new ADAPT_RESULTS;
195  Results->Initialize();
196 
197  ASSERT_HOST(AdaptedTemplates != nullptr);
198 
199  DoAdaptiveMatch(Blob, Results);
200 
201  RemoveBadMatches(Results);
203  RemoveExtraPuncs(Results);
204  Results->ComputeBest();
205  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
206  Choices);
207 
208  // TODO(rays) Move to before ConvertMatchesToChoices!
209  if (LargeSpeckle(*Blob) || Choices->length() == 0)
210  AddLargeSpeckleTo(Results->BlobLength, Choices);
211 
212  if (matcher_debug_level >= 1) {
213  tprintf("AD Matches = ");
214  PrintAdaptiveMatchResults(*Results);
215  }
216 
217 #ifndef GRAPHICS_DISABLED
219  DebugAdaptiveClassifier(Blob, Results);
220 #endif
221 
222  delete Results;
223 } /* AdaptiveClassifier */
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:256
int32_t BlobLength
Definition: adaptmatch.cpp:93
void Initialize()
Definition: adaptmatch.cpp:103
bool classify_enable_adaptive_debugger
Definition: classify.h:455
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:233
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
TBOX bounding_box() const
Definition: blobs.cpp:478
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
void ComputeBest()
Definition: adaptmatch.cpp:109
const DENORM & denorm() const
Definition: blobs.h:347
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
void RemoveBadMatches(ADAPT_RESULTS *Results)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ AdaptiveClassifierIsEmpty()

bool tesseract::Classify::AdaptiveClassifierIsEmpty ( ) const
inline

Definition at line 326 of file classify.h.

326  {
327  return AdaptedTemplates->NumPermClasses == 0;
328  }
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
uint8_t NumPermClasses
Definition: adaptive.h:78

◆ AdaptiveClassifierIsFull()

bool tesseract::Classify::AdaptiveClassifierIsFull ( ) const
inline

Definition at line 325 of file classify.h.

325 { return NumAdaptationsFailed > 0; }

◆ AdaptToChar()

void tesseract::Classify::AdaptToChar ( TBLOB * Blob,
CLASS_ID  ClassId,
int  FontinfoId,
float  Threshold,
ADAPT_TEMPLATES  adaptive_templates 
)
Parameters
Blob: blob to add to templates for ClassId
ClassId: class to add blob to
FontinfoId: font information from pre-trained templates
Threshold: minimum match rating to existing template
adaptive_templates: current set of adapted templates

Globals:

  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs
Returns
none

Definition at line 857 of file adaptmatch.cpp.

859  {
860  int NumFeatures;
861  INT_FEATURE_ARRAY IntFeatures;
862  UnicharRating int_result;
863  INT_CLASS IClass;
864  ADAPT_CLASS Class;
865  TEMP_CONFIG TempConfig;
866  FEATURE_SET FloatFeatures;
867  int NewTempConfigId;
868 
869  if (!LegalClassId (ClassId))
870  return;
871 
872  int_result.unichar_id = ClassId;
873  Class = adaptive_templates->Class[ClassId];
874  assert(Class != nullptr);
875  if (IsEmptyAdaptedClass(Class)) {
876  InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
877  } else {
878  IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
879 
880  NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
881  if (NumFeatures <= 0) {
882  return; // Features already freed by GetAdaptiveFeatures.
883  }
884 
885  // Only match configs with the matching font.
886  BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
887  for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
888  if (GetFontinfoId(Class, cfg) == FontinfoId) {
889  SET_BIT(MatchingFontConfigs, cfg);
890  } else {
891  reset_bit(MatchingFontConfigs, cfg);
892  }
893  }
894  im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
895  NumFeatures, IntFeatures,
898  FreeBitVector(MatchingFontConfigs);
899 
900  SetAdaptiveThreshold(Threshold);
901 
902  if (1.0f - int_result.rating <= Threshold) {
903  if (ConfigIsPermanent(Class, int_result.config)) {
905  tprintf("Found good match to perm config %d = %4.1f%%.\n",
906  int_result.config, int_result.rating * 100.0);
907  FreeFeatureSet(FloatFeatures);
908  return;
909  }
910 
911  TempConfig = TempConfigFor(Class, int_result.config);
912  IncreaseConfidence(TempConfig);
913  if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
914  Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
915  }
917  tprintf("Increasing reliability of temp config %d to %d.\n",
918  int_result.config, TempConfig->NumTimesSeen);
919 
920  if (TempConfigReliable(ClassId, TempConfig)) {
921  MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
922  UpdateAmbigsGroup(ClassId, Blob);
923  }
924  } else {
926  tprintf("Found poor match to temp config %d = %4.1f%%.\n",
927  int_result.config, int_result.rating * 100.0);
929  DisplayAdaptedChar(Blob, IClass);
930  }
931  NewTempConfigId =
932  MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
933  NumFeatures, IntFeatures, FloatFeatures);
934  if (NewTempConfigId >= 0 &&
935  TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
936  MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
937  UpdateAmbigsGroup(ClassId, Blob);
938  }
939 
940 #ifndef GRAPHICS_DISABLED
942  DisplayAdaptedChar(Blob, IClass);
943  }
944 #endif
945  }
946  FreeFeatureSet(FloatFeatures);
947  }
948 } /* AdaptToChar */
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:950
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:174
bool matcher_debug_separate_windows
Definition: classify.h:499
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
void FreeBitVector(BIT_VECTOR BitVector)
Definition: bitvec.cpp:51
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
uint8_t NumConfigs
Definition: intproto.h:108
#define LegalClassId(c)
Definition: intproto.h:174
#define MAX_NUM_PROTOS
Definition: intproto.h:48
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:82
uint8_t NumTimesSeen
Definition: adaptive.h:41
int classify_learning_debug_level
Definition: classify.h:460
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:107
void SetAdaptiveThreshold(float Threshold)
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:150
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:92
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:89
int classify_adapt_feature_threshold
Definition: classify.h:488
#define reset_bit(array, bit)
Definition: bitvec.h:59
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:65
uint8_t MaxNumTimesSeen
Definition: adaptive.h:65
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:694
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:787
#define ClassForClassId(T, c)
Definition: intproto.h:176
INT_TEMPLATES Templates
Definition: adaptive.h:76
#define NO_DEBUG
Definition: adaptmatch.cpp:80
#define SET_BIT(array, bit)
Definition: bitvec.h:57
IntegerMatcher im_
Definition: classify.h:544

◆ AddLargeSpeckleTo()

void tesseract::Classify::AddLargeSpeckleTo ( int  blob_length,
BLOB_CHOICE_LIST *  choices 
)

Definition at line 233 of file classify.cpp.

233  {
234  BLOB_CHOICE_IT bc_it(choices);
235  // If there is no classifier result, we will use the worst possible certainty
236  // and corresponding rating.
237  float certainty = -getDict().certainty_scale;
238  float rating = rating_scale * blob_length;
239  if (!choices->empty() && blob_length > 0) {
240  bc_it.move_to_last();
241  BLOB_CHOICE* worst_choice = bc_it.data();
242  // Add speckle_rating_penalty to worst rating, matching old value.
243  rating = worst_choice->rating() + speckle_rating_penalty;
244  // Compute the rating to correspond to the certainty. (Used to be kept
245  // the same, but that messes up the language model search.)
246  certainty = -rating * getDict().certainty_scale /
247  (rating_scale * blob_length);
248  }
249  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
250  -1, 0.0f, FLT_MAX, 0,
252  bc_it.add_to_end(blob_choice);
253 }
double certainty_scale
Definition: dict.h:611
float rating() const
Definition: ratngs.h:80
virtual Dict & getDict()
Definition: classify.h:107
double speckle_rating_penalty
Definition: classify.h:544

◆ AddNewResult()

void tesseract::Classify::AddNewResult ( const UnicharRating new_result,
ADAPT_RESULTS results 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

Parameters
new_result — new result to add
[out] results — results to add new result to

Definition at line 998 of file adaptmatch.cpp.

999  {
1000  int old_match = FindScoredUnichar(new_result.unichar_id, *results);
1001 
1002  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
1003  (old_match < results->match.size() &&
1004  new_result.rating <= results->match[old_match].rating))
1005  return; // New one not good enough.
1006 
1007  if (!unicharset.get_fragment(new_result.unichar_id))
1008  results->HasNonfragment = true;
1009 
1010  if (old_match < results->match.size()) {
1011  results->match[old_match].rating = new_result.rating;
1012  } else {
1013  results->match.push_back(new_result);
1014  }
1015 
1016  if (new_result.rating > results->best_rating &&
1017  // Ensure that fragments do not affect best rating, class and config.
1018  // This is needed so that at least one non-fragmented character is
1019  // always present in the results.
1020  // TODO(daria): verify that this helps accuracy and does not
1021  // hurt performance.
1022  !unicharset.get_fragment(new_result.unichar_id)) {
1023  results->best_match_index = old_match;
1024  results->best_rating = new_result.rating;
1025  results->best_unichar_id = new_result.unichar_id;
1026  }
1027 } /* AddNewResult */
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:95
bool HasNonfragment
Definition: adaptmatch.cpp:94
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
int best_match_index
Definition: adaptmatch.cpp:96
float best_rating
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:68
int push_back(T object)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729

◆ AmbigClassifier()

void tesseract::Classify::AmbigClassifier ( const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
const TBLOB blob,
INT_TEMPLATES  templates,
ADAPT_CLASS classes,
UNICHAR_ID ambiguities,
ADAPT_RESULTS results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters
blob — blob to be classified
templates — built-in templates to classify against
classes — adapted class templates
ambiguities — array of unichar ids to match against (terminated by a negative id)
[out] results — place to put match results
int_features
fx_info

Definition at line 1049 of file adaptmatch.cpp.

1056  {
1057  if (int_features.empty()) return;
1058  uint8_t* CharNormArray = new uint8_t[unicharset.size()];
1059  UnicharRating int_result;
1060 
1061  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
1062  CharNormArray);
1063  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1064  if (debug)
1065  tprintf("AM Matches = ");
1066 
1067  int top = blob->bounding_box().top();
1068  int bottom = blob->bounding_box().bottom();
1069  while (*ambiguities >= 0) {
1070  CLASS_ID class_id = *ambiguities;
1071 
1072  int_result.unichar_id = class_id;
1073  im_.Match(ClassForClassId(templates, class_id),
1075  int_features.size(), &int_features[0],
1076  &int_result,
1079 
1080  ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
1081  results->BlobLength,
1083  CharNormArray, &int_result, results);
1084  ambiguities++;
1085  }
1086  delete [] CharNormArray;
1087 } /* AmbigClassifier */
int32_t BlobLength
Definition: adaptmatch.cpp:93
int size() const
Definition: genericvector.h:71
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool matcher_debug_separate_windows
Definition: classify.h:499
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
int size() const
Definition: unicharset.h:336
int16_t top() const
Definition: rect.h:58
UNICHARSET unicharset
Definition: ccutil.h:68
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
int classify_adapt_feature_threshold
Definition: classify.h:488
int classify_integer_matcher_multiplier
Definition: classify.h:510
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
#define ClassForClassId(T, c)
Definition: intproto.h:176
int16_t bottom() const
Definition: rect.h:65
#define NO_DEBUG
Definition: adaptmatch.cpp:80
IntegerMatcher im_
Definition: classify.h:544

◆ BaselineClassifier()

UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB Blob,
const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
ADAPT_TEMPLATES  Templates,
ADAPT_RESULTS Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs expected num features for each class
Parameters
Blob — blob to be classified
Templates — current set of adapted templates
Results — place to put match results
int_features
fx_info
Returns
Array of possible ambiguous chars that should be checked.

Definition at line 1269 of file adaptmatch.cpp.

1272  {
1273  if (int_features.empty()) return nullptr;
1274  uint8_t* CharNormArray = new uint8_t[unicharset.size()];
1275  ClearCharNormArray(CharNormArray);
1276 
1278  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1279  CharNormArray, BaselineCutoffs, &Results->CPResults);
1280 
1281  if (matcher_debug_level >= 2 || classify_debug_level > 1)
1282  tprintf("BL Matches = ");
1283 
1284  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1285  CharNormArray,
1286  Templates->Class, matcher_debug_flags, 0,
1287  Blob->bounding_box(), Results->CPResults, Results);
1288 
1289  delete [] CharNormArray;
1290  CLASS_ID ClassId = Results->best_unichar_id;
1291  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1292  return nullptr;
1293 
1294  return Templates->Class[ClassId]->
1295  Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1296 } /* BaselineClassifier */
int32_t BlobLength
Definition: adaptmatch.cpp:93
CLUSTERCONFIG Config
int size() const
Definition: genericvector.h:71
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:95
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
int size() const
Definition: unicharset.h:336
int best_match_index
Definition: adaptmatch.cpp:96
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
UNICHARSET unicharset
Definition: ccutil.h:68
int IntCastRounded(double x)
Definition: helpers.h:168
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:99
int32_t Length
Definition: intfx.h:36
const double kStandardFeatureLength
Definition: intfx.h:46
INT_TEMPLATES Templates
Definition: adaptive.h:76

◆ CharNormClassifier()

int tesseract::Classify::CharNormClassifier ( TBLOB blob,
const TrainingSample sample,
ADAPT_RESULTS adapt_results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters
blob — blob to be classified
sample — training sample of the unknown character, passed to the static classifier
adapt_results — place to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs

Definition at line 1315 of file adaptmatch.cpp.

1317  {
1318  // This is the length that is used for scaling ratings vs certainty.
1319  adapt_results->BlobLength =
1320  IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1321  GenericVector<UnicharRating> unichar_results;
1322  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1323  -1, &unichar_results);
1324  // Convert results to the format used internally by AdaptiveClassifier.
1325  for (int r = 0; r < unichar_results.size(); ++r) {
1326  AddNewResult(unichar_results[r], adapt_results);
1327  }
1328  return sample.num_features();
1329 } /* CharNormClassifier */
int32_t BlobLength
Definition: adaptmatch.cpp:93
Definition: cluster.h:32
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
int IntCastRounded(double x)
Definition: helpers.h:168
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
const DENORM & denorm() const
Definition: blobs.h:347
const double kStandardFeatureLength
Definition: intfx.h:46
Pix * pix() const
Definition: normalis.h:246

◆ CharNormTrainingSample()

int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
int  keep_this,
const TrainingSample sample,
GenericVector< UnicharRating > *  results 
)

Definition at line 1333 of file adaptmatch.cpp.

1336  {
1337  results->clear();
1338  ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
1339  adapt_results->Initialize();
1340  // Compute the bounding box of the features.
1341  uint32_t num_features = sample.num_features();
1342  // Only the top and bottom of the blob_box are used by MasterMatcher, so
1343  // fabricate right and left using top and bottom.
1344  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1345  sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1346  // Compute the char_norm_array from the saved cn_feature.
1347  FEATURE norm_feature = sample.GetCNFeature();
1348  uint8_t* char_norm_array = new uint8_t[unicharset.size()];
1349  int num_pruner_classes = std::max(unicharset.size(),
1351  uint8_t* pruner_norm_array = new uint8_t[num_pruner_classes];
1352  adapt_results->BlobLength =
1353  static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1354  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1355  pruner_norm_array);
1356 
1357  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1358  pruner_norm_array,
1359  shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1360  &adapt_results->CPResults);
1361  delete [] pruner_norm_array;
1362  if (keep_this >= 0) {
1363  adapt_results->CPResults[0].Class = keep_this;
1364  adapt_results->CPResults.truncate(1);
1365  }
1366  if (pruner_only) {
1367  // Convert pruner results to output format.
1368  for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1369  int class_id = adapt_results->CPResults[i].Class;
1370  results->push_back(
1371  UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1372  }
1373  } else {
1374  MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1375  char_norm_array,
1376  nullptr, matcher_debug_flags,
1378  blob_box, adapt_results->CPResults, adapt_results);
1379  // Convert master matcher results to output format.
1380  for (int i = 0; i < adapt_results->match.size(); i++) {
1381  results->push_back(adapt_results->match[i]);
1382  }
1384  }
1385  delete [] char_norm_array;
1386  delete adapt_results;
1387  return num_features;
1388 } /* CharNormTrainingSample */
int32_t BlobLength
Definition: adaptmatch.cpp:93
int size() const
Definition: genericvector.h:71
void Initialize()
Definition: adaptmatch.cpp:103
Definition: cluster.h:32
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
Definition: rect.h:34
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
int size() const
Definition: unicharset.h:336
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
UNICHARSET unicharset
Definition: ccutil.h:68
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
ShapeTable * shape_table_
Definition: classify.h:553
int push_back(T object)
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:99
int classify_integer_matcher_multiplier
Definition: classify.h:510
void truncate(int size)

◆ ClassAndConfigIDToFontOrShapeID()

int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const

Definition at line 2212 of file adaptmatch.cpp.

2213  {
2214  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2215  // Older inttemps have no font_ids.
2216  if (font_set_id < 0)
2217  return kBlankFontinfoId;
2218  const FontSet &fs = fontset_table_.get(font_set_id);
2219  ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2220  return fs.configs[int_result_config];
2221 }
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ClassIDToDebugStr()

STRING tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT templates,
int  class_id,
int  config_id 
) const

Definition at line 2199 of file adaptmatch.cpp.

2200  {
2201  STRING class_string;
2202  if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2203  int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2204  class_string = shape_table_->DebugStr(shape_id);
2205  } else {
2206  class_string = unicharset.debug_str(class_id);
2207  }
2208  return class_string;
2209 }
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
UNICHARSET unicharset
Definition: ccutil.h:68
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
ShapeTable * shape_table_
Definition: classify.h:553
Definition: strngs.h:45

◆ ClassifyAsNoise()

void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS results)

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters
results — results to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob

Definition at line 1403 of file adaptmatch.cpp.

1403  {
1404  float rating = results->BlobLength / matcher_avg_noise_size;
1405  rating *= rating;
1406  rating /= 1.0 + rating;
1407 
1408  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1409 } /* ClassifyAsNoise */
int32_t BlobLength
Definition: adaptmatch.cpp:93
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
double matcher_avg_noise_size
Definition: classify.h:466

◆ ClearCharNormArray()

void tesseract::Classify::ClearCharNormArray ( uint8_t *  char_norm_array)

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters
char_norm_array — array to be cleared

Definition at line 44 of file float2int.cpp.

44  {
45  memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
46 } /* ClearCharNormArray */
int size() const
Definition: unicharset.h:336
UNICHARSET unicharset
Definition: ccutil.h:68

◆ ComputeCharNormArrays()

void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT norm_feature,
INT_TEMPLATES_STRUCT templates,
uint8_t *  char_norm_array,
uint8_t *  pruner_array 
)

Definition at line 1702 of file adaptmatch.cpp.

1705  {
1706  ComputeIntCharNormArray(*norm_feature, char_norm_array);
1707  if (pruner_array != nullptr) {
1708  if (shape_table_ == nullptr) {
1709  ComputeIntCharNormArray(*norm_feature, pruner_array);
1710  } else {
1711  memset(pruner_array, UINT8_MAX,
1712  templates->NumClasses * sizeof(pruner_array[0]));
1713  // Each entry in the pruner norm array is the MIN of all the entries of
1714  // the corresponding unichars in the CharNormArray.
1715  for (int id = 0; id < templates->NumClasses; ++id) {
1716  int font_set_id = templates->Class[id]->font_set_id;
1717  const FontSet &fs = fontset_table_.get(font_set_id);
1718  for (int config = 0; config < fs.size; ++config) {
1719  const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1720  for (int c = 0; c < shape.size(); ++c) {
1721  if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1722  pruner_array[id] = char_norm_array[shape[c].unichar_id];
1723  }
1724  }
1725  }
1726  }
1727  }
1728  FreeFeature(norm_feature);
1729 }
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:56
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
ShapeTable * shape_table_
Definition: classify.h:553
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537

◆ ComputeCorrectedRating()

double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors 
)

Definition at line 1206 of file adaptmatch.cpp.

1211  {
1212  // Compute class feature corrections.
1213  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1214  cn_factors[unichar_id],
1215  matcher_multiplier);
1216  double miss_penalty = tessedit_class_miss_scale * feature_misses;
1217  double vertical_penalty = 0.0;
1218  // Penalize non-alnums for being vertical misfits.
1219  if (!unicharset.get_isalpha(unichar_id) &&
1220  !unicharset.get_isdigit(unichar_id) &&
1221  cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1222  int min_bottom, max_bottom, min_top, max_top;
1223  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1224  &min_top, &max_top);
1225  if (debug) {
1226  tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1227  top, min_top, max_top, bottom, min_bottom, max_bottom);
1228  }
1229  if (top < min_top || top > max_top ||
1230  bottom < min_bottom || bottom > max_bottom) {
1231  vertical_penalty = classify_misfit_junk_penalty;
1232  }
1233  }
1234  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1235  if (result < WORST_POSSIBLE_RATING)
1236  result = WORST_POSSIBLE_RATING;
1237  if (debug) {
1238  tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1239  unicharset.id_to_unichar(unichar_id),
1240  result * 100.0,
1241  cp_rating * 100.0,
1242  (1.0 - im_rating) * 100.0,
1243  (cn_corrected - (1.0 - im_rating)) * 100.0,
1244  cn_factors[unichar_id],
1245  miss_penalty * 100.0,
1246  vertical_penalty * 100.0);
1247  }
1248  return result;
1249 }
double tessedit_class_miss_scale
Definition: classify.h:480
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:87
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
UNICHARSET unicharset
Definition: ccutil.h:68
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
double classify_misfit_junk_penalty
Definition: classify.h:476
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
IntegerMatcher im_
Definition: classify.h:544

◆ ComputeIntCharNormArray()

void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT norm_feature,
uint8_t *  char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range 0 to 255 and stores it into char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • PreTrainedTemplates current set of built-in templates
Parameters
norm_feature — character normalization feature
[out] char_norm_array — place to put results, of size unicharset.size()

Definition at line 62 of file float2int.cpp.

63  {
64  for (int i = 0; i < unicharset.size(); i++) {
65  if (i < PreTrainedTemplates->NumClasses) {
66  int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
67  ComputeNormMatch(i, norm_feature, false));
68  char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
69  } else {
70  // Classes with no templates (eg. ambigs & ligatures) default
71  // to worst match.
72  char_norm_array[i] = MAX_INT_CHAR_NORM;
73  }
74  }
75 } /* ComputeIntCharNormArray */
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:83
#define INT_CHAR_NORM_RANGE
Definition: intproto.h:130
int size() const
Definition: unicharset.h:336
UNICHARSET unicharset
Definition: ccutil.h:68
#define MAX_INT_CHAR_NORM
Definition: float2int.cpp:27
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:111

◆ ComputeIntFeatures()

void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters
Features — floating point pico-features to be converted
[out] IntFeatures — array to put converted features into

Definition at line 90 of file float2int.cpp.

91  {
92  float YShift;
93 
95  YShift = BASELINE_Y_SHIFT;
96  else
97  YShift = Y_SHIFT;
98 
99  for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {
100  FEATURE Feature = Features->Features[Fid];
101 
102  IntFeatures[Fid].X =
104  IntFeatures[Fid].Y =
105  Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
106  IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
108  IntFeatures[Fid].CP_misses = 0;
109  }
110 } /* ComputeIntFeatures */
uint8_t CircBucketFor(float param, float offset, int num_buckets)
Definition: intproto.cpp:435
#define BASELINE_Y_SHIFT
Definition: float2int.h:28
#define Y_SHIFT
Definition: intproto.h:42
float Params[1]
Definition: ocrfeatures.h:62
FEATURE Features[1]
Definition: ocrfeatures.h:69
uint16_t NumFeatures
Definition: ocrfeatures.h:67
#define ANGLE_SHIFT
Definition: intproto.h:40
uint8_t Bucket8For(float param, float offset, int num_buckets)
Definition: intproto.cpp:421
#define X_SHIFT
Definition: intproto.h:41
#define INT_FEAT_RANGE
Definition: float2int.h:27

◆ ComputeNormMatch()

float tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT feature,
bool  DebugMatch 
)

This routine compares feature against each character normalization proto for ClassId and returns the match rating of the best match.

Parameters
ClassId — id of class to match against
feature — character normalization feature
DebugMatch — controls dump of debug info

Globals: NormProtos character normalization prototypes

Returns
Best match rating for feature against protos of ClassId.

Definition at line 83 of file normmatch.cpp.

85  {
86  LIST Protos;
87  float BestMatch;
88  float Match;
89  float Delta;
90  PROTOTYPE *Proto;
91  int ProtoId;
92 
93  if (ClassId >= NormProtos->NumProtos) {
94  ClassId = NO_CLASS;
95  }
96 
97  /* handle requests for classification as noise */
98  if (ClassId == NO_CLASS) {
99  /* kludge - clean up constants and make into control knobs later */
100  Match = (feature.Params[CharNormLength] *
101  feature.Params[CharNormLength] * 500.0 +
102  feature.Params[CharNormRx] *
103  feature.Params[CharNormRx] * 8000.0 +
104  feature.Params[CharNormRy] *
105  feature.Params[CharNormRy] * 8000.0);
106  return (1.0 - NormEvidenceOf (Match));
107  }
108 
109  BestMatch = FLT_MAX;
110  Protos = NormProtos->Protos[ClassId];
111 
112  if (DebugMatch) {
113  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
114  }
115 
116  ProtoId = 0;
117  iterate(Protos) {
118  Proto = (PROTOTYPE *) first_node (Protos);
119  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
120  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
121  if (DebugMatch) {
122  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
123  Proto->Mean[CharNormY], Delta,
124  Proto->Weight.Elliptical[CharNormY], Match);
125  }
126  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
127  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
128  if (DebugMatch) {
129  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
130  Proto->Mean[CharNormRx], Delta,
131  Proto->Weight.Elliptical[CharNormRx], Match);
132  }
133  // Ry is width! See intfx.cpp.
134  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
135  if (DebugMatch) {
136  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
137  Proto->Mean[CharNormRy], Delta,
138  Proto->Weight.Elliptical[CharNormRy]);
139  }
140  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
141  Delta *= kWidthErrorWeighting;
142  Match += Delta;
143  if (DebugMatch) {
144  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
145  Match, Match / classify_norm_adj_midpoint,
146  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
147  }
148 
149  if (Match < BestMatch)
150  BestMatch = Match;
151 
152  ProtoId++;
153  }
154  return 1.0 - NormEvidenceOf(BestMatch);
155 } /* ComputeNormMatch */
float * Mean
Definition: cluster.h:78
NORM_PROTOS * NormProtos
Definition: classify.h:527
LIST * Protos
Definition: normmatch.cpp:39
float * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
float Params[1]
Definition: ocrfeatures.h:62
UNICHARSET unicharset
Definition: ccutil.h:68
const double kWidthErrorWeighting
Definition: normmatch.cpp:63
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define first_node(l)
Definition: oldlist.h:141
#define iterate(l)
Definition: oldlist.h:161
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
double NormEvidenceOf(double NormAdj)
Definition: normmatch.cpp:179
#define NO_CLASS
Definition: matchdefs.h:37
double classify_norm_adj_midpoint
Definition: normmatch.cpp:60

◆ ConvertMatchesToChoices()

void tesseract::Classify::ConvertMatchesToChoices ( const DENORM & denorm,
const TBOX & box,
ADAPT_RESULTS * Results,
BLOB_CHOICE_LIST * Choices
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1417 of file adaptmatch.cpp.

1419  {
1420  assert(Choices != nullptr);
1421  float Rating;
1422  float Certainty;
1423  BLOB_CHOICE_IT temp_it;
1424  bool contains_nonfrag = false;
1425  temp_it.set_to_list(Choices);
1426  int choices_length = 0;
1427  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1428  // number of returned results, but with a shape_table_ we want to have room
1429  // for at least the biggest shape (which might contain hundreds of Indic
1430  // grapheme fragments) and more, so use double the size of the biggest shape
1431  // if that is more than the default.
1432  int max_matches = MAX_MATCHES;
1433  if (shape_table_ != nullptr) {
1434  max_matches = shape_table_->MaxNumUnichars() * 2;
1435  if (max_matches < MAX_MATCHES)
1436  max_matches = MAX_MATCHES;
1437  }
1438 
1439  float best_certainty = -FLT_MAX;
1440  for (int i = 0; i < Results->match.size(); i++) {
1441  const UnicharRating& result = Results->match[i];
1442  bool adapted = result.adapted;
1443  bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1444  if (temp_it.length()+1 == max_matches &&
1445  !contains_nonfrag && current_is_frag) {
1446  continue; // look for a non-fragmented character to fill the
1447  // last spot in Choices if only fragments are present
1448  }
1449  // BlobLength can never be legally 0, this means recognition failed.
1450  // But we must return a classification result because some invoking
1451  // functions (chopper/permuter) do not anticipate a null blob choice.
1452  // So we need to assign a poor, but not infinitely bad score.
1453  if (Results->BlobLength == 0) {
1454  Certainty = -20;
1455  Rating = 100; // should be -certainty * real_blob_length
1456  } else {
1457  Rating = Certainty = (1.0f - result.rating);
1458  Rating *= rating_scale * Results->BlobLength;
1459  Certainty *= -(getDict().certainty_scale);
1460  }
1461  // Adapted results, by their very nature, should have good certainty.
1462  // Those that don't are at best misleading, and often lead to errors,
1463  // so don't accept adapted results that are too far behind the best result,
1464  // whether adapted or static.
1465  // TODO(rays) find some way of automatically tuning these constants.
1466  if (Certainty > best_certainty) {
1467  best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1468  } else if (adapted &&
1469  Certainty / classify_adapted_pruning_factor < best_certainty) {
1470  continue; // Don't accept bad adapted results.
1471  }
1472 
1473  float min_xheight, max_xheight, yshift;
1474  denorm.XHeightRange(result.unichar_id, unicharset, box,
1475  &min_xheight, &max_xheight, &yshift);
1476  BLOB_CHOICE* choice =
1477  new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1479  min_xheight, max_xheight, yshift,
1480  adapted ? BCC_ADAPTED_CLASSIFIER
1482  choice->set_fonts(result.fonts);
1483  temp_it.add_to_end(choice);
1484  contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1485  choices_length++;
1486  if (choices_length >= max_matches) break;
1487  }
1488  Results->match.truncate(choices_length);
1489 } // ConvertMatchesToChoices
#define MAX_MATCHES
Definition: adaptmatch.cpp:78
int32_t BlobLength
Definition: adaptmatch.cpp:93
int size() const
Definition: genericvector.h:71
double certainty_scale
Definition: dict.h:611
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:95
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:429
double classify_adapted_pruning_threshold
Definition: classify.h:484
UNICHARSET unicharset
Definition: ccutil.h:68
double classify_adapted_pruning_factor
Definition: classify.h:482
int MaxNumUnichars() const
Definition: shapetable.cpp:455
ShapeTable * shape_table_
Definition: classify.h:553
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
void truncate(int size)
virtual Dict & getDict()
Definition: classify.h:107
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658
GenericVector< ScoredFont > fonts
Definition: shapetable.h:88

◆ ConvertProto()

void tesseract::Classify::ConvertProto ( PROTO  Proto,
int  ProtoId,
INT_CLASS  Class 
)

This routine converts Proto to integer format and installs it as ProtoId in Class.

Parameters
Proto — floating-point proto to be converted to integer format
ProtoId — id of proto
Class — integer class to add converted proto to
Returns
none
Note
Globals: none

Definition at line 496 of file intproto.cpp.

496  {
497  INT_PROTO P;
498  float Param;
499 
500  assert(ProtoId < Class->NumProtos);
501 
502  P = ProtoForProtoId(Class, ProtoId);
503 
504  Param = Proto->A * 128;
505  P->A = TruncateParam(Param, -128, 127, nullptr);
506 
507  Param = -Proto->B * 256;
508  P->B = TruncateParam(Param, 0, 255, nullptr);
509 
510  Param = Proto->C * 128;
511  P->C = TruncateParam(Param, -128, 127, nullptr);
512 
513  Param = Proto->Angle * 256;
514  if (Param < 0 || Param >= 256)
515  P->Angle = 0;
516  else
517  P->Angle = (uint8_t) Param;
518 
519  /* round proto length to nearest integer number of pico-features */
520  Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
521  Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255, nullptr);
523  cprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
524  P->A, P->B, P->C, Class->ProtoLengths[ProtoId]);
525 } /* ConvertProto */
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
float B
Definition: protos.h:44
#define ProtoForProtoId(C, P)
Definition: intproto.h:166
uint8_t * ProtoLengths
Definition: intproto.h:110
#define GetPicoFeatureLength()
Definition: picofeat.h:57
float Length
Definition: protos.h:49
int classify_learning_debug_level
Definition: classify.h:460
int TruncateParam(float Param, int Min, int Max, char *Id)
Definition: intproto.cpp:1727
float C
Definition: protos.h:45
float Angle
Definition: protos.h:48
uint8_t Angle
Definition: intproto.h:85
float A
Definition: protos.h:43

◆ CreateIntTemplates()

INT_TEMPLATES tesseract::Classify::CreateIntTemplates ( CLASSES FloatProtos,
const UNICHARSET & target_unicharset
)

This routine converts from the old floating point format to the new integer format.

Parameters
FloatProtos — prototypes in old floating-point format
target_unicharset — the UNICHARSET to use
Returns
New set of training templates in integer format.
Note
Globals: none

Definition at line 535 of file intproto.cpp.

537  {
538  INT_TEMPLATES IntTemplates;
539  CLASS_TYPE FClass;
540  INT_CLASS IClass;
541  int ClassId;
542  int ProtoId;
543  int ConfigId;
544 
545  IntTemplates = NewIntTemplates();
546 
547  for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
548  FClass = &(FloatProtos[ClassId]);
549  if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
550  strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
551  cprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
552  target_unicharset.id_to_unichar(ClassId));
553  }
554  assert(UnusedClassIdIn(IntTemplates, ClassId));
555  IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
556  FontSet fs;
557  fs.size = FClass->font_set.size();
558  fs.configs = new int[fs.size];
559  for (int i = 0; i < fs.size; ++i) {
560  fs.configs[i] = FClass->font_set.get(i);
561  }
562  if (this->fontset_table_.contains(fs)) {
563  IClass->font_set_id = this->fontset_table_.get_id(fs);
564  delete[] fs.configs;
565  } else {
566  IClass->font_set_id = this->fontset_table_.push_back(fs);
567  }
568  AddIntClass(IntTemplates, ClassId, IClass);
569 
570  for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
571  AddIntProto(IClass);
572  ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
573  AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
575  AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
576  }
577 
578  for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
579  AddIntConfig(IClass);
580  ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
581  }
582  }
583  return (IntTemplates);
584 } /* CreateIntTemplates */
int16_t NumProtos
Definition: protos.h:61
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
#define UnusedClassIdIn(T, c)
Definition: intproto.h:175
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:283
UnicityTableEqEq< int > font_set
Definition: protos.h:67
#define ProtoIn(Class, Pid)
Definition: protos.h:121
int16_t NumConfigs
Definition: protos.h:64
int size() const
Definition: unicharset.h:336
int classify_learning_debug_level
Definition: classify.h:460
int size() const
Return the size used.
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:262
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:370
const T & get(int id) const
Return the object from an id.
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:692
void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class)
Definition: intproto.cpp:232
CONFIGS Configurations
Definition: protos.h:66
INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs)
Definition: intproto.cpp:636
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:330
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:469

◆ DebugAdaptiveClassifier()

void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB * blob,
ADAPT_RESULTS * Results
)
Parameters
blob — blob whose classification is being debugged
Results — results of match being debugged

Globals: none

Definition at line 1501 of file adaptmatch.cpp.

1502  {
1503  if (static_classifier_ == nullptr) return;
1504  INT_FX_RESULT_STRUCT fx_info;
1506  TrainingSample* sample =
1507  BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1508  if (sample == nullptr) return;
1509  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1510  Results->best_unichar_id);
1511 } /* DebugAdaptiveClassifier */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
Definition: cluster.h:32
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:95
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
const DENORM & denorm() const
Definition: blobs.h:347
Pix * pix() const
Definition: normalis.h:246

◆ DisplayAdaptedChar()

void tesseract::Classify::DisplayAdaptedChar ( TBLOB blob,
INT_CLASS_STRUCT int_class 
)

Definition at line 950 of file adaptmatch.cpp.

950  {
951 #ifndef GRAPHICS_DISABLED
952  INT_FX_RESULT_STRUCT fx_info;
954  TrainingSample* sample =
956  &bl_features);
957  if (sample == nullptr) return;
958 
959  UnicharRating int_result;
960  im_.Match(int_class, AllProtosOn, AllConfigsOn,
961  bl_features.size(), &bl_features[0],
964  tprintf("Best match to temp config %d = %4.1f%%.\n",
965  int_result.config, int_result.rating * 100.0);
967  uint32_t ConfigMask;
968  ConfigMask = 1 << int_result.config;
970  im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
971  bl_features.size(), &bl_features[0],
975  }
976 
977  delete sample;
978 #endif
979 }
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
int size() const
Definition: genericvector.h:71
void UpdateMatchDisplay()
Definition: intproto.cpp:451
Definition: cluster.h:32
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool matcher_debug_separate_windows
Definition: classify.h:499
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
bool classify_nonlinear_norm
Definition: classify.h:457
int classify_learning_debug_level
Definition: classify.h:460
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int classify_adapt_feature_threshold
Definition: classify.h:488
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
#define NO_DEBUG
Definition: adaptmatch.cpp:80
IntegerMatcher im_
Definition: classify.h:544

◆ DoAdaptiveMatch()

void tesseract::Classify::DoAdaptiveMatch ( TBLOB * Blob,
ADAPT_RESULTS * Results
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters
Blob — blob to be classified
Results — place to put match results

Globals:

  • PreTrainedTemplates built-in training templates
  • AdaptedTemplates templates adapted for this page
  • matcher_reliable_adaptive_result rating limit for a great match

Definition at line 1534 of file adaptmatch.cpp.

1534  {
1535  UNICHAR_ID *Ambiguities;
1536 
1537  INT_FX_RESULT_STRUCT fx_info;
1539  TrainingSample* sample =
1541  &bl_features);
1542  if (sample == nullptr) return;
1543 
1544  // TODO: With LSTM, static_classifier_ is nullptr.
1545  // Return to avoid crash in CharNormClassifier.
1546  if (static_classifier_ == nullptr) {
1547  delete sample;
1548  return;
1549  }
1550 
1552  tess_cn_matching) {
1553  CharNormClassifier(Blob, *sample, Results);
1554  } else {
1555  Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1556  AdaptedTemplates, Results);
1557  if ((!Results->match.empty() &&
1558  MarginalMatch(Results->best_rating,
1560  !tess_bn_matching) ||
1561  Results->match.empty()) {
1562  CharNormClassifier(Blob, *sample, Results);
1563  } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1564  AmbigClassifier(bl_features, fx_info, Blob,
1567  Ambiguities,
1568  Results);
1569  }
1570  }
1571 
1572  // Force the blob to be classified as noise
1573  // if the results contain only fragments.
1574  // TODO(daria): verify that this is better than
1575  // just adding a nullptr classification.
1576  if (!Results->HasNonfragment || Results->match.empty())
1577  ClassifyAsNoise(Results);
1578  delete sample;
1579 } /* DoAdaptiveMatch */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
void ClassifyAsNoise(ADAPT_RESULTS *Results)
int UNICHAR_ID
Definition: unichar.h:35
int matcher_permanent_classes_min
Definition: classify.h:467
Definition: cluster.h:32
bool HasNonfragment
Definition: adaptmatch.cpp:94
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
bool classify_nonlinear_norm
Definition: classify.h:457
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
float best_rating
Definition: adaptmatch.cpp:97
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:132
double matcher_reliable_adaptive_result
Definition: classify.h:462
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
bool empty() const
Definition: genericvector.h:90
uint8_t NumPermClasses
Definition: adaptive.h:78
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)

◆ EndAdaptiveClassifier()

void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

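Globals:

  • AdaptedTemplates current set of adapted templates
  • classify_save_adapted_templates, classify_enable_adaptive_matcher flags consulted before the adapted templates are saved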

Definition at line 460 of file adaptmatch.cpp.

460  {
461  STRING Filename;
462  FILE *File;
463 
464  if (AdaptedTemplates != nullptr &&
466  Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
467  File = fopen (Filename.string(), "wb");
468  if (File == nullptr)
469  cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
470  else {
471  cprintf ("\nSaving adapted templates to %s ...", Filename.string());
472  fflush(stdout);
474  cprintf ("\n");
475  fclose(File);
476  }
477  }
478 
479  if (AdaptedTemplates != nullptr) {
481  AdaptedTemplates = nullptr;
482  }
483  if (BackupAdaptedTemplates != nullptr) {
485  BackupAdaptedTemplates = nullptr;
486  }
487 
488  if (PreTrainedTemplates != nullptr) {
490  PreTrainedTemplates = nullptr;
491  }
493  FreeNormProtos();
494  if (AllProtosOn != nullptr) {
499  AllProtosOn = nullptr;
500  AllConfigsOn = nullptr;
501  AllConfigsOff = nullptr;
502  TempProtoMask = nullptr;
503  }
504  delete shape_table_;
505  shape_table_ = nullptr;
506  delete static_classifier_;
507  static_classifier_ = nullptr;
508 } /* EndAdaptiveClassifier */
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:708
BIT_VECTOR AllProtosOn
Definition: classify.h:521
const char * string() const
Definition: strngs.cpp:196
void EndDangerousAmbigs()
Definition: stopper.cpp:358
void FreeBitVector(BIT_VECTOR BitVector)
Definition: bitvec.cpp:51
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
STRING imagefile
Definition: ccutil.h:70
ShapeTable * shape_table_
Definition: classify.h:553
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
bool classify_enable_adaptive_matcher
Definition: classify.h:450
Definition: strngs.h:45
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:454
BIT_VECTOR TempProtoMask
Definition: classify.h:524
virtual Dict & getDict()
Definition: classify.h:107
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:76
bool classify_save_adapted_templates
Definition: classify.h:454

◆ ExpandShapesAndApplyCorrections()

void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS * classes,
bool debug,
int class_id,
int bottom,
int top,
float cp_rating,
int blob_length,
int matcher_multiplier,
const uint8_t * cn_factors,
UnicharRating * int_result,
ADAPT_RESULTS * final_results
)

Definition at line 1132 of file adaptmatch.cpp.

1136  {
1137  if (classes != nullptr) {
1138  // Adapted result. Convert configs to fontinfo_ids.
1139  int_result->adapted = true;
1140  for (int f = 0; f < int_result->fonts.size(); ++f) {
1141  int_result->fonts[f].fontinfo_id =
1142  GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1143  }
1144  } else {
1145  // Pre-trained result. Map fonts using font_sets_.
1146  int_result->adapted = false;
1147  for (int f = 0; f < int_result->fonts.size(); ++f) {
1148  int_result->fonts[f].fontinfo_id =
1150  int_result->fonts[f].fontinfo_id);
1151  }
1152  if (shape_table_ != nullptr) {
1153  // Two possible cases:
1154  // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1155  // int_result->fonts are the same. In this case build a new vector of
1156  // mapped fonts and replace the fonts in int_result.
1157  // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1158  // by int_result. In this case, build a vector of UnicharRating to
1159  // gather together different font-ids for each unichar. Also covers case1.
1160  GenericVector<UnicharRating> mapped_results;
1161  for (int f = 0; f < int_result->fonts.size(); ++f) {
1162  int shape_id = int_result->fonts[f].fontinfo_id;
1163  const Shape& shape = shape_table_->GetShape(shape_id);
1164  for (int c = 0; c < shape.size(); ++c) {
1165  int unichar_id = shape[c].unichar_id;
1166  if (!unicharset.get_enabled(unichar_id)) continue;
1167  // Find the mapped_result for unichar_id.
1168  int r = 0;
1169  for (r = 0; r < mapped_results.size() &&
1170  mapped_results[r].unichar_id != unichar_id; ++r) {}
1171  if (r == mapped_results.size()) {
1172  mapped_results.push_back(*int_result);
1173  mapped_results[r].unichar_id = unichar_id;
1174  mapped_results[r].fonts.truncate(0);
1175  }
1176  for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1177  mapped_results[r].fonts.push_back(
1178  ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1179  }
1180  }
1181  }
1182  for (int m = 0; m < mapped_results.size(); ++m) {
1183  mapped_results[m].rating =
1184  ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1185  cp_rating, int_result->rating,
1186  int_result->feature_misses, bottom, top,
1187  blob_length, matcher_multiplier, cn_factors);
1188  AddNewResult(mapped_results[m], final_results);
1189  }
1190  return;
1191  }
1192  }
1193  if (unicharset.get_enabled(class_id)) {
1194  int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1195  int_result->rating,
1196  int_result->feature_misses,
1197  bottom, top, blob_length,
1198  matcher_multiplier, cn_factors);
1199  AddNewResult(*int_result, final_results);
1200  }
1201 }
int size() const
Definition: genericvector.h:71
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:174
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
UNICHARSET unicharset
Definition: ccutil.h:68
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
ShapeTable * shape_table_
Definition: classify.h:553
int push_back(T object)
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
void truncate(int size)
GenericVector< ScoredFont > fonts
Definition: shapetable.h:88

◆ ExtractFeatures()

void tesseract::Classify::ExtractFeatures ( const TBLOB & blob,
bool nonlinear_norm,
GenericVector< INT_FEATURE_STRUCT > * bl_features,
GenericVector< INT_FEATURE_STRUCT > * cn_features,
INT_FX_RESULT_STRUCT * results,
GenericVector< int > * outline_cn_counts
)
static

Definition at line 444 of file intfx.cpp.

449  {
450  DENORM bl_denorm, cn_denorm;
451  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
452  &bl_denorm, &cn_denorm, results);
453  if (outline_cn_counts != nullptr)
454  outline_cn_counts->truncate(0);
455  // Iterate the outlines.
456  for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) {
457  // Iterate the polygon.
458  EDGEPT* loop_pt = ol->FindBestStartPt();
459  EDGEPT* pt = loop_pt;
460  if (pt == nullptr) continue;
461  do {
462  if (pt->IsHidden()) continue;
463  // Find a run of equal src_outline.
464  EDGEPT* last_pt = pt;
465  do {
466  last_pt = last_pt->next;
467  } while (last_pt != loop_pt && !last_pt->IsHidden() &&
468  last_pt->src_outline == pt->src_outline);
469  last_pt = last_pt->prev;
470  // Until the adaptive classifier can be weaned off polygon segments,
471  // we have to force extraction from the polygon for the bl_features.
472  ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
473  true, bl_features);
474  ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
475  false, cn_features);
476  pt = last_pt;
477  } while ((pt = pt->next) != loop_pt);
478  if (outline_cn_counts != nullptr)
479  outline_cn_counts->push_back(cn_features->size());
480  }
481  results->NumBL = bl_features->size();
482  results->NumCN = cn_features->size();
483  results->YBottom = blob.bounding_box().bottom();
484  results->YTop = blob.bounding_box().top();
485  results->Width = blob.bounding_box().width();
486 }
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:131
TESSLINE * next
Definition: blobs.h:265
int16_t Width
Definition: intfx.h:40
int size() const
Definition: genericvector.h:71
int16_t NumBL
Definition: intfx.h:39
bool IsHidden() const
Definition: blobs.h:160
int16_t width() const
Definition: rect.h:115
int16_t top() const
Definition: rect.h:58
Definition: blobs.h:83
EDGEPT * prev
Definition: blobs.h:177
TBOX bounding_box() const
Definition: blobs.cpp:478
uint8_t YBottom
Definition: intfx.h:41
int push_back(T object)
uint8_t YTop
Definition: intfx.h:42
C_OUTLINE * src_outline
Definition: blobs.h:178
const double kStandardFeatureLength
Definition: intfx.h:46
void truncate(int size)
int16_t NumCN
Definition: intfx.h:39
int16_t bottom() const
Definition: rect.h:65
TESSLINE * outlines
Definition: blobs.h:384
EDGEPT * next
Definition: blobs.h:176

◆ ExtractIntCNFeatures()

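FEATURE_SET tesseract::Classify::ExtractIntCNFeatures ( const TBLOB & blob,
const INT_FX_RESULT_STRUCT & fx_info
)
Parameters
blob — blob to extract features from
fx_info — INT_FX_RESULT_STRUCT that is copied into a local and handed to BlobToTrainingSample; the caller's copy is not modified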
Returns
Integer character-normalized features for blob.

Definition at line 219 of file picofeat.cpp.

220  {
221  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
224  blob, false, &local_fx_info, &bl_features);
225  if (sample == nullptr) return nullptr;
226 
227  uint32_t num_features = sample->num_features();
228  const INT_FEATURE_STRUCT* features = sample->features();
229  FEATURE_SET feature_set = NewFeatureSet(num_features);
230  for (uint32_t f = 0; f < num_features; ++f) {
231  FEATURE feature = NewFeature(&IntFeatDesc);
232 
233  feature->Params[IntX] = features[f].X;
234  feature->Params[IntY] = features[f].Y;
235  feature->Params[IntDir] = features[f].Theta;
236  AddFeature(feature_set, feature);
237  }
238  delete sample;
239 
240  return feature_set;
241 } /* ExtractIntCNFeatures */
Definition: picofeat.h:31
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:97
Definition: cluster.h:32
const FEATURE_DESC_STRUCT IntFeatDesc
float Params[1]
Definition: ocrfeatures.h:62
Definition: picofeat.h:30
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:81
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:41

◆ ExtractIntGeoFeatures()

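FEATURE_SET tesseract::Classify::ExtractIntGeoFeatures ( const TBLOB & blob,
const INT_FX_RESULT_STRUCT & fx_info
)
Parameters
blob — blob to extract features from
fx_info — INT_FX_RESULT_STRUCT that is copied into a local and handed to BlobToTrainingSample; the caller's copy is not modified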
Returns
Geometric (top/bottom/width) features for blob.

Definition at line 249 of file picofeat.cpp.

250  {
251  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
254  blob, false, &local_fx_info, &bl_features);
255  if (sample == nullptr) return nullptr;
256 
257  FEATURE_SET feature_set = NewFeatureSet(1);
258  FEATURE feature = NewFeature(&IntFeatDesc);
259 
260  feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
261  feature->Params[GeoTop] = sample->geo_feature(GeoTop);
262  feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
263  AddFeature(feature_set, feature);
264  delete sample;
265 
266  return feature_set;
267 } /* ExtractIntGeoFeatures */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:97
Definition: cluster.h:32
const FEATURE_DESC_STRUCT IntFeatDesc
float Params[1]
Definition: ocrfeatures.h:62
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:81
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:41

◆ ExtractOutlineFeatures()

FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB * Blob)

Convert each segment in the outline to a feature and return the features.

Parameters
Blob — blob to extract outline-features from (may be nullptr, in which case an empty feature set is returned)
Returns
Outline-features for Blob.
Note
Globals: none

Definition at line 42 of file outfeat.cpp.

42  {
43  LIST Outlines;
44  LIST RemainingOutlines;
45  MFOUTLINE Outline;
46  FEATURE_SET FeatureSet;
47  float XScale, YScale;
48 
49  FeatureSet = NewFeatureSet (MAX_OUTLINE_FEATURES);
50  if (Blob == nullptr)
51  return (FeatureSet);
52 
53  Outlines = ConvertBlob (Blob);
54 
55  NormalizeOutlines(Outlines, &XScale, &YScale);
56  RemainingOutlines = Outlines;
57  iterate(RemainingOutlines) {
58  Outline = (MFOUTLINE) first_node (RemainingOutlines);
59  ConvertToOutlineFeatures(Outline, FeatureSet);
60  }
62  NormalizeOutlineX(FeatureSet);
63  FreeOutlines(Outlines);
64  return (FeatureSet);
65 } /* ExtractOutlineFeatures */
void NormalizeOutlineX(FEATURE_SET FeatureSet)
Definition: outfeat.cpp:150
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:285
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:97
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:172
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:39
#define first_node(l)
Definition: oldlist.h:141
#define iterate(l)
Definition: oldlist.h:161
void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: outfeat.cpp:111
#define MAX_OUTLINE_FEATURES
Definition: outfeat.h:35
LIST MFOUTLINE
Definition: mfoutline.h:34

◆ ExtractPicoFeatures()

FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB * Blob)

Operation: Dummy for now.

Globals:

  • classify_norm_method normalization method currently specified

Parameters
Blob — blob to extract pico-features from
Returns
Pico-features for Blob.

Definition at line 64 of file picofeat.cpp.

64  {
65  LIST Outlines;
66  LIST RemainingOutlines;
67  MFOUTLINE Outline;
68  FEATURE_SET FeatureSet;
69  float XScale, YScale;
70 
71  FeatureSet = NewFeatureSet(MAX_PICO_FEATURES);
72  Outlines = ConvertBlob(Blob);
73  NormalizeOutlines(Outlines, &XScale, &YScale);
74  RemainingOutlines = Outlines;
75  iterate(RemainingOutlines) {
76  Outline = (MFOUTLINE) first_node (RemainingOutlines);
77  ConvertToPicoFeatures2(Outline, FeatureSet);
78  }
80  NormalizePicoX(FeatureSet);
81  FreeOutlines(Outlines);
82  return (FeatureSet);
83 
84 } /* ExtractPicoFeatures */
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:285
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:97
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:172
#define MAX_PICO_FEATURES
Definition: picofeat.h:46
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:39
void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: picofeat.cpp:156
void NormalizePicoX(FEATURE_SET FeatureSet)
Definition: picofeat.cpp:195
#define first_node(l)
Definition: oldlist.h:141
#define iterate(l)
Definition: oldlist.h:161
LIST MFOUTLINE
Definition: mfoutline.h:34

◆ FreeNormProtos()

void tesseract::Classify::FreeNormProtos ( )

Definition at line 157 of file normmatch.cpp.

157  {
158  if (NormProtos != nullptr) {
159  for (int i = 0; i < NormProtos->NumProtos; i++)
163  Efree(NormProtos);
164  NormProtos = nullptr;
165  }
166 }
NORM_PROTOS * NormProtos
Definition: classify.h:527
LIST * Protos
Definition: normmatch.cpp:39
void Efree(void *ptr)
Definition: emalloc.cpp:45
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:38

◆ get_fontinfo_table() [1/2]

UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( )
inline

Definition at line 386 of file classify.h.

386  {
387  return fontinfo_table_;
388  }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529

◆ get_fontinfo_table() [2/2]

const UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( ) const
inline

Definition at line 389 of file classify.h.

389  {
390  return fontinfo_table_;
391  }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529

◆ get_fontset_table()

UnicityTable<FontSet>& tesseract::Classify::get_fontset_table ( )
inline

Definition at line 392 of file classify.h.

392  {
393  return fontset_table_;
394  }
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537

◆ GetAdaptiveFeatures()

int tesseract::Classify::GetAdaptiveFeatures ( TBLOB * Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET * FloatFeatures 
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters
Blob - blob to extract features from
[out] IntFeatures - array to fill with integer features
[out] FloatFeatures - place to return actual floating-pt features
Returns
Number of pico-features returned (0 if an error occurred)

Definition at line 787 of file adaptmatch.cpp.

789  {
790  FEATURE_SET Features;
791  int NumFeatures;
792 
793  classify_norm_method.set_value(baseline);
794  Features = ExtractPicoFeatures(Blob);
795 
796  NumFeatures = Features->NumFeatures;
797  if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
798  FreeFeatureSet(Features);
799  return 0;
800  }
801 
802  ComputeIntFeatures(Features, IntFeatures);
803  *FloatFeatures = Features;
804 
805  return NumFeatures;
806 } /* GetAdaptiveFeatures */
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90
uint16_t NumFeatures
Definition: ocrfeatures.h:67
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:65
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:79
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:64

◆ GetAmbiguities()

UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB * Blob,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters
Blob - blob to get classification ambiguities for
CorrectClass - correct class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns
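Newly allocated array of UNICHAR_IDs ("string") containing all possible ambiguous classes, terminated by -1; nullptr if no training sample could be created for Blob.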

Definition at line 1596 of file adaptmatch.cpp.

1597  {
1598  ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1599  UNICHAR_ID *Ambiguities;
1600  int i;
1601 
1602  Results->Initialize();
1603  INT_FX_RESULT_STRUCT fx_info;
1605  TrainingSample* sample =
1607  &bl_features);
1608  if (sample == nullptr) {
1609  delete Results;
1610  return nullptr;
1611  }
1612 
1613  CharNormClassifier(Blob, *sample, Results);
1614  delete sample;
1615  RemoveBadMatches(Results);
1617 
1618  /* copy the class id's into an string of ambiguities - don't copy if
1619  the correct class is the only class id matched */
1620  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1621  if (Results->match.size() > 1 ||
1622  (Results->match.size() == 1 &&
1623  Results->match[0].unichar_id != CorrectClass)) {
1624  for (i = 0; i < Results->match.size(); i++)
1625  Ambiguities[i] = Results->match[i].unichar_id;
1626  Ambiguities[i] = -1;
1627  } else {
1628  Ambiguities[0] = -1;
1629  }
1630 
1631  delete Results;
1632  return Ambiguities;
1633 } /* GetAmbiguities */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
void Initialize()
Definition: adaptmatch.cpp:103
Definition: cluster.h:32
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
bool classify_nonlinear_norm
Definition: classify.h:457
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
void RemoveBadMatches(ADAPT_RESULTS *Results)

◆ GetCharNormFeature()

int tesseract::Classify::GetCharNormFeature ( const INT_FX_RESULT_STRUCT & fx_info,
INT_TEMPLATES  templates,
uint8_t *  pruner_norm_array,
uint8_t *  char_norm_array 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters
templates - used to compute char norm adjustments
pruner_norm_array - Array of factors from blob normalization process
char_norm_array - array to fill with dummy char norm adjustments
fx_info - supplies the Ymean, Length, Rx and Ry values used to build the char norm feature

Globals:
Returns
Number of features extracted or 0 if an error occurred.

Definition at line 1682 of file adaptmatch.cpp.

1685  {
1686  FEATURE norm_feature = NewFeature(&CharNormDesc);
1687  float baseline = kBlnBaselineOffset;
1688  float scale = MF_SCALE_FACTOR;
1689  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1690  norm_feature->Params[CharNormLength] =
1691  fx_info.Length * scale / LENGTH_COMPRESSION;
1692  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1693  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1694  // Deletes norm_feature.
1695  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1696  pruner_norm_array);
1697  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1698 } /* GetCharNormFeature */
#define MF_SCALE_FACTOR
Definition: mfoutline.h:64
const int kBlnBaselineOffset
Definition: normalis.h:25
float Params[1]
Definition: ocrfeatures.h:62
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
int IntCastRounded(double x)
Definition: helpers.h:168
#define LENGTH_COMPRESSION
Definition: normfeat.h:27
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:81
int32_t Length
Definition: intfx.h:36
const FEATURE_DESC_STRUCT CharNormDesc
const double kStandardFeatureLength
Definition: intfx.h:46
int16_t Ymean
Definition: intfx.h:37

◆ GetClassToDebug()

CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)

This routine prompts the user with Prompt and waits for the user to enter something in the debug window.

Parameters
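Prompt - prompt to print while waiting for input from window
adaptive_on - [out] set when a popup command is handled: true unless the command is IDA_STATIC or IDA_SHAPE_INDEX
pretrained_on - [out] set when a popup command is handled: true unless the command is IDA_ADAPTIVE
shape_id - [out] shape index parsed from the event parameter for IDA_SHAPE_INDEX; -1 for IDA_ADAPTIVE or when no shape table is loaded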
Returns
Character entered in the debug window.
Note
Globals: none

Definition at line 1274 of file intproto.cpp.

1275  {
1276  tprintf("%s\n", Prompt);
1277  SVEvent* ev;
1278  SVEventType ev_type;
1279  int unichar_id = INVALID_UNICHAR_ID;
1280  // Wait until a click or popup event.
1281  do {
1283  ev_type = ev->type;
1284  if (ev_type == SVET_POPUP) {
1285  if (ev->command_id == IDA_SHAPE_INDEX) {
1286  if (shape_table_ != nullptr) {
1287  *shape_id = atoi(ev->parameter);
1288  *adaptive_on = false;
1289  *pretrained_on = true;
1290  if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
1291  int font_id;
1292  shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
1293  &font_id);
1294  tprintf("Shape %d, first unichar=%d, font=%d\n",
1295  *shape_id, unichar_id, font_id);
1296  return unichar_id;
1297  }
1298  tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
1299  } else {
1300  tprintf("No shape table loaded!\n");
1301  }
1302  } else {
1304  unichar_id = unicharset.unichar_to_id(ev->parameter);
1305  if (ev->command_id == IDA_ADAPTIVE) {
1306  *adaptive_on = true;
1307  *pretrained_on = false;
1308  *shape_id = -1;
1309  } else if (ev->command_id == IDA_STATIC) {
1310  *adaptive_on = false;
1311  *pretrained_on = true;
1312  } else {
1313  *adaptive_on = true;
1314  *pretrained_on = true;
1315  }
1316  if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {
1317  *shape_id = -1;
1318  return unichar_id;
1319  }
1320  for (int s = 0; s < shape_table_->NumShapes(); ++s) {
1321  if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
1322  tprintf("%s\n", shape_table_->DebugStr(s).string());
1323  }
1324  }
1325  } else {
1326  tprintf("Char class '%s' not found in unicharset",
1327  ev->parameter);
1328  }
1329  }
1330  }
1331  delete ev;
1332  } while (ev_type != SVET_CLICK);
1333  return 0;
1334 } /* GetClassToDebug */
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
const char * string() const
Definition: strngs.cpp:196
SVEventType
Definition: scrollview.h:45
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445
UNICHARSET unicharset
Definition: ccutil.h:68
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
SVEventType type
Definition: scrollview.h:64
ShapeTable * shape_table_
Definition: classify.h:553
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
int command_id
Definition: scrollview.h:70
char * parameter
Definition: scrollview.h:71
ScrollView * IntMatchWindow
Definition: intproto.cpp:176
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
int NumShapes() const
Definition: shapetable.h:275

◆ getDict()

virtual Dict& tesseract::Classify::getDict ( )
inline virtual

Reimplemented in tesseract::Tesseract.

Definition at line 107 of file classify.h.

107  {
108  return dict_;
109  }

◆ GetFontinfoId()

int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS  Class,
uint8_t  ConfigId 
)

Definition at line 174 of file adaptive.cpp.

174  {
175  return (ConfigIsPermanent(Class, ConfigId) ?
176  PermConfigFor(Class, ConfigId)->FontinfoId :
177  TempConfigFor(Class, ConfigId)->FontinfoId);
178 }
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:104
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:92

◆ InitAdaptedClass()

void tesseract::Classify::InitAdaptedClass ( TBLOB * Blob,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS  Class,
ADAPT_TEMPLATES  Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters
Blob - blob to model new class after
ClassId - id of the class to be initialized
FontinfoId - font information inferred from pre-trained templates
Class - adapted class to be initialized
Templates - adapted templates to add new class to

Globals:

Definition at line 694 of file adaptmatch.cpp.

698  {
699  FEATURE_SET Features;
700  int Fid, Pid;
701  FEATURE Feature;
702  int NumFeatures;
703  TEMP_PROTO TempProto;
704  PROTO Proto;
705  INT_CLASS IClass;
707 
708  classify_norm_method.set_value(baseline);
709  Features = ExtractOutlineFeatures(Blob);
710  NumFeatures = Features->NumFeatures;
711  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
712  FreeFeatureSet(Features);
713  return;
714  }
715 
716  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
717  TempConfigFor(Class, 0) = Config;
718 
719  /* this is a kludge to construct cutoffs for adapted templates */
720  if (Templates == AdaptedTemplates)
721  BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
722 
723  IClass = ClassForClassId (Templates->Templates, ClassId);
724 
725  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
726  Pid = AddIntProto (IClass);
727  assert (Pid != NO_PROTO);
728 
729  Feature = Features->Features[Fid];
730  TempProto = NewTempProto ();
731  Proto = &(TempProto->Proto);
732 
733  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
734  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
735  instead of the -0.25 to 0.75 used in baseline normalization */
736  Proto->Angle = Feature->Params[OutlineFeatDir];
737  Proto->X = Feature->Params[OutlineFeatX];
738  Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
739  Proto->Length = Feature->Params[OutlineFeatLength];
740  FillABC(Proto);
741 
742  TempProto->ProtoId = Pid;
743  SET_BIT (Config->Protos, Pid);
744 
745  ConvertProto(Proto, Pid, IClass);
746  AddProtoToProtoPruner(Proto, Pid, IClass,
748 
749  Class->TempProtos = push (Class->TempProtos, TempProto);
750  }
751  FreeFeatureSet(Features);
752 
753  AddIntConfig(IClass);
754  ConvertConfig (AllProtosOn, 0, IClass);
755 
757  tprintf("Added new class '%s' with class id %d and %d protos.\n",
758  unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
760  DisplayAdaptedChar(Blob, IClass);
761  }
762 
763  if (IsEmptyAdaptedClass(Class))
764  (Templates->NumNonEmptyClasses)++;
765 } /* InitAdaptedClass */
CLUSTERCONFIG Config
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:950
float X
Definition: protos.h:46
BIT_VECTOR AllProtosOn
Definition: classify.h:521
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:85
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
PROTO_STRUCT Proto
Definition: adaptive.h:32
uint16_t ProtoId
Definition: adaptive.h:30
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:283
float Params[1]
Definition: ocrfeatures.h:62
float Y
Definition: protos.h:47
float Length
Definition: protos.h:49
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
FEATURE Features[1]
Definition: ocrfeatures.h:69
int classify_learning_debug_level
Definition: classify.h:460
UNICHARSET unicharset
Definition: ccutil.h:68
uint16_t NumFeatures
Definition: ocrfeatures.h:67
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:229
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:204
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:262
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:42
void FillABC(PROTO Proto)
Definition: protos.cpp:195
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:89
float Angle
Definition: protos.h:48
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:65
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:370
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
#define NO_PROTO
Definition: matchdefs.h:43
#define ClassForClassId(T, c)
Definition: intproto.h:176
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:79
INT_TEMPLATES Templates
Definition: adaptive.h:76
#define SET_BIT(array, bit)
Definition: bitvec.h:57
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:469

◆ InitAdaptiveClassifier()

void tesseract::Classify::InitAdaptiveClassifier ( TessdataManager * mgr)

This routine reads in the training information needed by the adaptive classifier and saves it into global variables.

Parameters:
  • load_pre_trained_templates: indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file.

Globals:
  • BuiltInTemplatesFile: file to get built-in temps from
  • BuiltInCutoffsFile: file to get avg. feat per class from
  • classify_use_pre_adapted_templates: enables use of pre-adapted templates

Definition at line 528 of file adaptmatch.cpp.

528  {
530  return;
531  if (AllProtosOn != nullptr)
532  EndAdaptiveClassifier(); // Don't leak with multiple inits.
533 
534  // If there is no language_data_path_prefix, the classifier will be
535  // adaptive only.
536  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
537  TFile fp;
538  ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
540 
541  if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
542  shape_table_ = new ShapeTable(unicharset);
543  if (!shape_table_->DeSerialize(&fp)) {
544  tprintf("Error loading shape table!\n");
545  delete shape_table_;
546  shape_table_ = nullptr;
547  }
548  }
549 
550  ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
551  ReadNewCutoffs(&fp, CharNormCutoffs);
552 
553  ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
554  NormProtos = ReadNormProtos(&fp);
555  static_classifier_ = new TessClassifier(false, this);
556  }
557 
558  InitIntegerFX();
559 
567 
568  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
569  BaselineCutoffs[i] = 0;
570  }
571 
573  TFile fp;
574  STRING Filename;
575 
576  Filename = imagefile;
577  Filename += ADAPT_TEMPLATE_SUFFIX;
578  if (!fp.Open(Filename.string(), nullptr)) {
580  } else {
581  cprintf("\nReading pre-adapted templates from %s ...\n",
582  Filename.string());
583  fflush(stdout);
585  cprintf("\n");
587 
588  for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
589  BaselineCutoffs[i] = CharNormCutoffs[i];
590  }
591  }
592  } else {
593  if (AdaptedTemplates != nullptr)
596  }
597 } /* InitAdaptiveClassifier */
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:333
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
NORM_PROTOS * NormProtos
Definition: classify.h:527
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
BIT_VECTOR AllProtosOn
Definition: classify.h:521
const char * string() const
Definition: strngs.cpp:196
void InitIntegerFX()
Definition: intfx.cpp:53
STRING language_data_path_prefix
Definition: ccutil.h:67
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
#define zero_all_bits(array, length)
Definition: bitvec.h:33
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:235
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define MAX_NUM_PROTOS
Definition: intproto.h:48
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:82
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
UNICHARSET unicharset
Definition: ccutil.h:68
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
#define set_all_bits(array, length)
Definition: bitvec.h:41
STRING imagefile
Definition: ccutil.h:70
ShapeTable * shape_table_
Definition: classify.h:553
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:245
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460
bool classify_enable_adaptive_matcher
Definition: classify.h:450
Definition: strngs.h:45
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:728
BIT_VECTOR TempProtoMask
Definition: classify.h:524
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:76
void ReadNewCutoffs(TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:46
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
INT_TEMPLATES Templates
Definition: adaptive.h:76
int32_t length() const
Definition: strngs.cpp:191
bool classify_use_pre_adapted_templates
Definition: classify.h:452
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ LargeSpeckle()

bool tesseract::Classify::LargeSpeckle ( const TBLOB & blob)

Definition at line 256 of file classify.cpp.

256  {
257  double speckle_size = kBlnXHeight * speckle_large_max_size;
258  TBOX bbox = blob.bounding_box();
259  return bbox.width() < speckle_size && bbox.height() < speckle_size;
260 }
Definition: rect.h:34
const int kBlnXHeight
Definition: normalis.h:24
int16_t width() const
Definition: rect.h:115
double speckle_large_max_size
Definition: classify.h:542
TBOX bounding_box() const
Definition: blobs.cpp:478
int16_t height() const
Definition: rect.h:108

◆ LearnBlob()

void tesseract::Classify::LearnBlob ( const STRING & fontname,
TBLOB * Blob,
const DENORM & cn_denorm,
const INT_FX_RESULT_STRUCT & fx_info,
const char *  blob_text 
)

Definition at line 74 of file blobclass.cpp.

77  {
79  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
80  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
81  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
82  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
83 
84  if (ValidCharDescription(feature_defs_, CharDesc)) {
85  // Label the features with a class name and font name.
86  tr_file_data_ += "\n";
87  tr_file_data_ += fontname;
88  tr_file_data_ += " ";
89  tr_file_data_ += blob_text;
90  tr_file_data_ += "\n";
91 
92  // write micro-features to file and clean up
93  WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
94  } else {
95  tprintf("Blob learned was invalid!\n");
96  }
97  FreeCharDescription(CharDesc);
98 } // LearnBlob
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:219
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:249
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:42
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:195
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:61
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:548
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:43
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:148
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:174

◆ LearnPieces()

void tesseract::Classify::LearnPieces ( const char *  fontname,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES * word 
)

Definition at line 375 of file adaptmatch.cpp.

377  {
378  // TODO(daria) Remove/modify this if/when we want
379  // to train and/or adapt to n-grams.
380  if (segmentation != CST_WHOLE &&
381  (segmentation != CST_FRAGMENT || disable_character_fragments))
382  return;
383 
384  if (length > 1) {
385  SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
386  start + length - 1);
387  }
388  TBLOB* blob = word->chopped_word->blobs[start];
389  // Rotate the blob if needed for classification.
390  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
391  if (rotated_blob == nullptr)
392  rotated_blob = blob;
393 
394  #ifndef GRAPHICS_DISABLED
395  // Draw debug windows showing the blob that is being learned if needed.
396  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
397  RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
398  word->chopped_word->bounding_box());
399  rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
400  learn_debug_win_->Update();
401  window_wait(learn_debug_win_);
402  }
403  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
404  ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
405  blob->plot(learn_fragments_debug_win_,
407  learn_fragments_debug_win_->Update();
408  }
409  #endif // GRAPHICS_DISABLED
410 
411  if (fontname != nullptr) {
412  classify_norm_method.set_value(character); // force char norm spc 30/11/93
413  tess_bn_matching.set_value(false); // turn it off
414  tess_cn_matching.set_value(false);
415  DENORM bl_denorm, cn_denorm;
416  INT_FX_RESULT_STRUCT fx_info;
418  &bl_denorm, &cn_denorm, &fx_info);
419  LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
420  } else if (unicharset.contains_unichar(correct_text)) {
421  UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
422  int font_id = word->fontinfo != nullptr
423  ? fontinfo_table_.get_id(*word->fontinfo)
424  : 0;
426  tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
427  unicharset.id_to_unichar(class_id), threshold, font_id);
428  // If filename is not nullptr we are doing recognition
429  // (as opposed to training), so we must have already set word fonts.
430  AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
431  if (BackupAdaptedTemplates != nullptr) {
432  // Adapt the backup templates too. They will be used if the primary gets
433  // too full.
434  AdaptToChar(rotated_blob, class_id, font_id, threshold,
436  }
437  } else if (classify_debug_level >= 1) {
438  tprintf("Can't adapt to %s not in unicharset\n", correct_text);
439  }
440  if (rotated_blob != blob) {
441  delete rotated_blob;
442  }
443 
444  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
445  start + length - 1);
446 } // LearnPieces.
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:131
int UNICHAR_ID
Definition: unichar.h:35
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:227
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:520
TBOX bounding_box() const
Definition: blobs.cpp:871
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
const FontInfo * fontinfo
Definition: pageres.h:304
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
static void Update()
Definition: scrollview.cpp:711
bool classify_nonlinear_norm
Definition: classify.h:457
bool classify_debug_character_fragments
Definition: classify.h:496
int classify_learning_debug_level
Definition: classify.h:460
UNICHARSET unicharset
Definition: ccutil.h:68
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:356
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
bool disable_character_fragments
Definition: classify.h:491
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
char * classify_learn_debug_str
Definition: classify.h:500
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:74
char window_wait(ScrollView *win)
Definition: callcpp.cpp:104
Definition: blobs.h:268
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:857
TWERD * chopped_word
Definition: pageres.h:215
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ LearnWord()

void tesseract::Classify::LearnWord ( const char *  fontname,
WERD_RES * word 
)

Definition at line 251 of file adaptmatch.cpp.

251  {
252  int word_len = word->correct_text.size();
253  if (word_len == 0) return;
254 
255  float* thresholds = nullptr;
256  if (fontname == nullptr) {
257  // Adaption mode.
258  if (!EnableLearning || word->best_choice == nullptr)
259  return; // Can't or won't adapt.
260 
262  tprintf("\n\nAdapting to word = %s\n",
263  word->best_choice->debug_string().string());
264  thresholds = new float[word_len];
268  matcher_rating_margin, thresholds);
269  }
270  int start_blob = 0;
271 
272  #ifndef GRAPHICS_DISABLED
274  if (learn_fragmented_word_debug_win_ != nullptr) {
275  window_wait(learn_fragmented_word_debug_win_);
276  }
277  RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
278  word->chopped_word->bounding_box());
279  RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
280  word->chopped_word->bounding_box());
281  word->chopped_word->plot(learn_fragmented_word_debug_win_);
283  }
284  #endif // GRAPHICS_DISABLED
285 
286  for (int ch = 0; ch < word_len; ++ch) {
288  tprintf("\nLearning %s\n", word->correct_text[ch].string());
289  }
290  if (word->correct_text[ch].length() > 0) {
291  float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
292 
293  LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
294  CST_WHOLE, word->correct_text[ch].string(), word);
295 
296  if (word->best_state[ch] > 1 && !disable_character_fragments) {
297  // Check that the character breaks into meaningful fragments
298  // that each match a whole character with at least
299  // classify_character_fragments_garbage_certainty_threshold
300  bool garbage = false;
301  int frag;
302  for (frag = 0; frag < word->best_state[ch]; ++frag) {
303  TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
305  garbage |= LooksLikeGarbage(frag_blob);
306  }
307  }
308  // Learn the fragments.
309  if (!garbage) {
310  bool pieces_all_natural = word->PiecesAllNatural(start_blob,
311  word->best_state[ch]);
312  if (pieces_all_natural || !prioritize_division) {
313  for (frag = 0; frag < word->best_state[ch]; ++frag) {
314  GenericVector<STRING> tokens;
315  word->correct_text[ch].split(' ', &tokens);
316 
317  tokens[0] = CHAR_FRAGMENT::to_string(
318  tokens[0].string(), frag, word->best_state[ch],
319  pieces_all_natural);
320 
321  STRING full_string;
322  for (int i = 0; i < tokens.size(); i++) {
323  full_string += tokens[i];
324  if (i != tokens.size() - 1)
325  full_string += ' ';
326  }
327  LearnPieces(fontname, start_blob + frag, 1, threshold,
328  CST_FRAGMENT, full_string.string(), word);
329  }
330  }
331  }
332  }
333 
334  // TODO(rays): re-enable this part of the code when we switch to the
335  // new classifier that needs to see examples of garbage.
336  /*
337  if (word->best_state[ch] > 1) {
338  // If the next blob is good, make junk with the rightmost fragment.
339  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
340  LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
341  word->best_state[ch + 1] + 1,
342  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
343  }
344  // If the previous blob is good, make junk with the leftmost fragment.
345  if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
346  LearnPieces(fontname, start_blob - word->best_state[ch - 1],
347  word->best_state[ch - 1] + 1,
348  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
349  }
350  }
351  // If the next blob is good, make a join with it.
352  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
353  STRING joined_text = word->correct_text[ch];
354  joined_text += word->correct_text[ch + 1];
355  LearnPieces(fontname, start_blob,
356  word->best_state[ch] + word->best_state[ch + 1],
357  threshold, CST_NGRAM, joined_text.string(), word);
358  }
359  */
360  }
361  start_blob += word->best_state[ch];
362  }
363  delete [] thresholds;
364 } // LearnWord.
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1084
int size() const
Definition: genericvector.h:71
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:227
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:375
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:567
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: blobs.cpp:871
double matcher_rating_margin
Definition: classify.h:465
bool prioritize_division
Definition: classify.h:428
static void Update()
Definition: scrollview.cpp:711
GenericVector< STRING > correct_text
Definition: pageres.h:275
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:494
bool classify_debug_character_fragments
Definition: classify.h:496
int classify_learning_debug_level
Definition: classify.h:460
STRING to_string() const
Definition: unicharset.h:80
double matcher_good_threshold
Definition: classify.h:461
int length() const
Definition: genericvector.h:85
void plot(ScrollView *window)
Definition: blobs.cpp:907
GenericVector< int > best_state
Definition: pageres.h:271
double certainty_scale
Definition: classify.h:478
bool LooksLikeGarbage(TBLOB *blob)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
double matcher_perfect_threshold
Definition: classify.h:463
bool disable_character_fragments
Definition: classify.h:491
Definition: strngs.h:45
const STRING debug_string() const
Definition: ratngs.h:505
char window_wait(ScrollView *win)
Definition: callcpp.cpp:104
Definition: blobs.h:268
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ LooksLikeGarbage()

bool tesseract::Classify::LooksLikeGarbage ( TBLOB * blob)

Definition at line 1637 of file adaptmatch.cpp.

1637  {
1638  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
1639  AdaptiveClassifier(blob, ratings);
1640  BLOB_CHOICE_IT ratings_it(ratings);
1643  print_ratings_list("======================\nLooksLikeGarbage() got ",
1644  ratings, unicharset);
1645  }
1646  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1647  ratings_it.forward()) {
1648  if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1649  continue;
1650  }
1651  float certainty = ratings_it.data()->certainty();
1652  delete ratings;
1653  return certainty <
1655  }
1656  delete ratings;
1657  return true; // no whole characters in ratings
1658 }
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:494
bool classify_debug_character_fragments
Definition: classify.h:496
UNICHARSET unicharset
Definition: ccutil.h:68
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
virtual Dict & getDict()
Definition: classify.h:107

◆ MakeNewTemporaryConfig()

int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters
Templates: adapted templates to add new config to
ClassId: class id to associate with new config
FontinfoId: font information inferred from pre-trained templates
NumFeatures: number of features in Features
Features: features describing model for new config
FloatFeatures: floating-pt representation of features
Returns
The id of the new config created, a negative integer in case of error.

Definition at line 1744 of file adaptmatch.cpp.

1749  {
1750  INT_CLASS IClass;
1751  ADAPT_CLASS Class;
1752  PROTO_ID OldProtos[MAX_NUM_PROTOS];
1753  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1754  int NumOldProtos;
1755  int NumBadFeatures;
1756  int MaxProtoId, OldMaxProtoId;
1757  int BlobLength = 0;
1758  int MaskSize;
1759  int ConfigId;
1761  int i;
1762  int debug_level = NO_DEBUG;
1763 
1765  debug_level =
1767 
1768  IClass = ClassForClassId(Templates->Templates, ClassId);
1769  Class = Templates->Class[ClassId];
1770 
1771  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1772  ++NumAdaptationsFailed;
1774  cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1775  return -1;
1776  }
1777 
1778  OldMaxProtoId = IClass->NumProtos - 1;
1779 
1780  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1781  BlobLength, NumFeatures, Features,
1782  OldProtos, classify_adapt_proto_threshold,
1783  debug_level);
1784 
1785  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1786  zero_all_bits(TempProtoMask, MaskSize);
1787  for (i = 0; i < NumOldProtos; i++)
1788  SET_BIT(TempProtoMask, OldProtos[i]);
1789 
1790  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1791  BlobLength, NumFeatures, Features,
1792  BadFeatures,
1794  debug_level);
1795 
1796  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1797  IClass, Class, TempProtoMask);
1798  if (MaxProtoId == NO_PROTO) {
1799  ++NumAdaptationsFailed;
1801  cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1802  return -1;
1803  }
1804 
1805  ConfigId = AddIntConfig(IClass);
1806  ConvertConfig(TempProtoMask, ConfigId, IClass);
1807  Config = NewTempConfig(MaxProtoId, FontinfoId);
1808  TempConfigFor(Class, ConfigId) = Config;
1809  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1810 
1812  cprintf("Making new temp config %d fontinfo id %d"
1813  " using %d old and %d new protos.\n",
1814  ConfigId, Config->FontinfoId,
1815  NumOldProtos, MaxProtoId - OldMaxProtoId);
1816 
1817  return ConfigId;
1818 } /* MakeNewTemporaryConfig */
CLUSTERCONFIG Config
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
uint8_t FEATURE_ID
Definition: matchdefs.h:48
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
BIT_VECTOR AllProtosOn
Definition: classify.h:521
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uint16_t BlobLength, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:549
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:189
#define zero_all_bits(array, length)
Definition: bitvec.h:33
#define copy_all_bits(source, dest, length)
Definition: bitvec.h:49
int classify_adapt_proto_threshold
Definition: classify.h:486
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
uint8_t NumConfigs
Definition: intproto.h:108
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define MAX_NUM_PROTOS
Definition: intproto.h:48
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
int classify_learning_debug_level
Definition: classify.h:460
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:204
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:262
int16_t PROTO_ID
Definition: matchdefs.h:42
int classify_adapt_feature_threshold
Definition: classify.h:488
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:186
BIT_VECTOR TempProtoMask
Definition: classify.h:524
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uint16_t BlobLength, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:618
uint16_t NumProtos
Definition: intproto.h:106
#define NO_PROTO
Definition: matchdefs.h:43
#define PRINT_PROTO_MATCHES
Definition: intproto.h:190
#define ClassForClassId(T, c)
Definition: intproto.h:176
INT_TEMPLATES Templates
Definition: adaptive.h:76
#define NO_DEBUG
Definition: adaptmatch.cpp:80
#define SET_BIT(array, bit)
Definition: bitvec.h:57
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:469
IntegerMatcher im_
Definition: classify.h:544

◆ MakeNewTempProtos()

PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS  IClass,
ADAPT_CLASS  Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters
Features: floating-pt features describing new character
NumBadFeat: number of bad features to turn into protos
BadFeat: feature id's of bad features
IClass: integer class templates to add new protos to
Class: adapted class templates to add new protos to
TempProtoMask: proto mask to add new protos to

Globals: none

Returns
Max proto id in class after all protos have been added.

Definition at line 1839 of file adaptmatch.cpp.

1844  {
1845  FEATURE_ID *ProtoStart;
1846  FEATURE_ID *ProtoEnd;
1847  FEATURE_ID *LastBad;
1848  TEMP_PROTO TempProto;
1849  PROTO Proto;
1850  FEATURE F1, F2;
1851  float X1, X2, Y1, Y2;
1852  float A1, A2, AngleDelta;
1853  float SegmentLength;
1854  PROTO_ID Pid;
1855 
1856  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1857  ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1858  F1 = Features->Features[*ProtoStart];
1859  X1 = F1->Params[PicoFeatX];
1860  Y1 = F1->Params[PicoFeatY];
1861  A1 = F1->Params[PicoFeatDir];
1862 
1863  for (ProtoEnd = ProtoStart + 1,
1864  SegmentLength = GetPicoFeatureLength();
1865  ProtoEnd < LastBad;
1866  ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1867  F2 = Features->Features[*ProtoEnd];
1868  X2 = F2->Params[PicoFeatX];
1869  Y2 = F2->Params[PicoFeatY];
1870  A2 = F2->Params[PicoFeatDir];
1871 
1872  AngleDelta = fabs(A1 - A2);
1873  if (AngleDelta > 0.5)
1874  AngleDelta = 1.0 - AngleDelta;
1875 
1876  if (AngleDelta > matcher_clustering_max_angle_delta ||
1877  fabs(X1 - X2) > SegmentLength ||
1878  fabs(Y1 - Y2) > SegmentLength)
1879  break;
1880  }
1881 
1882  F2 = Features->Features[*(ProtoEnd - 1)];
1883  X2 = F2->Params[PicoFeatX];
1884  Y2 = F2->Params[PicoFeatY];
1885  A2 = F2->Params[PicoFeatDir];
1886 
1887  Pid = AddIntProto(IClass);
1888  if (Pid == NO_PROTO)
1889  return (NO_PROTO);
1890 
1891  TempProto = NewTempProto();
1892  Proto = &(TempProto->Proto);
1893 
1894  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1895  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1896  instead of the -0.25 to 0.75 used in baseline normalization */
1897  Proto->Length = SegmentLength;
1898  Proto->Angle = A1;
1899  Proto->X = (X1 + X2) / 2.0;
1900  Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1901  FillABC(Proto);
1902 
1903  TempProto->ProtoId = Pid;
1904  SET_BIT(TempProtoMask, Pid);
1905 
1906  ConvertProto(Proto, Pid, IClass);
1907  AddProtoToProtoPruner(Proto, Pid, IClass,
1909 
1910  Class->TempProtos = push(Class->TempProtos, TempProto);
1911  }
1912  return IClass->NumProtos - 1;
1913 } /* MakeNewTempProtos */
float X
Definition: protos.h:46
uint8_t FEATURE_ID
Definition: matchdefs.h:48
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:85
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
PROTO_STRUCT Proto
Definition: adaptive.h:32
uint16_t ProtoId
Definition: adaptive.h:30
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:283
float Params[1]
Definition: ocrfeatures.h:62
float Y
Definition: protos.h:47
#define GetPicoFeatureLength()
Definition: picofeat.h:57
float Length
Definition: protos.h:49
FEATURE Features[1]
Definition: ocrfeatures.h:69
int classify_learning_debug_level
Definition: classify.h:460
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:229
double matcher_clustering_max_angle_delta
Definition: classify.h:473
int16_t PROTO_ID
Definition: matchdefs.h:42
void FillABC(PROTO Proto)
Definition: protos.cpp:195
float Angle
Definition: protos.h:48
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:370
BIT_VECTOR TempProtoMask
Definition: classify.h:524
uint16_t NumProtos
Definition: intproto.h:106
#define NO_PROTO
Definition: matchdefs.h:43
#define SET_BIT(array, bit)
Definition: bitvec.h:57

◆ MakePermanent()

void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  ConfigId,
TBLOB * Blob 
)
Parameters
Templates: current set of adaptive templates
ClassId: class containing config to be made permanent
ConfigId: config to be made permanent
Blob: current blob being adapted to

Globals: none

Definition at line 1925 of file adaptmatch.cpp.

1928  {
1929  UNICHAR_ID *Ambigs;
1931  ADAPT_CLASS Class;
1932  PROTO_KEY ProtoKey;
1933 
1934  Class = Templates->Class[ClassId];
1935  Config = TempConfigFor(Class, ConfigId);
1936 
1937  MakeConfigPermanent(Class, ConfigId);
1938  if (Class->NumPermConfigs == 0)
1939  Templates->NumPermClasses++;
1940  Class->NumPermConfigs++;
1941 
1942  // Initialize permanent config.
1943  Ambigs = GetAmbiguities(Blob, ClassId);
1944  PERM_CONFIG Perm = (PERM_CONFIG)malloc(sizeof(PERM_CONFIG_STRUCT));
1945  Perm->Ambigs = Ambigs;
1946  Perm->FontinfoId = Config->FontinfoId;
1947 
1948  // Free memory associated with temporary config (since ADAPTED_CONFIG
1949  // is a union we need to clean up before we record permanent config).
1950  ProtoKey.Templates = Templates;
1951  ProtoKey.ClassId = ClassId;
1952  ProtoKey.ConfigId = ConfigId;
1953  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1955 
1956  // Record permanent config.
1957  PermConfigFor(Class, ConfigId) = Perm;
1958 
1959  if (classify_learning_debug_level >= 1) {
1960  tprintf("Making config %d for %s (ClassId %d) permanent:"
1961  " fontinfo id %d, ambiguities '",
1962  ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
1963  ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
1964  for (UNICHAR_ID *AmbigsPointer = Ambigs;
1965  *AmbigsPointer >= 0; ++AmbigsPointer)
1966  tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1967  tprintf("'.\n");
1968  }
1969 } /* MakePermanent */
int UNICHAR_ID
Definition: unichar.h:35
CLUSTERCONFIG Config
CLASS_ID ClassId
Definition: adaptmatch.cpp:125
int MakeTempProtoPerm(void *item1, void *item2)
uint8_t NumPermConfigs
Definition: adaptive.h:64
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:75
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:104
int classify_learning_debug_level
Definition: classify.h:460
UNICHARSET unicharset
Definition: ccutil.h:68
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:124
UNICHAR_ID * Ambigs
Definition: adaptive.h:51
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
uint8_t NumPermClasses
Definition: adaptive.h:78
PERM_CONFIG_STRUCT * PERM_CONFIG
Definition: adaptive.h:54
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:95
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
virtual Dict & getDict()
Definition: classify.h:107
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:114

◆ MasterMatcher()

void tesseract::Classify::MasterMatcher ( INT_TEMPLATES  templates,
int16_t  num_features,
const INT_FEATURE_STRUCT * features,
const uint8_t *  norm_factors,
ADAPT_CLASS * classes,
int  debug,
int  matcher_multiplier,
const TBOX & blob_box,
const GenericVector< CP_RESULT_STRUCT > &  results,
ADAPT_RESULTS * final_results 
)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

Definition at line 1092 of file adaptmatch.cpp.

1101  {
1102  int top = blob_box.top();
1103  int bottom = blob_box.bottom();
1104  UnicharRating int_result;
1105  for (int c = 0; c < results.size(); c++) {
1106  CLASS_ID class_id = results[c].Class;
1107  BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
1108  : AllProtosOn;
1109  BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
1110  : AllConfigsOn;
1111 
1112  int_result.unichar_id = class_id;
1113  im_.Match(ClassForClassId(templates, class_id),
1114  protos, configs,
1115  num_features, features,
1116  &int_result, classify_adapt_feature_threshold, debug,
1118  bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1119  ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
1120  results[c].Rating,
1121  final_results->BlobLength,
1122  matcher_multiplier, norm_factors,
1123  &int_result, final_results);
1124  }
1125 }
int32_t BlobLength
Definition: adaptmatch.cpp:93
int size() const
Definition: genericvector.h:71
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool matcher_debug_separate_windows
Definition: classify.h:499
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
int16_t top() const
Definition: rect.h:58
BIT_VECTOR PermConfigs
Definition: adaptive.h:68
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
int classify_adapt_feature_threshold
Definition: classify.h:488
BIT_VECTOR PermProtos
Definition: adaptive.h:67
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
#define ClassForClassId(T, c)
Definition: intproto.h:176
int16_t bottom() const
Definition: rect.h:65
IntegerMatcher im_
Definition: classify.h:544

◆ NewAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool  InitFromUnicharset)

Allocates memory for adapted templates.

Parameters
InitFromUnicharset: if true, add an empty class for each char in unicharset to the newly created templates
Returns
Ptr to new adapted templates.
Note
Globals: none

Definition at line 152 of file adaptive.cpp.

152  {
153  ADAPT_TEMPLATES Templates;
154 
155  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
156 
157  Templates->Templates = NewIntTemplates ();
158  Templates->NumPermClasses = 0;
159  Templates->NumNonEmptyClasses = 0;
160 
161  /* Insert an empty class for each unichar id in unicharset */
162  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
163  Templates->Class[i] = nullptr;
164  if (InitFromUnicharset && i < unicharset.size()) {
165  AddAdaptedClass(Templates, NewAdaptedClass(), i);
166  }
167  }
168 
169  return (Templates);
170 
171 } /* NewAdaptedTemplates */
void AddAdaptedClass(ADAPT_TEMPLATES Templates, ADAPT_CLASS Class, CLASS_ID ClassId)
Definition: adaptive.cpp:46
void * Emalloc(int Size)
Definition: emalloc.cpp:31
ADAPT_TEMPLATES_STRUCT * ADAPT_TEMPLATES
Definition: adaptive.h:82
int size() const
Definition: unicharset.h:336
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
UNICHARSET unicharset
Definition: ccutil.h:68
uint8_t NumPermClasses
Definition: adaptive.h:78
ADAPT_CLASS NewAdaptedClass()
Definition: adaptive.cpp:103
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:692
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
INT_TEMPLATES Templates
Definition: adaptive.h:76

◆ NormalizeOutlines()

void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
float *  XScale,
float *  YScale 
)

This routine normalizes every outline in Outlines according to the currently selected normalization method. It also returns the scale factors that it used to do this scaling. The scale factors returned represent the x and y sizes in the normalized coordinate system that correspond to 1 pixel in the original coordinate system.

Globals:

  • classify_norm_method method being used for normalization
  • classify_char_norm_range map radius of gyration to this value

Parameters
Outlines: list of outlines to be normalized
XScale: x-direction scale factor used by routine
YScale: y-direction scale factor used by routine
Returns
none (Outlines are changed and XScale and YScale are updated)

Definition at line 285 of file mfoutline.cpp.

287  {
288  MFOUTLINE Outline;
289 
290  switch (classify_norm_method) {
291  case character:
292  ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
293  break;
294 
295  case baseline:
296  iterate(Outlines) {
297  Outline = (MFOUTLINE) first_node(Outlines);
298  NormalizeOutline(Outline, 0.0);
299  }
300  *XScale = *YScale = MF_SCALE_FACTOR;
301  break;
302  }
303 } /* NormalizeOutlines */
void NormalizeOutline(MFOUTLINE Outline, float XOrigin)
Definition: mfoutline.cpp:251
#define MF_SCALE_FACTOR
Definition: mfoutline.h:64
#define first_node(l)
Definition: oldlist.h:141
#define iterate(l)
Definition: oldlist.h:161
LIST MFOUTLINE
Definition: mfoutline.h:34
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PrintAdaptedTemplates()

void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters
File: open text file to print Templates to
Templates: adapted templates to print to File
Note
Globals: none

Definition at line 245 of file adaptive.cpp.

245  {
246  INT_CLASS IClass;
247  ADAPT_CLASS AClass;
248 
249  fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
250  fprintf (File, "Num classes = %d; Num permanent classes = %d\n\n",
251  Templates->NumNonEmptyClasses, Templates->NumPermClasses);
252  fprintf (File, " Id NC NPC NP NPP\n");
253  fprintf (File, "------------------------\n");
254 
255  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
256  IClass = Templates->Templates->Class[i];
257  AClass = Templates->Class[i];
258  if (!IsEmptyAdaptedClass (AClass)) {
259  fprintf (File, "%5d %s %3d %3d %3d %3d\n",
261  IClass->NumConfigs, AClass->NumPermConfigs,
262  IClass->NumProtos,
263  IClass->NumProtos - count (AClass->TempProtos));
264  }
265  }
266  fprintf (File, "\n");
267 
268 } /* PrintAdaptedTemplates */
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
int count(LIST var_list)
Definition: oldlist.cpp:98
uint8_t NumPermConfigs
Definition: adaptive.h:64
uint8_t NumConfigs
Definition: intproto.h:108
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
UNICHARSET unicharset
Definition: ccutil.h:68
uint8_t NumPermClasses
Definition: adaptive.h:78
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:89
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
uint16_t NumProtos
Definition: intproto.h:106
INT_TEMPLATES Templates
Definition: adaptive.h:76

◆ PrintAdaptiveMatchResults()

void tesseract::Classify::PrintAdaptiveMatchResults ( const ADAPT_RESULTS & results)

This routine prints the matches in results using tprintf.

Parameters
results: match results to print

Globals: none

Definition at line 2018 of file adaptmatch.cpp.

2018  {
2019  for (int i = 0; i < results.match.size(); ++i) {
2020  tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2021  results.match[i].Print();
2022  }
2023 } /* PrintAdaptiveMatchResults */
int size() const
Definition: genericvector.h:71
const char * string() const
Definition: strngs.cpp:196
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
UNICHARSET unicharset
Definition: ccutil.h:68
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ PruneClasses()

int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT * int_templates,
int  num_features,
int  keep_this,
const INT_FEATURE_STRUCT * features,
const uint8_t *  normalization_factors,
const uint16_t *  expected_num_features,
GenericVector< CP_RESULT_STRUCT > *  results 
)

Runs the class pruner from int_templates on the given features, returning the number of classes output in results.

Parameters
int_templates: Class pruner tables
num_features: Number of features in blob
features: Array of features
normalization_factors: Array of fudge factors from blob normalization process (by CLASS_INDEX)
expected_num_features: Array of expected number of features for each class (by CLASS_INDEX)
results: Sorted Array of pruned classes. Must be an array of size at least int_templates->NumClasses.
keep_this

Definition at line 409 of file intmatcher.cpp.

414  {
415  ClassPruner pruner(int_templates->NumClasses);
416  // Compute initial match scores for all classes.
417  pruner.ComputeScores(int_templates, num_features, features);
418  // Adjust match scores for number of expected features.
419  pruner.AdjustForExpectedNumFeatures(expected_num_features,
421  // Apply disabled classes in unicharset - only works without a shape_table.
422  if (shape_table_ == nullptr)
423  pruner.DisableDisabledClasses(unicharset);
424  // If fragments are disabled, remove them, also only without a shape table.
425  if (disable_character_fragments && shape_table_ == nullptr)
426  pruner.DisableFragments(unicharset);
427 
428  // If we have good x-heights, apply the given normalization factors.
429  if (normalization_factors != nullptr) {
430  pruner.NormalizeForXheight(classify_class_pruner_multiplier,
431  normalization_factors);
432  } else {
433  pruner.NoNormalization();
434  }
435  // Do the actual pruning and sort the short-list.
436  pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
437  shape_table_ == nullptr, unicharset);
438 
439  if (classify_debug_level > 2) {
440  pruner.DebugMatch(*this, int_templates, features);
441  }
442  if (classify_debug_level > 1) {
443  pruner.SummarizeResult(*this, int_templates, expected_num_features,
445  normalization_factors);
446  }
447  // Convert to the expected output format.
448  return pruner.SetupResults(results);
449 }
int classify_class_pruner_multiplier
Definition: classify.h:506
int classify_class_pruner_threshold
Definition: classify.h:504
UNICHARSET unicharset
Definition: ccutil.h:68
ShapeTable * shape_table_
Definition: classify.h:553
bool disable_character_fragments
Definition: classify.h:491
int classify_cp_cutoff_strength
Definition: classify.h:508

◆ ReadAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( TFile * fp)

Read a set of adapted templates from file and return a ptr to the templates.

Parameters
fp: open text file to read adapted templates from
Returns
Ptr to adapted templates read from file.
Note
Globals: none

Definition at line 333 of file adaptive.cpp.

333  {
334  ADAPT_TEMPLATES Templates;
335 
336  /* first read the high level adaptive template struct */
337  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
338  fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
339 
340  /* then read in the basic integer templates */
341  Templates->Templates = ReadIntTemplates(fp);
342 
343  /* then read in the adaptive info for each class */
344  for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
345  Templates->Class[i] = ReadAdaptedClass(fp);
346  }
347  return (Templates);
348 
349 } /* ReadAdaptedTemplates */
int FRead(void *buffer, size_t size, int count)
Definition: serialis.cpp:270
void * Emalloc(int Size)
Definition: emalloc.cpp:31
ADAPT_TEMPLATES_STRUCT * ADAPT_TEMPLATES
Definition: adaptive.h:82
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:728
ADAPT_CLASS ReadAdaptedClass(TFile *fp)
Definition: adaptive.cpp:282
INT_TEMPLATES Templates
Definition: adaptive.h:76

◆ ReadIntTemplates()

INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( TFile * fp)

This routine reads a set of integer templates from the file fp. The file must already be open and must be in the correct binary format.

Parameters
fp: open file to read templates from
Returns
Pointer to integer templates read from File.
Note
Globals: none

Definition at line 728 of file intproto.cpp.

728  {
729  int i, j, w, x, y, z;
730  int unicharset_size;
731  int version_id = 0;
732  INT_TEMPLATES Templates;
733  CLASS_PRUNER_STRUCT* Pruner;
734  INT_CLASS Class;
735  uint8_t *Lengths;
736  PROTO_SET ProtoSet;
737 
738  /* variables for conversion from older inttemp formats */
739  int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
740  CLASS_ID class_id, max_class_id;
741  int16_t *IndexFor = new int16_t[MAX_NUM_CLASSES];
742  CLASS_ID *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
743  CLASS_PRUNER_STRUCT **TempClassPruner =
745  uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
746  (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
747  uint32_t Mask, NewMask, ClassBits;
748  int MaxNumConfigs = MAX_NUM_CONFIGS;
749  int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
750 
751  /* first read the high level template struct */
752  Templates = NewIntTemplates();
753  // Read Templates in parts for 64 bit compatibility.
754  if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1)
755  tprintf("Bad read of inttemp!\n");
756  if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
757  1) != 1 ||
758  fp->FReadEndian(&Templates->NumClassPruners,
759  sizeof(Templates->NumClassPruners), 1) != 1)
760  tprintf("Bad read of inttemp!\n");
761  if (Templates->NumClasses < 0) {
762  // This file has a version id!
763  version_id = -Templates->NumClasses;
764  if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
765  1) != 1)
766  tprintf("Bad read of inttemp!\n");
767  }
768 
769  if (version_id < 3) {
770  MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
771  WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
772  }
773 
774  if (version_id < 2) {
775  if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size) !=
776  unicharset_size) {
777  tprintf("Bad read of inttemp!\n");
778  }
779  if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
780  Templates->NumClasses) != Templates->NumClasses) {
781  tprintf("Bad read of inttemp!\n");
782  }
783  }
784 
785  /* then read in the class pruners */
786  const int kNumBuckets =
788  for (i = 0; i < Templates->NumClassPruners; i++) {
789  Pruner = new CLASS_PRUNER_STRUCT;
790  if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) !=
791  kNumBuckets) {
792  tprintf("Bad read of inttemp!\n");
793  }
794  if (version_id < 2) {
795  TempClassPruner[i] = Pruner;
796  } else {
797  Templates->ClassPruners[i] = Pruner;
798  }
799  }
800 
801  /* fix class pruners if they came from an old version of inttemp */
802  if (version_id < 2) {
803  // Allocate enough class pruners to cover all the class ids.
804  max_class_id = 0;
805  for (i = 0; i < Templates->NumClasses; i++)
806  if (ClassIdFor[i] > max_class_id)
807  max_class_id = ClassIdFor[i];
808  for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
809  Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
810  memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
811  }
812  // Convert class pruners from the old format (indexed by class index)
813  // to the new format (indexed by class id).
814  last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
815  for (i = 0; i < Templates->NumClassPruners; i++) {
816  for (x = 0; x < NUM_CP_BUCKETS; x++)
817  for (y = 0; y < NUM_CP_BUCKETS; y++)
818  for (z = 0; z < NUM_CP_BUCKETS; z++)
819  for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
820  if (TempClassPruner[i]->p[x][y][z][w] == 0)
821  continue;
822  for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
823  bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
824  if (bit_number > last_cp_bit_number)
825  break; // the rest of the bits in this word are not used
826  class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
827  // Single out NUM_BITS_PER_CLASS bits relating to class_id.
828  Mask = SetBitsForMask << b;
829  ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
830  // Move these bits to the new position in which they should
831  // appear (indexed corresponding to the class_id).
832  new_i = CPrunerIdFor(class_id);
833  new_w = CPrunerWordIndexFor(class_id);
834  new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
835  if (new_b > b) {
836  ClassBits <<= (new_b - b);
837  } else {
838  ClassBits >>= (b - new_b);
839  }
840  // Copy bits relating to class_id to the correct position
841  // in Templates->ClassPruner.
842  NewMask = SetBitsForMask << new_b;
843  Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
844  Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
845  }
846  }
847  }
848  for (i = 0; i < Templates->NumClassPruners; i++) {
849  delete TempClassPruner[i];
850  }
851  }
852 
853  /* then read in each class */
854  for (i = 0; i < Templates->NumClasses; i++) {
855  /* first read in the high level struct for the class */
856  Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT));
857  if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
858  fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
859  fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
860  tprintf("Bad read of inttemp!\n");
861  if (version_id == 0) {
862  // Only version 0 writes 5 pointless pointers to the file.
863  for (j = 0; j < 5; ++j) {
864  int32_t junk;
865  if (fp->FRead(&junk, sizeof(junk), 1) != 1)
866  tprintf("Bad read of inttemp!\n");
867  }
868  }
869  int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
870  ASSERT_HOST(num_configs <= MaxNumConfigs);
871  if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) !=
872  num_configs) {
873  tprintf("Bad read of inttemp!\n");
874  }
875  if (version_id < 2) {
876  ClassForClassId (Templates, ClassIdFor[i]) = Class;
877  } else {
878  ClassForClassId (Templates, i) = Class;
879  }
880 
881  /* then read in the proto lengths */
882  Lengths = nullptr;
883  if (MaxNumIntProtosIn (Class) > 0) {
884  Lengths = (uint8_t *)Emalloc(sizeof(uint8_t) * MaxNumIntProtosIn(Class));
885  if (fp->FRead(Lengths, sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=
886  MaxNumIntProtosIn(Class))
887  tprintf("Bad read of inttemp!\n");
888  }
889  Class->ProtoLengths = Lengths;
890 
891  /* then read in the proto sets */
892  for (j = 0; j < Class->NumProtoSets; j++) {
893  ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT));
894  int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
895  if (fp->FReadEndian(&ProtoSet->ProtoPruner,
896  sizeof(ProtoSet->ProtoPruner[0][0][0]),
897  num_buckets) != num_buckets)
898  tprintf("Bad read of inttemp!\n");
899  for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
900  if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
901  1) != 1 ||
902  fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
903  1) != 1 ||
904  fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
905  1) != 1 ||
906  fp->FRead(&ProtoSet->Protos[x].Angle,
907  sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
908  tprintf("Bad read of inttemp!\n");
909  if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
910  sizeof(ProtoSet->Protos[x].Configs[0]),
911  WerdsPerConfigVec) != WerdsPerConfigVec)
912  cprintf("Bad read of inttemp!\n");
913  }
914  Class->ProtoSets[j] = ProtoSet;
915  }
916  if (version_id < 4) {
917  Class->font_set_id = -1;
918  } else {
919  fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);
920  }
921  }
922 
923  if (version_id < 2) {
924  /* add an empty nullptr class with class id 0 */
925  assert(UnusedClassIdIn (Templates, 0));
926  ClassForClassId (Templates, 0) = NewIntClass (1, 1);
927  ClassForClassId (Templates, 0)->font_set_id = -1;
928  Templates->NumClasses++;
929  /* make sure the classes are contiguous */
930  for (i = 0; i < MAX_NUM_CLASSES; i++) {
931  if (i < Templates->NumClasses) {
932  if (ClassForClassId (Templates, i) == nullptr) {
933  fprintf(stderr, "Non-contiguous class ids in inttemp\n");
934  exit(1);
935  }
936  } else {
937  if (ClassForClassId (Templates, i) != nullptr) {
938  fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
939  i, Templates->NumClasses);
940  exit(1);
941  }
942  }
943  }
944  }
945  if (version_id >= 4) {
947  if (version_id >= 5) {
948  this->fontinfo_table_.read(fp,
950  }
952  }
953 
954  // Clean up.
955  delete[] IndexFor;
956  delete[] ClassIdFor;
957  delete[] TempClassPruner;
958 
959  return (Templates);
960 } /* ReadIntTemplates */
#define BITS_PER_CP_VECTOR
Definition: intproto.h:59
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
#define NUM_PP_PARAMS
Definition: intproto.h:51
uint32_t Configs[WERDS_PER_CONFIG_VEC]
Definition: intproto.h:86
struct INT_CLASS_STRUCT * INT_CLASS
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
void * Emalloc(int Size)
Definition: emalloc.cpp:31
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
#define CPrunerIdFor(c)
Definition: intproto.h:178
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
#define UnusedClassIdIn(T, c)
Definition: intproto.h:175
uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:78
uint8_t NumProtoSets
Definition: intproto.h:107
#define CPrunerWordIndexFor(c)
Definition: intproto.h:180
uint8_t * ProtoLengths
Definition: intproto.h:110
PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS]
Definition: intproto.h:109
uint8_t NumConfigs
Definition: intproto.h:108
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define WERDS_PER_PP_VECTOR
Definition: intproto.h:63
#define BITS_PER_WERD
Definition: intproto.h:45
#define MaxNumIntProtosIn(C)
Definition: intproto.h:163
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:169
INT_PROTO_STRUCT Protos[PROTOS_PER_PROTO_SET]
Definition: intproto.h:97
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
struct PROTO_SET_STRUCT * PROTO_SET
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define PROTOS_PER_PROTO_SET
Definition: intproto.h:49
#define NUM_CP_BUCKETS
Definition: intproto.h:53
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:152
PROTO_PRUNER ProtoPruner
Definition: intproto.h:96
#define CPrunerBitIndexFor(c)
Definition: intproto.h:181
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
#define OLD_WERDS_PER_CONFIG_VEC
Definition: intproto.cpp:110
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:692
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62
uint16_t NumProtos
Definition: intproto.h:106
bool read_set(TFile *f, FontSet *fs)
Definition: fontinfo.cpp:225
#define NUM_PP_BUCKETS
Definition: intproto.h:52
uint8_t Angle
Definition: intproto.h:85
#define ClassForClassId(T, c)
Definition: intproto.h:176
INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs)
Definition: intproto.cpp:636
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
#define MAX_NUM_CLASS_PRUNERS
Definition: intproto.h:60
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
#define OLD_MAX_NUM_CONFIGS
Definition: intproto.cpp:109
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define WERDS_PER_CONFIG_VEC
Definition: intproto.h:68

◆ ReadNewCutoffs()

void tesseract::Classify::ReadNewCutoffs ( TFile * fp,
CLASS_CUTOFF_ARRAY  Cutoffs 
)

Open file, read in all of the class-id/cutoff pairs and insert them into the Cutoffs array. Cutoffs are indexed in the array by class id. Unused entries in the array are set to an arbitrarily high cutoff value.

Parameters
fp: file containing cutoff definitions
Cutoffs: array to put cutoffs into
Returns
none
Note
Globals: none

Definition at line 46 of file cutoffs.cpp.

46  {
47  char Class[UNICHAR_LEN + 1];
48  CLASS_ID ClassId;
49  int Cutoff;
50 
51  if (shape_table_ != nullptr) {
52  if (!shapetable_cutoffs_.DeSerialize(fp)) {
53  tprintf("Error during read of shapetable pffmtable!\n");
54  }
55  }
56  for (int i = 0; i < MAX_NUM_CLASSES; i++)
57  Cutoffs[i] = MAX_CUTOFF;
58 
59  const int kMaxLineSize = 100;
60  char line[kMaxLineSize];
61  while (fp->FGets(line, kMaxLineSize) != nullptr &&
62  sscanf(line, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", Class,
63  &Cutoff) == 2) {
64  if (strcmp(Class, "NULL") == 0) {
65  ClassId = unicharset.unichar_to_id(" ");
66  } else {
67  ClassId = unicharset.unichar_to_id(Class);
68  }
69  Cutoffs[ClassId] = Cutoff;
70  }
71 }
#define REALLY_QUOTE_IT(x)
Definition: cutoffs.cpp:31
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool DeSerialize(bool swap, FILE *fp)
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
#define UNICHAR_LEN
Definition: unichar.h:31
#define MAX_CUTOFF
Definition: cutoffs.cpp:33
UNICHARSET unicharset
Definition: ccutil.h:68
ShapeTable * shape_table_
Definition: classify.h:553
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32

◆ ReadNormProtos()

NORM_PROTOS * tesseract::Classify::ReadNormProtos ( TFile * fp)

This routine allocates a new data structure to hold a set of character normalization protos. It then fills in the data structure by reading from the specified File.

Parameters
fp: open text file to read normalization protos from. Globals: none
Returns
Character normalization protos.

Definition at line 235 of file normmatch.cpp.

235  {
237  int i;
238  char unichar[2 * UNICHAR_LEN + 1];
239  UNICHAR_ID unichar_id;
240  LIST Protos;
241  int NumProtos;
242 
243  /* allocate and initialization data structure */
244  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
246  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
247  for (i = 0; i < NormProtos->NumProtos; i++)
248  NormProtos->Protos[i] = NIL_LIST;
249 
250  /* read file header and save in data structure */
253 
254  /* read protos for each class into a separate list */
255  const int kMaxLineSize = 100;
256  char line[kMaxLineSize];
257  while (fp->FGets(line, kMaxLineSize) != nullptr) {
258  if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
259  if (unicharset.contains_unichar(unichar)) {
260  unichar_id = unicharset.unichar_to_id(unichar);
261  Protos = NormProtos->Protos[unichar_id];
262  for (i = 0; i < NumProtos; i++)
263  Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
264  NormProtos->Protos[unichar_id] = Protos;
265  } else {
266  tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
267  unichar);
268  for (i = 0; i < NumProtos; i++)
270  }
271  }
272  return (NormProtos);
273 } /* ReadNormProtos */
int UNICHAR_ID
Definition: unichar.h:35
NORM_PROTOS * NormProtos
Definition: classify.h:527
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:41
void * Emalloc(int Size)
Definition: emalloc.cpp:31
LIST * Protos
Definition: normmatch.cpp:39
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
#define UNICHAR_LEN
Definition: unichar.h:31
int size() const
Definition: unicharset.h:336
void FreePrototype(void *arg)
Definition: cluster.cpp:575
UNICHARSET unicharset
Definition: ccutil.h:68
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:99
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:297
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:38
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define NIL_LIST
Definition: oldlist.h:127
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:61

◆ RefreshDebugWindow()

void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX wbox 
)

Definition at line 227 of file adaptmatch.cpp.

228  {
229  #ifndef GRAPHICS_DISABLED
230  const int kSampleSpaceWidth = 500;
231  if (*win == nullptr) {
232  *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
233  kSampleSpaceWidth * 2, 200, true);
234  }
235  (*win)->Clear();
236  (*win)->Pen(64, 64, 64);
237  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
238  kSampleSpaceWidth, kBlnBaselineOffset);
239  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
240  kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
241  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
242  wbox.right(), wbox.bottom());
243  #endif // GRAPHICS_DISABLED
244 }
const int kBlnXHeight
Definition: normalis.h:24
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ RemoveBadMatches()

void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS * Results)

This routine steps through each matching class in Results and removes it from the match list if its rating is worse than the best rating by more than a pad, i.e. lower than best_rating minus matcher_bad_match_pad. In other words, all good matches get moved to the front of the match array and the remainder is truncated.

Parameters
Results: contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"

Definition at line 2038 of file adaptmatch.cpp.

2038  {
2039  int Next, NextGood;
2040  float BadMatchThreshold;
2041  static const char* romans = "i v x I V X";
2042  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2043 
2045  UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2046  unicharset.unichar_to_id("1") : -1;
2047  UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2048  unicharset.unichar_to_id("0") : -1;
2049  float scored_one = ScoredUnichar(unichar_id_one, *Results);
2050  float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2051 
2052  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2053  const UnicharRating& match = Results->match[Next];
2054  if (match.rating >= BadMatchThreshold) {
2055  if (!unicharset.get_isalpha(match.unichar_id) ||
2056  strstr(romans,
2057  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2058  } else if (unicharset.eq(match.unichar_id, "l") &&
2059  scored_one < BadMatchThreshold) {
2060  Results->match[Next].unichar_id = unichar_id_one;
2061  } else if (unicharset.eq(match.unichar_id, "O") &&
2062  scored_zero < BadMatchThreshold) {
2063  Results->match[Next].unichar_id = unichar_id_zero;
2064  } else {
2065  Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2066  }
2067  if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2068  if (NextGood == Next) {
2069  ++NextGood;
2070  } else {
2071  Results->match[NextGood++] = Results->match[Next];
2072  }
2073  }
2074  }
2075  }
2076  } else {
2077  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2078  if (Results->match[Next].rating >= BadMatchThreshold) {
2079  if (NextGood == Next) {
2080  ++NextGood;
2081  } else {
2082  Results->match[NextGood++] = Results->match[Next];
2083  }
2084  }
2085  }
2086  }
2087  Results->match.truncate(NextGood);
2088 } /* RemoveBadMatches */
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
bool classify_bln_numeric_mode
Definition: classify.h:541
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
float best_rating
Definition: adaptmatch.cpp:97
UNICHARSET unicharset
Definition: ccutil.h:68
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
double matcher_bad_match_pad
Definition: classify.h:464
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
void truncate(int size)

◆ RemoveExtraPuncs()

void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS * Results)

This routine discards extra digits or punctuation from the results. We keep only the top 2 punctuation answers and the top 1 digit answer if present.

Parameters
Results: contains matches to be filtered

Definition at line 2098 of file adaptmatch.cpp.

2098  {
2099  int Next, NextGood;
2100  int punc_count; /*no of garbage characters */
2101  int digit_count;
2102  /*garbage characters */
2103  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2104  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2105 
2106  punc_count = 0;
2107  digit_count = 0;
2108  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2109  const UnicharRating& match = Results->match[Next];
2110  bool keep = true;
2111  if (strstr(punc_chars,
2112  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2113  if (punc_count >= 2)
2114  keep = false;
2115  punc_count++;
2116  } else {
2117  if (strstr(digit_chars,
2118  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2119  if (digit_count >= 1)
2120  keep = false;
2121  digit_count++;
2122  }
2123  }
2124  if (keep) {
2125  if (NextGood == Next) {
2126  ++NextGood;
2127  } else {
2128  Results->match[NextGood++] = match;
2129  }
2130  }
2131  }
2132  Results->match.truncate(NextGood);
2133 } /* RemoveExtraPuncs */
int size() const
Definition: genericvector.h:71
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
UNICHARSET unicharset
Definition: ccutil.h:68
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
void truncate(int size)

◆ ResetAdaptiveClassifierInternal()

void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 599 of file adaptmatch.cpp.

599  {
601  tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
602  NumAdaptationsFailed);
603  }
606  if (BackupAdaptedTemplates != nullptr)
608  BackupAdaptedTemplates = nullptr;
609  NumAdaptationsFailed = 0;
610 }
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
int classify_learning_debug_level
Definition: classify.h:460
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ SetAdaptiveThreshold()

void tesseract::Classify::SetAdaptiveThreshold ( float  Threshold)

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters
Threshold: threshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating

Definition at line 2146 of file adaptmatch.cpp.

2146  {
2147  Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2149  ClipToRange<int>(255 * Threshold, 0, 255));
2151  ClipToRange<int>(255 * Threshold, 0, 255));
2152 } /* SetAdaptiveThreshold */
int classify_adapt_proto_threshold
Definition: classify.h:486
double matcher_good_threshold
Definition: classify.h:461
int classify_adapt_feature_threshold
Definition: classify.h:488

◆ SetStaticClassifier()

void tesseract::Classify::SetStaticClassifier ( ShapeClassifier static_classifier)

Definition at line 225 of file classify.cpp.

225  {
226  delete static_classifier_;
227  static_classifier_ = static_classifier;
228 }

◆ SettupPass1()

void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note
this is somewhat redundant, it simply says that if learning is enabled then it will remain enabled on the first pass. If it is disabled, then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Definition at line 653 of file adaptmatch.cpp.

653  {
655 
657 
658 } /* SettupPass1 */
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:360
virtual Dict & getDict()
Definition: classify.h:107
bool classify_enable_learning
Definition: classify.h:430

◆ SettupPass2()

void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Definition at line 670 of file adaptmatch.cpp.

670  {
673 
674 } /* SettupPass2 */
#define FALSE
Definition: capi.h:52
virtual Dict & getDict()
Definition: classify.h:107
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:364

◆ SetupBLCNDenorms()

void tesseract::Classify::SetupBLCNDenorms ( const TBLOB blob,
bool  nonlinear_norm,
DENORM bl_denorm,
DENORM cn_denorm,
INT_FX_RESULT_STRUCT fx_info 
)
static

Definition at line 131 of file intfx.cpp.

133  {
134  // Compute 1st and 2nd moments of the original outline.
135  FCOORD center, second_moments;
136  int length = blob.ComputeMoments(&center, &second_moments);
137  if (fx_info != nullptr) {
138  fx_info->Length = length;
139  fx_info->Rx = IntCastRounded(second_moments.y());
140  fx_info->Ry = IntCastRounded(second_moments.x());
141 
142  fx_info->Xmean = IntCastRounded(center.x());
143  fx_info->Ymean = IntCastRounded(center.y());
144  }
145  // Setup the denorm for Baseline normalization.
146  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f,
147  1.0f, 1.0f, 128.0f, 128.0f);
148  // Setup the denorm for character normalization.
149  if (nonlinear_norm) {
152  TBOX box;
153  blob.GetPreciseBoundingBox(&box);
154  box.pad(1, 1);
155  blob.GetEdgeCoords(box, &x_coords, &y_coords);
156  cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX,
157  0.0f, 0.0f, x_coords, y_coords);
158  } else {
159  cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(),
160  center.x(), center.y(),
161  51.2f / second_moments.x(),
162  51.2f / second_moments.y(),
163  128.0f, 128.0f);
164  }
165 }
int ComputeMoments(FCOORD *center, FCOORD *second_moments) const
Definition: blobs.cpp:532
void SetupNormalization(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift)
Definition: normalis.cpp:96
void SetupNonLinear(const DENORM *predecessor, const TBOX &box, float target_width, float target_height, float final_xshift, float final_yshift, const GenericVector< GenericVector< int > > &x_coords, const GenericVector< GenericVector< int > > &y_coords)
Definition: normalis.cpp:268
Definition: rect.h:34
int16_t Xmean
Definition: intfx.h:37
int IntCastRounded(double x)
Definition: helpers.h:168
void GetPreciseBoundingBox(TBOX *precise_box) const
Definition: blobs.cpp:551
int32_t Length
Definition: intfx.h:36
const DENORM & denorm() const
Definition: blobs.h:347
Definition: points.h:189
float x() const
Definition: points.h:208
int16_t Ymean
Definition: intfx.h:37
void GetEdgeCoords(const TBOX &box, GenericVector< GenericVector< int > > *x_coords, GenericVector< GenericVector< int > > *y_coords) const
Definition: blobs.cpp:567
void pad(int xpad, int ypad)
Definition: rect.h:131
float y() const
Definition: points.h:211

◆ shape_table()

const ShapeTable* tesseract::Classify::shape_table ( ) const
inline

Definition at line 111 of file classify.h.

111  {
112  return shape_table_;
113  }
ShapeTable * shape_table_
Definition: classify.h:553

◆ ShapeIDToClassID()

int tesseract::Classify::ShapeIDToClassID ( int  shape_id) const

Definition at line 2225 of file adaptmatch.cpp.

2225  {
2226  for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2227  int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2228  ASSERT_HOST(font_set_id >= 0);
2229  const FontSet &fs = fontset_table_.get(font_set_id);
2230  for (int config = 0; config < fs.size; ++config) {
2231  if (fs.configs[config] == shape_id)
2232  return id;
2233  }
2234  }
2235  tprintf("Shape %d not found\n", shape_id);
2236  return -1;
2237 }
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ShowBestMatchFor()

void tesseract::Classify::ShowBestMatchFor ( int  shape_id,
const INT_FEATURE_STRUCT features,
int  num_features 
)

This routine displays debug information for the best config of the given shape_id for the given set of features.

Parameters
shape_id: classifier id to work with
features: features of the unknown character
num_features: number of features in the features array

Definition at line 2164 of file adaptmatch.cpp.

2166  {
2167 #ifndef GRAPHICS_DISABLED
2168  uint32_t config_mask;
2169  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2170  tprintf("No built-in templates for class/shape %d\n", shape_id);
2171  return;
2172  }
2173  if (num_features <= 0) {
2174  tprintf("Illegal blob (char norm features)!\n");
2175  return;
2176  }
2177  UnicharRating cn_result;
2178  classify_norm_method.set_value(character);
2181  num_features, features, &cn_result,
2184  tprintf("\n");
2185  config_mask = 1 << cn_result.config;
2186 
2187  tprintf("Static Shape ID: %d\n", shape_id);
2188  ShowMatchDisplay();
2190  &config_mask, num_features, features, &cn_result,
2194 #endif // GRAPHICS_DISABLED
2195 } /* ShowBestMatchFor */
void UpdateMatchDisplay()
Definition: intproto.cpp:451
BIT_VECTOR AllProtosOn
Definition: classify.h:521
bool matcher_debug_separate_windows
Definition: classify.h:499
#define UnusedClassIdIn(T, c)
Definition: intproto.h:175
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int classify_adapt_feature_threshold
Definition: classify.h:488
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
#define ClassForClassId(T, c)
Definition: intproto.h:176
#define NO_DEBUG
Definition: adaptmatch.cpp:80
IntegerMatcher im_
Definition: classify.h:544

◆ ShowMatchDisplay()

void tesseract::Classify::ShowMatchDisplay ( )

This routine sends the shapes in the global display lists to the match debugger window.

Globals:

  • FeatureShapes display list containing feature matches
  • ProtoShapes display list containing proto matches
    Returns
    none

Definition at line 973 of file intproto.cpp.

973  {
975  if (ProtoDisplayWindow) {
977  }
978  if (FeatureDisplayWindow) {
980  }
982  static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
986  if (ProtoDisplayWindow) {
989  }
990  if (FeatureDisplayWindow) {
993  }
994 } /* ShowMatchDisplay */
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:998
#define INT_MIN_Y
Definition: intproto.cpp:61
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:760
ScrollView * FeatureDisplayWindow
Definition: intproto.cpp:177
#define INT_MAX_X
Definition: intproto.cpp:62
ScrollView * ProtoDisplayWindow
Definition: intproto.cpp:178
#define INT_MIN_X
Definition: intproto.cpp:60
ScrollView * IntMatchWindow
Definition: intproto.cpp:176
void Clear()
Definition: scrollview.cpp:591
void InitIntMatchWindowIfReqd()
Definition: intproto.cpp:1748
#define INT_MAX_Y
Definition: intproto.cpp:63

◆ StartBackupAdaptiveClassifier()

void tesseract::Classify::StartBackupAdaptiveClassifier ( )

Definition at line 630 of file adaptmatch.cpp.

630  {
631  if (BackupAdaptedTemplates != nullptr)
634 }
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518

◆ SwitchAdaptiveClassifier()

void tesseract::Classify::SwitchAdaptiveClassifier ( )

Definition at line 614 of file adaptmatch.cpp.

614  {
615  if (BackupAdaptedTemplates == nullptr) {
617  return;
618  }
620  tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
621  NumAdaptationsFailed);
622  }
625  BackupAdaptedTemplates = nullptr;
626  NumAdaptationsFailed = 0;
627 }
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
int classify_learning_debug_level
Definition: classify.h:460
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599

◆ TempConfigReliable()

bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG & config 
)

Definition at line 2241 of file adaptmatch.cpp.

2242  {
2243  if (classify_learning_debug_level >= 1) {
2244  tprintf("NumTimesSeen for config of %s is %d\n",
2245  getDict().getUnicharset().debug_str(class_id).string(),
2246  config->NumTimesSeen);
2247  }
2249  return true;
2250  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2251  return false;
2252  } else if (use_ambigs_for_adaption) {
2253  // Go through the ambigs vector and see whether we have already seen
2254  // enough times all the characters represented by the ambigs vector.
2255  const UnicharIdVector *ambigs =
2257  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2258  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2259  ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2260  assert(ambig_class != nullptr);
2261  if (ambig_class->NumPermConfigs == 0 &&
2262  ambig_class->MaxNumTimesSeen <
2264  if (classify_learning_debug_level >= 1) {
2265  tprintf("Ambig %s has not been seen enough times,"
2266  " not making config for %s permanent\n",
2267  getDict().getUnicharset().debug_str(
2268  (*ambigs)[ambig]).string(),
2269  getDict().getUnicharset().debug_str(class_id).string());
2270  }
2271  return false;
2272  }
2273  }
2274  }
2275  return true;
2276 }
int size() const
Definition: genericvector.h:71
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
bool use_ambigs_for_adaption
Definition: ccutil.h:88
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:190
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
uint8_t NumPermConfigs
Definition: adaptive.h:64
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
uint8_t NumTimesSeen
Definition: adaptive.h:41
int classify_learning_debug_level
Definition: classify.h:460
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:471
int matcher_min_examples_for_prototyping
Definition: classify.h:469
uint8_t MaxNumTimesSeen
Definition: adaptive.h:65
virtual Dict & getDict()
Definition: classify.h:107

◆ UpdateAmbigsGroup()

void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
TBLOB * Blob 
)

Definition at line 2278 of file adaptmatch.cpp.

2278  {
2279  const UnicharIdVector *ambigs =
2281  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2282  if (classify_learning_debug_level >= 1) {
2283  tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2284  getDict().getUnicharset().debug_str(class_id).string(), class_id);
2285  }
2286  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2287  CLASS_ID ambig_class_id = (*ambigs)[ambig];
2288  const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2289  for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2290  if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2291  const TEMP_CONFIG config =
2292  TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2293  if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2294  if (classify_learning_debug_level >= 1) {
2295  tprintf("Making config %d of %s permanent\n", cfg,
2296  getDict().getUnicharset().debug_str(
2297  ambig_class_id).string());
2298  }
2299  MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2300  }
2301  }
2302  }
2303 }
int size() const
Definition: genericvector.h:71
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
int classify_learning_debug_level
Definition: classify.h:460
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:92
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:199
virtual Dict & getDict()
Definition: classify.h:107

◆ WriteAdaptedTemplates()

void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine saves Templates to File in a binary format.

Parameters
File: open file to write Templates to
Templates: set of adapted templates to write to File
Note
Globals: none

Definition at line 454 of file adaptive.cpp.

454  {
455  int i;
456 
457  /* first write the high level adaptive template struct */
458  fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
459 
460  /* then write out the basic integer templates */
461  WriteIntTemplates (File, Templates->Templates, unicharset);
462 
463  /* then write out the adaptive info for each class */
464  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
465  WriteAdaptedClass (File, Templates->Class[i],
466  Templates->Templates->Class[i]->NumConfigs);
467  }
468 } /* WriteAdaptedTemplates */
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
uint8_t NumConfigs
Definition: intproto.h:108
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
UNICHARSET unicharset
Definition: ccutil.h:68
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1030
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs)
Definition: adaptive.cpp:410
INT_TEMPLATES Templates
Definition: adaptive.h:76

◆ WriteIntTemplates()

void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES  Templates,
const UNICHARSET & target_unicharset 
)

This routine writes Templates to File. The format is an efficient binary format. File must already be open for writing.

Parameters
File: open file to write templates to
Templates: templates to save into File
target_unicharset: the UNICHARSET to use
Returns
none
Note
Globals: none

Definition at line 1030 of file intproto.cpp.

1031  {
1032  int i, j;
1033  INT_CLASS Class;
1034  int unicharset_size = target_unicharset.size();
1035  int version_id = -5; // When negated by the reader -1 becomes +1 etc.
1036 
1037  if (Templates->NumClasses != unicharset_size) {
1038  cprintf("Warning: executing WriteIntTemplates() with %d classes in"
1039  " Templates, while target_unicharset size is %d\n",
1040  Templates->NumClasses, unicharset_size);
1041  }
1042 
1043  /* first write the high level template struct */
1044  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
1045  fwrite(&version_id, sizeof(version_id), 1, File);
1046  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
1047  1, File);
1048  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
1049 
1050  /* then write out the class pruners */
1051  for (i = 0; i < Templates->NumClassPruners; i++)
1052  fwrite(Templates->ClassPruners[i],
1053  sizeof(CLASS_PRUNER_STRUCT), 1, File);
1054 
1055  /* then write out each class */
1056  for (i = 0; i < Templates->NumClasses; i++) {
1057  Class = Templates->Class[i];
1058 
1059  /* first write out the high level struct for the class */
1060  fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
1061  fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
1062  ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
1063  fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
1064  for (j = 0; j < Class->NumConfigs; ++j) {
1065  fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
1066  }
1067 
1068  /* then write out the proto lengths */
1069  if (MaxNumIntProtosIn (Class) > 0) {
1070  fwrite(Class->ProtoLengths, sizeof(uint8_t),
1071  MaxNumIntProtosIn(Class), File);
1072  }
1073 
1074  /* then write out the proto sets */
1075  for (j = 0; j < Class->NumProtoSets; j++)
1076  fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
1077 
1078  /* then write the fonts info */
1079  fwrite(&Class->font_set_id, sizeof(int), 1, File);
1080  }
1081 
1082  /* Write the fonts info tables */
1084  this->fontinfo_table_.write(File,
1087 } /* WriteIntTemplates */
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:197
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:162
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
uint8_t NumProtoSets
Definition: intproto.h:107
int size() const
Definition: unicharset.h:336
uint8_t * ProtoLengths
Definition: intproto.h:110
PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS]
Definition: intproto.h:109
uint8_t NumConfigs
Definition: intproto.h:108
#define MaxNumIntProtosIn(C)
Definition: intproto.h:163
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
uint16_t NumProtos
Definition: intproto.h:106
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:231
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ WriteTRFile()

bool tesseract::Classify::WriteTRFile ( const STRING & filename)

Definition at line 102 of file blobclass.cpp.

102  {
103  bool result = false;
104  STRING tr_filename = filename + ".tr";
105  FILE* fp = fopen(tr_filename.string(), "wb");
106  if (fp) {
107  result =
108  tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
109  fclose(fp);
110  }
111  tr_file_data_.truncate_at(0);
112  return result;
113 }
const char * string() const
Definition: strngs.cpp:196
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
void truncate_at(int32_t index)
Definition: strngs.cpp:267
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:191

Member Data Documentation

◆ AdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::AdaptedTemplates

Definition at line 514 of file classify.h.

◆ AllConfigsOff

BIT_VECTOR tesseract::Classify::AllConfigsOff

Definition at line 523 of file classify.h.

◆ AllConfigsOn

BIT_VECTOR tesseract::Classify::AllConfigsOn

Definition at line 522 of file classify.h.

◆ allow_blob_division

bool tesseract::Classify::allow_blob_division = true

"Use divisible blobs chopping"

Definition at line 423 of file classify.h.

◆ AllProtosOn

BIT_VECTOR tesseract::Classify::AllProtosOn

Definition at line 521 of file classify.h.

◆ BackupAdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::BackupAdaptedTemplates

Definition at line 518 of file classify.h.

◆ certainty_scale

double tesseract::Classify::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 478 of file classify.h.

◆ classify_adapt_feature_threshold

int tesseract::Classify::classify_adapt_feature_threshold = 230

"Threshold for good features during adaptive 0-255"

Definition at line 488 of file classify.h.

◆ classify_adapt_proto_threshold

int tesseract::Classify::classify_adapt_proto_threshold = 230

"Threshold for good protos during adaptive 0-255"

Definition at line 486 of file classify.h.

◆ classify_adapted_pruning_factor

double tesseract::Classify::classify_adapted_pruning_factor = 2.5

"Prune poor adapted results this much worse than best result"

Definition at line 482 of file classify.h.

◆ classify_adapted_pruning_threshold

double tesseract::Classify::classify_adapted_pruning_threshold = -1.0

"Threshold at which classify_adapted_pruning_factor starts"

Definition at line 484 of file classify.h.

◆ classify_bln_numeric_mode

bool tesseract::Classify::classify_bln_numeric_mode = 0

"Assume the input is numbers [0-9]."

Definition at line 541 of file classify.h.

◆ classify_char_norm_range

double tesseract::Classify::classify_char_norm_range = 0.2

"Character Normalization Range ..."

Definition at line 437 of file classify.h.

◆ classify_character_fragments_garbage_certainty_threshold

double tesseract::Classify::classify_character_fragments_garbage_certainty_threshold = -3.0

"Exclude fragments that do not match any whole character with at least this certainty"

Definition at line 494 of file classify.h.

◆ classify_class_pruner_multiplier

int tesseract::Classify::classify_class_pruner_multiplier = 15

"Class Pruner Multiplier 0-255: "

Definition at line 506 of file classify.h.

◆ classify_class_pruner_threshold

int tesseract::Classify::classify_class_pruner_threshold = 229

"Class Pruner Threshold 0-255"

Definition at line 504 of file classify.h.

◆ classify_cp_cutoff_strength

int tesseract::Classify::classify_cp_cutoff_strength = 7

"Class Pruner CutoffStrength: "

Definition at line 508 of file classify.h.

◆ classify_debug_character_fragments

bool tesseract::Classify::classify_debug_character_fragments = FALSE

"Bring up graphical debugging windows for fragments training"

Definition at line 496 of file classify.h.

◆ classify_debug_level

int tesseract::Classify::classify_debug_level = 0

"Classify debug level"

Definition at line 431 of file classify.h.

◆ classify_enable_adaptive_debugger

bool tesseract::Classify::classify_enable_adaptive_debugger = 0

"Enable match debugger"

Definition at line 455 of file classify.h.

◆ classify_enable_adaptive_matcher

bool tesseract::Classify::classify_enable_adaptive_matcher = 1

"Enable adaptive classifier"

Definition at line 450 of file classify.h.

◆ classify_enable_learning

bool tesseract::Classify::classify_enable_learning = true

"Enable adaptive classifier"

Definition at line 430 of file classify.h.

◆ classify_integer_matcher_multiplier

int tesseract::Classify::classify_integer_matcher_multiplier = 10

"Integer Matcher Multiplier 0-255: "

Definition at line 510 of file classify.h.

◆ classify_learn_debug_str

char* tesseract::Classify::classify_learn_debug_str = ""

"Class str to debug learning"

Definition at line 500 of file classify.h.

◆ classify_learning_debug_level

int tesseract::Classify::classify_learning_debug_level = 0

"Learning Debug Level: "

Definition at line 460 of file classify.h.

◆ classify_max_certainty_margin

double tesseract::Classify::classify_max_certainty_margin = 5.5

"Veto difference between classifier certainties"

Definition at line 445 of file classify.h.

◆ classify_max_norm_scale_x

double tesseract::Classify::classify_max_norm_scale_x = 0.325

"Max char x-norm scale ..."

Definition at line 439 of file classify.h.

◆ classify_max_norm_scale_y

double tesseract::Classify::classify_max_norm_scale_y = 0.325

"Max char y-norm scale ..."

Definition at line 441 of file classify.h.

◆ classify_max_rating_ratio

double tesseract::Classify::classify_max_rating_ratio = 1.5

"Veto ratio between classifier ratings"

Definition at line 443 of file classify.h.

◆ classify_min_norm_scale_x

double tesseract::Classify::classify_min_norm_scale_x = 0.0

"Min char x-norm scale ..."

Definition at line 438 of file classify.h.

◆ classify_min_norm_scale_y

double tesseract::Classify::classify_min_norm_scale_y = 0.0

"Min char y-norm scale ..."

Definition at line 440 of file classify.h.

◆ classify_misfit_junk_penalty

double tesseract::Classify::classify_misfit_junk_penalty = 0.0

"Penalty to apply when a non-alnum is vertically out of its expected textline position"

Definition at line 476 of file classify.h.

◆ classify_nonlinear_norm

bool tesseract::Classify::classify_nonlinear_norm = 0

"Non-linear stroke-density normalization"

Definition at line 457 of file classify.h.

◆ classify_norm_method

int tesseract::Classify::classify_norm_method = character

"Normalization Method ..."

Definition at line 435 of file classify.h.

◆ classify_save_adapted_templates

bool tesseract::Classify::classify_save_adapted_templates = 0

"Save adapted templates to a file"

Definition at line 454 of file classify.h.

◆ classify_use_pre_adapted_templates

bool tesseract::Classify::classify_use_pre_adapted_templates = 0

"Use pre-adapted classifier templates"

Definition at line 452 of file classify.h.

◆ disable_character_fragments

bool tesseract::Classify::disable_character_fragments = TRUE

"Do not include character fragments in the results of the classifier"

Definition at line 491 of file classify.h.

◆ EnableLearning

bool tesseract::Classify::EnableLearning

Definition at line 525 of file classify.h.

◆ feature_defs_

FEATURE_DEFS_STRUCT tesseract::Classify::feature_defs_
protected

Definition at line 548 of file classify.h.

◆ fontinfo_table_

UnicityTable<FontInfo> tesseract::Classify::fontinfo_table_

Definition at line 529 of file classify.h.

◆ fontset_table_

UnicityTable<FontSet> tesseract::Classify::fontset_table_

Definition at line 537 of file classify.h.

◆ il1_adaption_test

int tesseract::Classify::il1_adaption_test = 0

"Don't adapt to i/I at beginning of word"

Definition at line 539 of file classify.h.

◆ im_

IntegerMatcher tesseract::Classify::im_
protected

Definition at line 544 of file classify.h.

◆ matcher_avg_noise_size

double tesseract::Classify::matcher_avg_noise_size = 12.0

"Avg. noise blob length: "

Definition at line 466 of file classify.h.

◆ matcher_bad_match_pad

double tesseract::Classify::matcher_bad_match_pad = 0.15

"Bad Match Pad (0-1)"

Definition at line 464 of file classify.h.

◆ matcher_clustering_max_angle_delta

double tesseract::Classify::matcher_clustering_max_angle_delta = 0.015

"Maximum angle delta for prototype clustering"

Definition at line 473 of file classify.h.

◆ matcher_debug_flags

int tesseract::Classify::matcher_debug_flags = 0

"Matcher Debug Flags"

Definition at line 459 of file classify.h.

◆ matcher_debug_level

int tesseract::Classify::matcher_debug_level = 0

"Matcher Debug Level"

Definition at line 458 of file classify.h.

◆ matcher_debug_separate_windows

bool tesseract::Classify::matcher_debug_separate_windows = FALSE

"Use two different windows for debugging the matching: One for the protos and one for the features."

Definition at line 499 of file classify.h.

◆ matcher_good_threshold

double tesseract::Classify::matcher_good_threshold = 0.125

"Good Match (0-1)"

Definition at line 461 of file classify.h.

◆ matcher_min_examples_for_prototyping

int tesseract::Classify::matcher_min_examples_for_prototyping = 3

"Reliable Config Threshold"

Definition at line 469 of file classify.h.

◆ matcher_perfect_threshold

double tesseract::Classify::matcher_perfect_threshold = 0.02

"Perfect Match (0-1)"

Definition at line 463 of file classify.h.

◆ matcher_permanent_classes_min

int tesseract::Classify::matcher_permanent_classes_min = 1

"Min # of permanent classes"

Definition at line 467 of file classify.h.

◆ matcher_rating_margin

double tesseract::Classify::matcher_rating_margin = 0.1

"New template margin (0-1)"

Definition at line 465 of file classify.h.

◆ matcher_reliable_adaptive_result

double tesseract::Classify::matcher_reliable_adaptive_result = 0.0

"Great Match (0-1)"

Definition at line 462 of file classify.h.

◆ matcher_sufficient_examples_for_prototyping

int tesseract::Classify::matcher_sufficient_examples_for_prototyping = 5

"Enable adaption even if the ambiguities have not been seen"

Definition at line 471 of file classify.h.

◆ NormProtos

NORM_PROTOS* tesseract::Classify::NormProtos

Definition at line 527 of file classify.h.

◆ PreTrainedTemplates

INT_TEMPLATES tesseract::Classify::PreTrainedTemplates

Definition at line 510 of file classify.h.

◆ prioritize_division

bool tesseract::Classify::prioritize_division = FALSE

"Prioritize blob division over chopping"

Definition at line 428 of file classify.h.

◆ rating_scale

double tesseract::Classify::rating_scale = 1.5

"Rating scaling factor"

Definition at line 477 of file classify.h.

◆ shape_table_

ShapeTable* tesseract::Classify::shape_table_
protected

Definition at line 553 of file classify.h.

◆ speckle_large_max_size

double tesseract::Classify::speckle_large_max_size = 0.30

"Max large speckle size"

Definition at line 542 of file classify.h.

◆ speckle_rating_penalty

double tesseract::Classify::speckle_rating_penalty = 10.0

"Penalty to add to worst rating for noise"

Definition at line 544 of file classify.h.

◆ TempProtoMask

BIT_VECTOR tesseract::Classify::TempProtoMask

Definition at line 524 of file classify.h.

◆ tess_bn_matching

bool tesseract::Classify::tess_bn_matching = 0

"Baseline Normalized Matching"

Definition at line 449 of file classify.h.

◆ tess_cn_matching

bool tesseract::Classify::tess_cn_matching = 0

"Character Normalized Matching"

Definition at line 448 of file classify.h.

◆ tessedit_class_miss_scale

double tesseract::Classify::tessedit_class_miss_scale = 0.00390625

"Scale factor for features not used"

Definition at line 480 of file classify.h.

◆ tessedit_single_match

int tesseract::Classify::tessedit_single_match = FALSE

"Top choice only from CP"

Definition at line 429 of file classify.h.


The documentation for this class was generated from the following files: