tesseract  5.0.0-alpha-619-ge9db
classify.cpp
Go to the documentation of this file.
1 // File: classify.cpp
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "classify.h"
20 
21 #ifdef DISABLED_LEGACY_ENGINE
22 
23 #include <string.h>
24 
25 namespace tesseract {
26 
28  :
29  INT_MEMBER(classify_debug_level, 0, "Classify debug level",
30  this->params()),
31 
32  BOOL_MEMBER(classify_bln_numeric_mode, 0,
33 "Assume the input is numbers [0-9].", this->params()),
34 
35  double_MEMBER(classify_max_rating_ratio, 1.5,
36  "Veto ratio between classifier ratings", this->params()),
37 
38  double_MEMBER(classify_max_certainty_margin, 5.5,
39  "Veto difference between classifier certainties",
40  this->params()),
41 
42  dict_(this) {}
43 
45 
46 } // namespace tesseract
47 
48 #else // DISABLED_LEGACY_ENGINE not defined
49 
50 #include "fontinfo.h"
51 #include "intproto.h"
52 #include "mfoutline.h"
53 #include "scrollview.h"
54 #include "shapeclassifier.h"
55 #include "shapetable.h"
56 #include "unicity_table.h"
57 #include <cstring>
58 
59 namespace tesseract {
// Member-initializer list and body of the Classify constructor for builds in
// which DISABLED_LEGACY_ENGINE is not defined. Each *_MEMBER entry supplies a
// parameter name, its default value, a description string and this->params()
// as the macro's vec argument.
// NOTE(review): the `Classify::Classify()` declarator line that must precede
// this initializer list is not visible in this listing - confirm against the
// real file.
61  : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
62  this->params()),
63  BOOL_MEMBER(prioritize_division, false,
64  "Prioritize blob division over chopping", this->params()),
65  BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
66  this->params()),
67  INT_MEMBER(classify_debug_level, 0, "Classify debug level",
68  this->params()),
69  INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
70  this->params()),
71  double_MEMBER(classify_char_norm_range, 0.2,
72  "Character Normalization Range ...", this->params()),
73  double_MEMBER(classify_max_rating_ratio, 1.5,
74  "Veto ratio between classifier ratings", this->params()),
75  double_MEMBER(classify_max_certainty_margin, 5.5,
76  "Veto difference between classifier certainties",
77  this->params()),
78  BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
79  this->params()),
80  BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
81  this->params()),
82  BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
83  "Enable adaptive classifier", this->params()),
84  BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
85  "Use pre-adapted classifier templates", this->params()),
86  BOOL_MEMBER(classify_save_adapted_templates, 0,
87  "Save adapted templates to a file", this->params()),
88  BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
89  this->params()),
90  BOOL_MEMBER(classify_nonlinear_norm, 0,
91  "Non-linear stroke-density normalization", this->params()),
92  INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
93  INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
94  INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
95  this->params()),
96  double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
97  this->params()),
98  double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
99  this->params()),
100  double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
101  this->params()),
102  double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
103  this->params()),
104  double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
105  this->params()),
106  double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
107  this->params()),
108  INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
109  this->params()),
110  INT_MEMBER(matcher_min_examples_for_prototyping, 3,
111  "Reliable Config Threshold", this->params()),
112  INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
113  "Enable adaption even if the ambiguities have not been seen",
114  this->params()),
115  double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
116  "Maximum angle delta for prototype clustering",
117  this->params()),
118  double_MEMBER(classify_misfit_junk_penalty, 0.0,
119  "Penalty to apply when a non-alnum is vertically out of "
120  "its expected textline position",
121  this->params()),
122  double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
123  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
124  this->params()),
125  double_MEMBER(tessedit_class_miss_scale, 0.00390625,
126  "Scale factor for features not used", this->params()),
// NOTE(review): the macro opener for the next entry is not visible in this
// listing; the 2.5 default suggests double_MEMBER( - confirm.
128  classify_adapted_pruning_factor, 2.5,
129  "Prune poor adapted results this much worse than best result",
130  this->params()),
131  double_MEMBER(classify_adapted_pruning_threshold, -1.0,
132  "Threshold at which classify_adapted_pruning_factor starts",
133  this->params()),
134  INT_MEMBER(classify_adapt_proto_threshold, 230,
135  "Threshold for good protos during adaptive 0-255",
136  this->params()),
137  INT_MEMBER(classify_adapt_feature_threshold, 230,
138  "Threshold for good features during adaptive 0-255",
139  this->params()),
// NOTE(review): the opener (macro, name and default) for the next entry is
// not visible in this listing; presumably the bool parameter
// disable_character_fragments - confirm name and default value.
141  "Do not include character fragments in the"
142  " results of the classifier",
143  this->params()),
144  double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
145  -3.0,
146  "Exclude fragments that do not look like whole"
147  " characters from training and adaption",
148  this->params()),
149  BOOL_MEMBER(classify_debug_character_fragments, false,
150  "Bring up graphical debugging windows for fragments training",
151  this->params()),
152  BOOL_MEMBER(matcher_debug_separate_windows, false,
153  "Use two different windows for debugging the matching: "
154  "One for the protos and one for the features.",
155  this->params()),
156  STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
157  this->params()),
158  INT_MEMBER(classify_class_pruner_threshold, 229,
159  "Class Pruner Threshold 0-255", this->params()),
160  INT_MEMBER(classify_class_pruner_multiplier, 15,
161  "Class Pruner Multiplier 0-255: ", this->params()),
162  INT_MEMBER(classify_cp_cutoff_strength, 7,
163  "Class Pruner CutoffStrength: ", this->params()),
// NOTE(review): the opener (macro, name and default) for the next entry is
// not visible in this listing; presumably the int parameter
// classify_integer_matcher_multiplier - confirm name and default value.
165  "Integer Matcher Multiplier 0-255: ", this->params()),
166  BOOL_MEMBER(classify_bln_numeric_mode, 0,
167  "Assume the input is numbers [0-9].", this->params()),
168  double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
169  this->params()),
170  double_MEMBER(speckle_rating_penalty, 10.0,
171  "Penalty to add to worst rating for noise", this->params()),
// im_ is initialized with the address of classify_debug_level and dict_ with
// a pointer to this object.
172  im_(&classify_debug_level),
173  dict_(this) {
174  using namespace std::placeholders; // for _1, _2
// Install the compare and clear callbacks on the font-info and font-set
// tables; _1/_2 forward the table's arguments to the bound free functions.
175  fontinfo_table_.set_compare_callback(std::bind(CompareFontInfo, _1, _2));
176  fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
177  fontset_table_.set_compare_callback(std::bind(CompareFontSet, _1, _2));
178  fontset_table_.set_clear_callback(std::bind(FontSetDeleteCallback, _1));
179 
// NOTE(review): the symbol index of this listing references InitFeatureDefs
// and feature_defs_, which suggests a call such as
// InitFeatureDefs(&feature_defs_) belongs here but is not visible - confirm.
181 }
182 
185  delete learn_debug_win_;
186  delete learn_fragmented_word_debug_win_;
187  delete learn_fragments_debug_win_;
188 }
189 
190 
191 // Takes ownership of the given classifier, and uses it for future calls
192 // to CharNormClassifier.
194  delete static_classifier_;
195  static_classifier_ = static_classifier;
196 }
197 
198 // Moved from speckle.cpp
199 // Adds a noise classification result that is a bit worse than the worst
200 // current result, or the worst possible result if no current results.
201 void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
202  BLOB_CHOICE_IT bc_it(choices);
203  // If there is no classifier result, we will use the worst possible certainty
204  // and corresponding rating.
205  float certainty = -getDict().certainty_scale;
206  float rating = rating_scale * blob_length;
207  if (!choices->empty() && blob_length > 0) {
208  bc_it.move_to_last();
209  BLOB_CHOICE* worst_choice = bc_it.data();
210  // Add speckle_rating_penalty to worst rating, matching old value.
211  rating = worst_choice->rating() + speckle_rating_penalty;
212  // Compute the rating to correspond to the certainty. (Used to be kept
213  // the same, but that messes up the language model search.)
214  certainty = -rating * getDict().certainty_scale /
215  (rating_scale * blob_length);
216  }
217  auto* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
218  -1, 0.0f, FLT_MAX, 0,
220  bc_it.add_to_end(blob_choice);
221 }
222 
223 // Returns true if the blob is small enough to be a large speckle.
224 bool Classify::LargeSpeckle(const TBLOB &blob) {
225  double speckle_size = kBlnXHeight * speckle_large_max_size;
226  TBOX bbox = blob.bounding_box();
227  return bbox.width() < speckle_size && bbox.height() < speckle_size;
228 }
229 
230 } // namespace tesseract
231 
232 #endif // def DISABLED_LEGACY_ENGINE
tesseract::FontInfoDeleteCallback
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:141
tesseract::Classify::SetStaticClassifier
void SetStaticClassifier(ShapeClassifier *static_classifier)
Definition: classify.cpp:193
InitFeatureDefs
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:111
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
unicity_table.h
tesseract::Classify::fontinfo_table_
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
mfoutline.h
tesseract::Classify::EndAdaptiveClassifier
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459
classify_integer_matcher_multiplier
int classify_integer_matcher_multiplier
TBOX::height
int16_t height() const
Definition: rect.h:107
tesseract::Classify::getDict
virtual Dict & getDict()
Definition: classify.h:107
STRING_MEMBER
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:318
tesseract::ShapeClassifier
Definition: shapeclassifier.h:43
tesseract::Classify::speckle_large_max_size
double speckle_large_max_size
Definition: classify.h:509
tesseract::FontSetDeleteCallback
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:150
UNICHAR_SPACE
Definition: unicharset.h:34
shapetable.h
tesseract::Classify::Classify
Classify()
Definition: classify.cpp:60
tesseract::Dict::certainty_scale
double certainty_scale
Definition: dict.h:627
TBOX::width
int16_t width() const
Definition: rect.h:114
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
tesseract::Classify::feature_defs_
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
character
Definition: mfoutline.h:62
tesseract::Classify::AddLargeSpeckleTo
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:201
tesseract
Definition: baseapi.h:65
fontinfo.h
tesseract::CompareFontSet
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:130
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::Classify::speckle_rating_penalty
double speckle_rating_penalty
Definition: classify.h:511
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
shapeclassifier.h
BLOB_CHOICE
Definition: ratngs.h:49
TBLOB
Definition: blobs.h:282
tesseract::CompareFontInfo
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:122
tesseract::Classify::LargeSpeckle
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:224
tesseract::Classify::rating_scale
double rating_scale
Definition: classify.h:472
intproto.h
disable_character_fragments
bool disable_character_fragments
tesseract::Classify::~Classify
~Classify() override
Definition: classify.cpp:183
classify.h
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
tesseract::Classify::fontset_table_
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
scrollview.h
BCC_SPECKLE_CLASSIFIER
Definition: ratngs.h:44
TBOX
Definition: rect.h:33