tesseract  5.0.0-alpha-619-ge9db
trainingsampleset.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H_
17 #define TESSERACT_TRAINING_TRAININGSAMPLESET_H_
18 
19 #include "bitvector.h"
21 #include "indexmapbidi.h"
22 #include "matrix.h"
23 #include "shapetable.h"
24 #include "trainingsample.h"
25 
26 class UNICHARSET;
27 
28 namespace tesseract {
29 
30 struct FontInfo;
31 class FontInfoTable;
32 class IntFeatureMap;
33 class IntFeatureSpace;
34 class TrainingSample;
35 struct UnicharAndFonts;
36 
37 // Collection of TrainingSample used for training or testing a classifier.
38 // Provides several useful methods to operate on the collection as a whole,
39 // including outlier detection and deletion, providing access by font and
40 // class, finding the canonical sample, finding the "cloud" features (OR of
41 // all features in all samples), replication of samples, caching of distance
42 // metrics.
44  public:
47 
48  // Writes to the given file. Returns false in case of error.
49  bool Serialize(FILE* fp) const;
50  // Reads from the given file. Returns false in case of error.
51  // If swap is true, assumes a big/little-endian swap is needed.
52  bool DeSerialize(bool swap, FILE* fp);
53 
54  // Accessors
55  int num_samples() const {  // Current number of entries in samples_.
56  return samples_.size();
57  }
58  int num_raw_samples() const {  // Sample count before replication/randomization.
59  return num_raw_samples_;
60  }
61  int NumFonts() const {  // Sparse size of font_id_map_ (sparse space = real font_id).
62  return font_id_map_.SparseSize();
63  }
64  const UNICHARSET& unicharset() const {  // Character set we are training for.
65  return unicharset_;
66  }
67  int charsetsize() const {  // Character set size the font/class 2-d array refers to.
68  return unicharset_size_;
69  }
70  const FontInfoTable& fontinfo_table() const {  // Held by reference; not serialized.
71  return fontinfo_table_;
72  }
73 
74  // Loads an initial unicharset, or sets one up if the file cannot be read.
75  void LoadUnicharset(const char* filename);
76 
77  // Adds a character sample to this sample set.
78  // If the unichar is not already in the local unicharset, it is added.
79  // Returns the unichar_id of the added sample, from the local unicharset.
80  int AddSample(const char* unichar, TrainingSample* sample);
81  // Adds a character sample to this sample set with the given unichar_id,
82  // which must correspond to the local unicharset (in this).
83  void AddSample(int unichar_id, TrainingSample* sample);
84 
85  // Returns the number of samples for the given font,class pair.
86  // If randomize is true, returns the number of samples accessible
87  // with randomizing on. (Increases the number of samples if small.)
88  // OrganizeByFontAndClass must have been already called.
89  int NumClassSamples(int font_id, int class_id, bool randomize) const;
90 
91  // Gets a sample by its index.
92  const TrainingSample* GetSample(int index) const;
93 
94  // Gets a sample by its font, class, index.
95  // OrganizeByFontAndClass must have been already called.
96  const TrainingSample* GetSample(int font_id, int class_id, int index) const;
97 
98  // Gets a mutable sample by its font, class, index. Does not randomize.
99  // OrganizeByFontAndClass must have been already called.
100  TrainingSample* MutableSample(int font_id, int class_id, int index);
101 
102  // Returns a string debug representation of the given sample:
103  // font, unichar_str, bounding box, page.
105 
106  // Gets the combined set of features used by all the samples of the given
107  // font/class combination.
108  const BitVector& GetCloudFeatures(int font_id, int class_id) const;
109  // Gets the indexed features of the canonical sample of the given
110  // font/class combination.
111  const GenericVector<int>& GetCanonicalFeatures(int font_id,
112  int class_id) const;
113 
114  // Returns the distance between the given UnicharAndFonts pair.
115  // If matched_fonts, only matching fonts are considered, unless that yields
116  // the empty set.
117  // OrganizeByFontAndClass must have been already called.
118  float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
119  bool matched_fonts, const IntFeatureMap& feature_map);
120 
121  // Returns the distance between the given pair of font/class pairs.
122  // Finds in cache or computes and caches.
123  // OrganizeByFontAndClass must have been already called.
124  float ClusterDistance(int font_id1, int class_id1,
125  int font_id2, int class_id2,
126  const IntFeatureMap& feature_map);
127 
128  // Computes the distance between the given pair of font/class pairs.
129  float ComputeClusterDistance(int font_id1, int class_id1,
130  int font_id2, int class_id2,
131  const IntFeatureMap& feature_map) const;
132 
133  // Returns the number of canonical features of font/class 2 for which
134  // neither the feature nor any of its near neighbors occurs in the cloud
135  // of font/class 1. Each such feature is a reliable separation between
136  // the classes, ASSUMING that the canonical sample is sufficiently
137  // representative that every sample has a feature near that particular
138  // feature. To check that this is so on the fly would be prohibitively
139  // expensive, but it might be possible to pre-qualify the canonical features
140  // to include only those for which this assumption is true.
141  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
142  // first, or the results will be nonsense.
143  int ReliablySeparable(int font_id1, int class_id1,
144  int font_id2, int class_id2,
145  const IntFeatureMap& feature_map,
146  bool thorough) const;
147 
148 
149  // Returns the total index of the requested sample.
150  // OrganizeByFontAndClass must have been already called.
151  int GlobalSampleIndex(int font_id, int class_id, int index) const;
152 
153  // Gets the canonical sample for the given font, class pair.
154  // ComputeCanonicalSamples must have been called first.
155  const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
156  // Gets the max distance for the given canonical sample.
157  // ComputeCanonicalSamples must have been called first.
158  float GetCanonicalDist(int font_id, int class_id) const;
159 
160  // Returns a mutable pointer to the sample with the given index.
162  return samples_[index];
163  }
164  // Gets ownership of the sample with the given index, removing it from this.
166  TrainingSample* sample = samples_[index];
167  samples_[index] = nullptr;
168  return sample;
169  }
170 
171  // Generates indexed features for all samples with the supplied feature_space.
172  void IndexFeatures(const IntFeatureSpace& feature_space);
173 
174  // Marks the given sample for deletion.
175  // Deletion is actually completed by DeleteDeadSamples.
177 
178  // Deletes all samples with a negative sample index marked by KillSample.
179  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
180  // must be called after as the samples have been renumbered.
181  void DeleteDeadSamples();
182 
183  // Callback function returns true if the given sample is to be deleted, due
184  // to having a negative classid.
186 
187  // Construct an array to access the samples by font,class pair.
188  void OrganizeByFontAndClass();
189 
190  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
191  // index for the font_class_array_.
192  void SetupFontIdMap();
193 
194  // Finds the sample for each font, class pair that has least maximum
195  // distance to all the other samples of the same font, class.
196  // OrganizeByFontAndClass must have been already called.
197  void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
198 
199  // Replicates the samples to a minimum frequency defined by
200  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
201  // After replication, the replicated samples are perturbed slightly, but
202  // in a predictable and repeatable way.
203  // Use after OrganizeByFontAndClass().
205 
206  // Caches the indexed features of the canonical samples.
207  // ComputeCanonicalSamples must have been already called.
209  // Computes the combined set of features used by all the samples of each
210  // font/class combination. Use after ReplicateAndRandomizeSamples.
211  void ComputeCloudFeatures(int feature_space_size);
212 
213  // Adds all fonts of the given class to the shape.
214  void AddAllFontsForClass(int class_id, Shape* shape) const;
215 
216  // Display the samples with the given indexed feature that also match
217  // the given shape.
218  void DisplaySamplesWithFeature(int f_index, const Shape& shape,
219  const IntFeatureSpace& feature_space,
220  ScrollView::Color color,
221  ScrollView* window) const;
222 
223  private:
224  // Entry type of FontClassInfo::distance_cache: a (unichar, font, distance) triplet.
225  struct FontClassDistance {
226  int unichar_id;  // Unichar id this cached distance refers to.
227  int font_id; // Real (sparse) font id, not the compact index from font_id_map_.
228  float distance;  // The cached distance value.
229  };
230  // Simple struct to store information related to each font/class combination.
231  struct FontClassInfo {
232  FontClassInfo();  // Declared only; the body is not in this header.
233 
234  // Writes to the given file. Returns false in case of error.
235  bool Serialize(FILE* fp) const;
236  // Reads from the given file. Returns false in case of error.
237  // If swap is true, assumes a big/little-endian swap is needed.
238  bool DeSerialize(bool swap, FILE* fp);
239 
240  // Number of raw samples. NOTE(review): presumably before replication - confirm.
241  int32_t num_raw_samples;
242  // Index of the canonical sample.
243  int32_t canonical_sample;
244  // Max distance of the canonical sample from any other sample.
245  float canonical_dist;
246  // Indices of the samples for this font/class, including replicated ones.
247  GenericVector<int32_t> samples;
248 
249  // Non-serialized cache data.
250  // Indexed features of the canonical sample.
251  GenericVector<int> canonical_features;
252  // The mapped features of all the samples (the "cloud": OR of all features).
253  BitVector cloud_features;
254 
255  // Caches for ClusterDistance.
256  // Caches for other fonts but matching this unichar. -1 indicates not set.
257  // Indexed by compact font index from font_id_map_.
258  GenericVector<float> font_distance_cache;
259  // Caches for other unichars but matching this font. -1 indicates not set.
260  GenericVector<float> unichar_distance_cache;
261  // Cache for the rest (neither the font nor the unichar matches.)
262  // A cache of distances computed by ReliablySeparable.
263  GenericVector<FontClassDistance> distance_cache;
264  };
265 
266  PointerVector<TrainingSample> samples_;
267  // Number of samples before replication/randomization.
268  int num_raw_samples_;
269  // Character set we are training for.
270  UNICHARSET unicharset_;
271  // Character set size to which the 2-d arrays below refer.
272  int unicharset_size_;
273  // Map to allow the font_class_array_ below to be compact.
274  // The sparse space is the real font_id, used in samples_.
275  // The compact space is an index to font_class_array_.
276  IndexMapBiDi font_id_map_;
277  // A 2-d array of FontClassInfo holding information related to each
278  // (font_id, class_id) pair.
279  GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
280 
281  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
282  // for font_ids in the samples. Not serialized!
283  const FontInfoTable& fontinfo_table_;
284 };
285 
286 } // namespace tesseract.
287 
288 
289 #endif // TESSERACT_TRAINING_TRAININGSAMPLESET_H_
ScrollView
Definition: scrollview.h:97
tesseract::TrainingSampleSet::extract_sample
TrainingSample * extract_sample(int index)
Definition: trainingsampleset.h:165
tesseract::TrainingSampleSet::AddAllFontsForClass
void AddAllFontsForClass(int class_id, Shape *shape) const
Definition: trainingsampleset.cpp:734
tesseract::TrainingSampleSet::ClusterDistance
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:296
tesseract::Shape
Definition: shapetable.h:184
tesseract::TrainingSampleSet::GetCanonicalFeatures
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
Definition: trainingsampleset.cpp:219
tesseract::TrainingSampleSet::MutableSample
TrainingSample * MutableSample(int font_id, int class_id, int index)
Definition: trainingsampleset.cpp:191
tesseract::TrainingSampleSet::mutable_sample
TrainingSample * mutable_sample(int index)
Definition: trainingsampleset.h:161
tesseract::TrainingSampleSet::GetCanonicalDist
float GetCanonicalDist(int font_id, int class_id) const
Definition: trainingsampleset.cpp:474
tesseract::TrainingSampleSet::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: trainingsampleset.cpp:113
tesseract::TrainingSampleSet::NumClassSamples
int NumClassSamples(int font_id, int class_id, bool randomize) const
Definition: trainingsampleset.cpp:156
STRING
Definition: strngs.h:45
tesseract::UnicharAndFonts
Definition: shapetable.h:159
GENERIC_2D_ARRAY< FontClassInfo >
tesseract::FontInfoTable
Definition: fontinfo.h:146
tesseract::TrainingSampleSet::Serialize
bool Serialize(FILE *fp) const
Definition: trainingsampleset.cpp:80
tesseract::TrainingSampleSet::DeleteDeadSamples
void DeleteDeadSamples()
Definition: trainingsampleset.cpp:497
tesseract::TrainingSampleSet::GlobalSampleIndex
int GlobalSampleIndex(int font_id, int class_id, int index) const
Definition: trainingsampleset.cpp:452
tesseract::TrainingSampleSet::num_samples
int num_samples() const
Definition: trainingsampleset.h:55
tesseract::TrainingSampleSet::GetCloudFeatures
const BitVector & GetCloudFeatures(int font_id, int class_id) const
Definition: trainingsampleset.cpp:211
genericvector.h
tesseract::TrainingSampleSet::TrainingSampleSet
TrainingSampleSet(const FontInfoTable &fontinfo_table)
Definition: trainingsampleset.cpp:70
tesseract::TrainingSampleSet::fontinfo_table
const FontInfoTable & fontinfo_table() const
Definition: trainingsampleset.h:70
tesseract::TrainingSampleSet::~TrainingSampleSet
~TrainingSampleSet()
Definition: trainingsampleset.cpp:75
tesseract::TrainingSampleSet::ComputeCanonicalFeatures
void ComputeCanonicalFeatures()
Definition: trainingsampleset.cpp:694
tesseract::TrainingSampleSet::UnicharDistance
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:230
tesseract::TrainingSampleSet::GetSample
const TrainingSample * GetSample(int index) const
Definition: trainingsampleset.cpp:174
trainingsample.h
shapetable.h
matrix.h
UNICHARSET
Definition: unicharset.h:145
tesseract::TrainingSampleSet::num_raw_samples
int num_raw_samples() const
Definition: trainingsampleset.h:58
tesseract::TrainingSampleSet::OrganizeByFontAndClass
void OrganizeByFontAndClass()
Definition: trainingsampleset.cpp:511
tesseract
Definition: baseapi.h:65
distance
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
tesseract::TrainingSampleSet::ComputeCloudFeatures
void ComputeCloudFeatures(int feature_space_size)
Definition: trainingsampleset.cpp:712
bitvector.h
tesseract::BitVector
Definition: bitvector.h:30
sample
Definition: cluster.h:31
GenericVector< int >
tesseract::IntFeatureSpace
Definition: intfeaturespace.h:38
tesseract::TrainingSampleSet
Definition: trainingsampleset.h:43
tesseract::TrainingSampleSet::DeleteableSample
bool DeleteableSample(const TrainingSample *sample)
Definition: trainingsampleset.cpp:506
tesseract::TrainingSampleSet::SampleToString
STRING SampleToString(const TrainingSample &sample) const
Definition: trainingsampleset.cpp:202
tesseract::TrainingSampleSet::AddSample
int AddSample(const char *unichar, TrainingSample *sample)
Definition: trainingsampleset.cpp:129
tesseract::IndexMapBiDi::SparseSize
int SparseSize() const override
Definition: indexmapbidi.h:142
tesseract::TrainingSample
Definition: trainingsample.h:53
tesseract::IntFeatureMap
Definition: intfeaturemap.h:48
tesseract::TrainingSampleSet::NumFonts
int NumFonts() const
Definition: trainingsampleset.h:61
tesseract::TrainingSampleSet::charsetsize
int charsetsize() const
Definition: trainingsampleset.h:67
tesseract::TrainingSampleSet::IndexFeatures
void IndexFeatures(const IntFeatureSpace &feature_space)
Definition: trainingsampleset.cpp:485
tesseract::TrainingSampleSet::SetupFontIdMap
void SetupFontIdMap()
Definition: trainingsampleset.cpp:548
tesseract::TrainingSampleSet::DisplaySamplesWithFeature
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
Definition: trainingsampleset.cpp:743
ScrollView::Color
Color
Definition: scrollview.h:100
tesseract::TrainingSampleSet::KillSample
void KillSample(TrainingSample *sample)
Definition: trainingsampleset.cpp:492
tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples
void ReplicateAndRandomizeSamples()
Definition: trainingsampleset.cpp:665
tesseract::TrainingSampleSet::unicharset
const UNICHARSET & unicharset() const
Definition: trainingsampleset.h:64
tesseract::TrainingSampleSet::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: trainingsampleset.cpp:94
indexmapbidi.h
tesseract::TrainingSampleSet::ReliablySeparable
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
Definition: trainingsampleset.cpp:413
tesseract::TrainingSampleSet::ComputeClusterDistance
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
Definition: trainingsampleset.cpp:367
tesseract::TrainingSampleSet::GetCanonicalSample
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: trainingsampleset.cpp:462
tesseract::TrainingSampleSet::ComputeCanonicalSamples
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
Definition: trainingsampleset.cpp:568