tesseract  4.0.0-1-g2a2b
trainingsampleset.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #include "trainingsampleset.h"
17 #include "allheaders.h"
18 #include "boxread.h"
19 #include "fontinfo.h"
20 #include "indexmapbidi.h"
21 #include "intfeaturedist.h"
22 #include "intfeaturemap.h"
23 #include "intfeaturespace.h"
24 #include "shapetable.h"
25 #include "trainingsample.h"
26 #include "unicity_table.h"
27 
28 #include <algorithm>
29 
30 namespace tesseract {
31 
32 const int kTestChar = -1; // 37;
33 // Max number of distances to compute the squared way
34 const int kSquareLimit = 25;
35 // Prime numbers for subsampling distances.
36 const int kPrime1 = 17;
37 const int kPrime2 = 13;
38 
39 TrainingSampleSet::FontClassInfo::FontClassInfo()
40  : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
41 }
42 
43 // Writes to the given file. Returns false in case of error.
45  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
46  return false;
47  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
48  return false;
49  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
50  if (!samples.Serialize(fp)) return false;
51  return true;
52 }
53 // Reads from the given file. Returns false in case of error.
54 // If swap is true, assumes a big/little-endian swap is needed.
55 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
56  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
57  return false;
58  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
59  return false;
60  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
61  if (!samples.DeSerialize(swap, fp)) return false;
62  if (swap) {
63  ReverseN(&num_raw_samples, sizeof(num_raw_samples));
64  ReverseN(&canonical_sample, sizeof(canonical_sample));
65  ReverseN(&canonical_dist, sizeof(canonical_dist));
66  }
67  return true;
68 }
69 
70 TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table)
71  : num_raw_samples_(0), unicharset_size_(0),
72  font_class_array_(nullptr), fontinfo_table_(font_table) {
73 }
74 
76  delete font_class_array_;
77 }
78 
79 // Writes to the given file. Returns false in case of error.
80 bool TrainingSampleSet::Serialize(FILE* fp) const {
81  if (!samples_.Serialize(fp)) return false;
82  if (!unicharset_.save_to_file(fp)) return false;
83  if (!font_id_map_.Serialize(fp)) return false;
84  int8_t not_null = font_class_array_ != nullptr;
85  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
86  if (not_null) {
87  if (!font_class_array_->SerializeClasses(fp)) return false;
88  }
89  return true;
90 }
91 
92 // Reads from the given file. Returns false in case of error.
93 // If swap is true, assumes a big/little-endian swap is needed.
94 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
95  if (!samples_.DeSerialize(swap, fp)) return false;
96  num_raw_samples_ = samples_.size();
97  if (!unicharset_.load_from_file(fp)) return false;
98  if (!font_id_map_.DeSerialize(swap, fp)) return false;
99  delete font_class_array_;
100  font_class_array_ = nullptr;
101  int8_t not_null;
102  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
103  if (not_null) {
104  FontClassInfo empty;
105  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
106  if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
107  }
108  unicharset_size_ = unicharset_.size();
109  return true;
110 }
111 
112 // Load an initial unicharset, or set one up if the file cannot be read.
113 void TrainingSampleSet::LoadUnicharset(const char* filename) {
114  if (!unicharset_.load_from_file(filename)) {
115  tprintf("Failed to load unicharset from file %s\n"
116  "Building unicharset from scratch...\n",
117  filename);
118  unicharset_.clear();
119  // Add special characters as they were removed by the clear.
120  UNICHARSET empty;
121  unicharset_.AppendOtherUnicharset(empty);
122  }
123  unicharset_size_ = unicharset_.size();
124 }
125 
126 // Adds a character sample to this sample set.
127 // If the unichar is not already in the local unicharset, it is added.
128 // Returns the unichar_id of the added sample, from the local unicharset.
130  if (!unicharset_.contains_unichar(unichar)) {
131  unicharset_.unichar_insert(unichar);
132  if (unicharset_.size() > MAX_NUM_CLASSES) {
133  tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
134  "greater than MAX_NUM_CLASSES\n");
135  return -1;
136  }
137  }
138  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
139  AddSample(char_id, sample);
140  return char_id;
141 }
142 
143 // Adds a character sample to this sample set with the given unichar_id,
144 // which must correspond to the local unicharset (in this).
146  sample->set_class_id(unichar_id);
147  samples_.push_back(sample);
148  num_raw_samples_ = samples_.size();
149  unicharset_size_ = unicharset_.size();
150 }
151 
152 // Returns the number of samples for the given font,class pair.
153 // If randomize is true, returns the number of samples accessible
154 // with randomizing on. (Increases the number of samples if small.)
155 // OrganizeByFontAndClass must have been already called.
156 int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
157  bool randomize) const {
158  ASSERT_HOST(font_class_array_ != nullptr);
159  if (font_id < 0 || class_id < 0 ||
160  font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
161  // There are no samples because the font or class doesn't exist.
162  return 0;
163  }
164  int font_index = font_id_map_.SparseToCompact(font_id);
165  if (font_index < 0)
166  return 0; // The font has no samples.
167  if (randomize)
168  return (*font_class_array_)(font_index, class_id).samples.size();
169  else
170  return (*font_class_array_)(font_index, class_id).num_raw_samples;
171 }
172 
173 // Gets a sample by its index.
175  return samples_[index];
176 }
177 
178 // Gets a sample by its font, class, index.
179 // OrganizeByFontAndClass must have been already called.
180 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
181  int index) const {
182  ASSERT_HOST(font_class_array_ != nullptr);
183  int font_index = font_id_map_.SparseToCompact(font_id);
184  if (font_index < 0) return nullptr;
185  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
186  return samples_[sample_index];
187 }
188 
189 // Get a sample by its font, class, index. Does not randomize.
190 // OrganizeByFontAndClass must have been already called.
192  int index) {
193  ASSERT_HOST(font_class_array_ != nullptr);
194  int font_index = font_id_map_.SparseToCompact(font_id);
195  if (font_index < 0) return nullptr;
196  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
197  return samples_[sample_index];
198 }
199 
200 // Returns a string debug representation of the given sample:
201 // font, unichar_str, bounding box, page.
203  STRING boxfile_str;
204  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
205  sample.bounding_box(), sample.page_num(), &boxfile_str);
206  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
207 }
208 
209 // Gets the combined set of features used by all the samples of the given
210 // font/class combination.
212  int font_id, int class_id) const {
213  int font_index = font_id_map_.SparseToCompact(font_id);
214  ASSERT_HOST(font_index >= 0);
215  return (*font_class_array_)(font_index, class_id).cloud_features;
216 }
217 // Gets the indexed features of the canonical sample of the given
218 // font/class combination.
220  int font_id, int class_id) const {
221  int font_index = font_id_map_.SparseToCompact(font_id);
222  ASSERT_HOST(font_index >= 0);
223  return (*font_class_array_)(font_index, class_id).canonical_features;
224 }
225 
226 // Returns the distance between the given UniCharAndFonts pair.
227 // If matched_fonts, only matching fonts, are considered, unless that yields
228 // the empty set.
229 // OrganizeByFontAndClass must have been already called.
231  const UnicharAndFonts& uf2,
232  bool matched_fonts,
233  const IntFeatureMap& feature_map) {
234  int num_fonts1 = uf1.font_ids.size();
235  int c1 = uf1.unichar_id;
236  int num_fonts2 = uf2.font_ids.size();
237  int c2 = uf2.unichar_id;
238  double dist_sum = 0.0;
239  int dist_count = 0;
240  const bool debug = false;
241  if (matched_fonts) {
242  // Compute distances only where fonts match.
243  for (int i = 0; i < num_fonts1; ++i) {
244  int f1 = uf1.font_ids[i];
245  for (int j = 0; j < num_fonts2; ++j) {
246  int f2 = uf2.font_ids[j];
247  if (f1 == f2) {
248  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
249  ++dist_count;
250  }
251  }
252  }
253  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
254  // Small enough sets to compute all the distances.
255  for (int i = 0; i < num_fonts1; ++i) {
256  int f1 = uf1.font_ids[i];
257  for (int j = 0; j < num_fonts2; ++j) {
258  int f2 = uf2.font_ids[j];
259  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
260  if (debug) {
261  tprintf("Cluster dist %d %d %d %d = %g\n",
262  f1, c1, f2, c2,
263  ClusterDistance(f1, c1, f2, c2, feature_map));
264  }
265  ++dist_count;
266  }
267  }
268  } else {
269  // Subsample distances, using the largest set once, and stepping through
270  // the smaller set so as to ensure that all the pairs are different.
271  int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
272  int index = 0;
273  int num_samples = std::max(num_fonts1, num_fonts2);
274  for (int i = 0; i < num_samples; ++i, index += increment) {
275  int f1 = uf1.font_ids[i % num_fonts1];
276  int f2 = uf2.font_ids[index % num_fonts2];
277  if (debug) {
278  tprintf("Cluster dist %d %d %d %d = %g\n",
279  f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
280  }
281  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
282  ++dist_count;
283  }
284  }
285  if (dist_count == 0) {
286  if (matched_fonts)
287  return UnicharDistance(uf1, uf2, false, feature_map);
288  return 0.0f;
289  }
290  return dist_sum / dist_count;
291 }
292 
293 // Returns the distance between the given pair of font/class pairs.
294 // Finds in cache or computes and caches.
295 // OrganizeByFontAndClass must have been already called.
296 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
297  int font_id2, int class_id2,
298  const IntFeatureMap& feature_map) {
299  ASSERT_HOST(font_class_array_ != nullptr);
300  int font_index1 = font_id_map_.SparseToCompact(font_id1);
301  int font_index2 = font_id_map_.SparseToCompact(font_id2);
302  if (font_index1 < 0 || font_index2 < 0)
303  return 0.0f;
304  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
305  if (font_id1 == font_id2) {
306  // Special case cache for speed.
307  if (fc_info.unichar_distance_cache.size() == 0)
308  fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
309  if (fc_info.unichar_distance_cache[class_id2] < 0) {
310  // Distance has to be calculated.
311  float result = ComputeClusterDistance(font_id1, class_id1,
312  font_id2, class_id2,
313  feature_map);
314  fc_info.unichar_distance_cache[class_id2] = result;
315  // Copy to the symmetric cache entry.
316  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
317  if (fc_info2.unichar_distance_cache.size() == 0)
318  fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
319  fc_info2.unichar_distance_cache[class_id1] = result;
320  }
321  return fc_info.unichar_distance_cache[class_id2];
322  } else if (class_id1 == class_id2) {
323  // Another special-case cache for equal class-id.
324  if (fc_info.font_distance_cache.size() == 0)
325  fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
326  -1.0f);
327  if (fc_info.font_distance_cache[font_index2] < 0) {
328  // Distance has to be calculated.
329  float result = ComputeClusterDistance(font_id1, class_id1,
330  font_id2, class_id2,
331  feature_map);
332  fc_info.font_distance_cache[font_index2] = result;
333  // Copy to the symmetric cache entry.
334  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
335  if (fc_info2.font_distance_cache.size() == 0)
336  fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
337  -1.0f);
338  fc_info2.font_distance_cache[font_index1] = result;
339  }
340  return fc_info.font_distance_cache[font_index2];
341  }
342  // Both font and class are different. Linear search for class_id2/font_id2
343  // in what is a hopefully short list of distances.
344  int cache_index = 0;
345  while (cache_index < fc_info.distance_cache.size() &&
346  (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
347  fc_info.distance_cache[cache_index].font_id != font_id2))
348  ++cache_index;
349  if (cache_index == fc_info.distance_cache.size()) {
350  // Distance has to be calculated.
351  float result = ComputeClusterDistance(font_id1, class_id1,
352  font_id2, class_id2,
353  feature_map);
354  FontClassDistance fc_dist = { class_id2, font_id2, result };
355  fc_info.distance_cache.push_back(fc_dist);
356  // Copy to the symmetric cache entry. We know it isn't there already, as
357  // we always copy to the symmetric entry.
358  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
359  fc_dist.unichar_id = class_id1;
360  fc_dist.font_id = font_id1;
361  fc_info2.distance_cache.push_back(fc_dist);
362  }
363  return fc_info.distance_cache[cache_index].distance;
364 }
365 
366 // Computes the distance between the given pair of font/class pairs.
368  int font_id1, int class_id1, int font_id2, int class_id2,
369  const IntFeatureMap& feature_map) const {
370  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
371  feature_map, false);
372  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
373  feature_map, false);
374  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
375  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
376  return static_cast<float>(dist) / denominator;
377 }
378 
379 // Helper to add a feature and its near neighbors to the good_features.
380 // levels indicates how many times to compute the offset features of what is
381 // already there. This is done by iteration rather than recursion.
382 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
383  GenericVector<int>* good_features) {
384  int prev_num_features = 0;
385  good_features->push_back(f);
386  int num_features = 1;
387  for (int level = 0; level < levels; ++level) {
388  for (int i = prev_num_features; i < num_features; ++i) {
389  int feature = (*good_features)[i];
390  for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
391  if (dir == 0) continue;
392  int f1 = feature_map.OffsetFeature(feature, dir);
393  if (f1 >= 0) {
394  good_features->push_back(f1);
395  }
396  }
397  }
398  prev_num_features = num_features;
399  num_features = good_features->size();
400  }
401 }
402 
403 // Returns the number of canonical features of font/class 2 for which
404 // neither the feature nor any of its near neighbors occurs in the cloud
405 // of font/class 1. Each such feature is a reliable separation between
406 // the classes, ASSUMING that the canonical sample is sufficiently
407 // representative that every sample has a feature near that particular
408 // feature. To check that this is so on the fly would be prohibitively
409 // expensive, but it might be possible to pre-qualify the canonical features
410 // to include only those for which this assumption is true.
411 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
412 // first, or the results will be nonsense.
413 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
414  int font_id2, int class_id2,
415  const IntFeatureMap& feature_map,
416  bool thorough) const {
417  int result = 0;
418  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
419  if (sample2 == nullptr)
420  return 0; // There are no canonical features.
421  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
422  class_id2);
423  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
424  if (cloud1.size() == 0)
425  return canonical2.size(); // There are no cloud features.
426 
427  // Find a canonical2 feature that is not in cloud1.
428  for (int f = 0; f < canonical2.size(); ++f) {
429  const int feature = canonical2[f];
430  if (cloud1[feature])
431  continue;
432  // Gather the near neighbours of f.
433  GenericVector<int> good_features;
434  AddNearFeatures(feature_map, feature, 1, &good_features);
435  // Check that none of the good_features are in the cloud.
436  int i;
437  for (i = 0; i < good_features.size(); ++i) {
438  int good_f = good_features[i];
439  if (cloud1[good_f]) {
440  break;
441  }
442  }
443  if (i < good_features.size())
444  continue; // Found one in the cloud.
445  ++result;
446  }
447  return result;
448 }
449 
450 // Returns the total index of the requested sample.
451 // OrganizeByFontAndClass must have been already called.
452 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
453  int index) const {
454  ASSERT_HOST(font_class_array_ != nullptr);
455  int font_index = font_id_map_.SparseToCompact(font_id);
456  if (font_index < 0) return -1;
457  return (*font_class_array_)(font_index, class_id).samples[index];
458 }
459 
460 // Gets the canonical sample for the given font, class pair.
461 // ComputeCanonicalSamples must have been called first.
463  int font_id, int class_id) const {
464  ASSERT_HOST(font_class_array_ != nullptr);
465  int font_index = font_id_map_.SparseToCompact(font_id);
466  if (font_index < 0) return nullptr;
467  const int sample_index = (*font_class_array_)(font_index,
468  class_id).canonical_sample;
469  return sample_index >= 0 ? samples_[sample_index] : nullptr;
470 }
471 
472 // Gets the max distance for the given canonical sample.
473 // ComputeCanonicalSamples must have been called first.
474 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
475  ASSERT_HOST(font_class_array_ != nullptr);
476  int font_index = font_id_map_.SparseToCompact(font_id);
477  if (font_index < 0) return 0.0f;
478  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
479  return (*font_class_array_)(font_index, class_id).canonical_dist;
480  else
481  return 0.0f;
482 }
483 
484 // Generates indexed features for all samples with the supplied feature_space.
486  for (int s = 0; s < samples_.size(); ++s)
487  samples_[s]->IndexFeatures(feature_space);
488 }
489 
490 // Marks the given sample index for deletion.
491 // Deletion is actually completed by DeleteDeadSamples.
493  sample->set_sample_index(-1);
494 }
495 
496 // Deletes all samples with zero features marked by KillSample.
498  samples_.compact(
500  num_raw_samples_ = samples_.size();
501  // Samples must be re-organized now we have deleted a few.
502 }
503 
504 // Callback function returns true if the given sample is to be deleted, due
505 // to having a negative classid.
507  return sample == nullptr || sample->class_id() < 0;
508 }
509 
510 // Construct an array to access the samples by font,class pair.
512  // Font indexes are sparse, so we used a map to compact them, so we can
513  // have an efficient 2-d array of fonts and character classes.
514  SetupFontIdMap();
515  int compact_font_size = font_id_map_.CompactSize();
516  // Get a 2-d array of generic vectors.
517  delete font_class_array_;
518  FontClassInfo empty;
519  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
520  compact_font_size, unicharset_size_, empty);
521  for (int s = 0; s < samples_.size(); ++s) {
522  int font_id = samples_[s]->font_id();
523  int class_id = samples_[s]->class_id();
524  if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
525  tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
526  font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
527  s);
528  }
529  ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
530  ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
531  int font_index = font_id_map_.SparseToCompact(font_id);
532  (*font_class_array_)(font_index, class_id).samples.push_back(s);
533  }
534  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
535  // between the raw samples and the replicated ones.
536  for (int f = 0; f < compact_font_size; ++f) {
537  for (int c = 0; c < unicharset_size_; ++c)
538  (*font_class_array_)(f, c).num_raw_samples =
539  (*font_class_array_)(f, c).samples.size();
540  }
541  // This is the global number of samples and also marks the boundary between
542  // real and replicated samples.
543  num_raw_samples_ = samples_.size();
544 }
545 
546 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
547 // index for the font_class_array_.
549  // Number of samples for each font_id.
550  GenericVector<int> font_counts;
551  for (int s = 0; s < samples_.size(); ++s) {
552  const int font_id = samples_[s]->font_id();
553  while (font_id >= font_counts.size())
554  font_counts.push_back(0);
555  ++font_counts[font_id];
556  }
557  font_id_map_.Init(font_counts.size(), false);
558  for (int f = 0; f < font_counts.size(); ++f) {
559  font_id_map_.SetMap(f, font_counts[f] > 0);
560  }
561  font_id_map_.Setup();
562 }
563 
564 
565 // Finds the sample for each font, class pair that has least maximum
566 // distance to all the other samples of the same font, class.
567 // OrganizeByFontAndClass must have been already called.
569  bool debug) {
570  ASSERT_HOST(font_class_array_ != nullptr);
571  IntFeatureDist f_table;
572  if (debug) tprintf("feature table size %d\n", map.sparse_size());
573  f_table.Init(&map);
574  int worst_s1 = 0;
575  int worst_s2 = 0;
576  double global_worst_dist = 0.0;
577  // Compute distances independently for each font and char index.
578  int font_size = font_id_map_.CompactSize();
579  for (int font_index = 0; font_index < font_size; ++font_index) {
580  int font_id = font_id_map_.CompactToSparse(font_index);
581  for (int c = 0; c < unicharset_size_; ++c) {
582  int samples_found = 0;
583  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
584  if (fcinfo.samples.size() == 0 ||
585  (kTestChar >= 0 && c != kTestChar)) {
586  fcinfo.canonical_sample = -1;
587  fcinfo.canonical_dist = 0.0f;
588  if (debug) tprintf("Skipping class %d\n", c);
589  continue;
590  }
591  // The canonical sample will be the one with the min_max_dist, which
592  // is the sample with the lowest maximum distance to all other samples.
593  double min_max_dist = 2.0;
594  // We keep track of the farthest apart pair (max_s1, max_s2) which
595  // are max_max_dist apart, so we can see how bad the variability is.
596  double max_max_dist = 0.0;
597  int max_s1 = 0;
598  int max_s2 = 0;
599  fcinfo.canonical_sample = fcinfo.samples[0];
600  fcinfo.canonical_dist = 0.0f;
601  for (int i = 0; i < fcinfo.samples.size(); ++i) {
602  int s1 = fcinfo.samples[i];
603  const GenericVector<int>& features1 = samples_[s1]->indexed_features();
604  f_table.Set(features1, features1.size(), true);
605  double max_dist = 0.0;
606  // Run the full squared-order search for similar samples. It is still
607  // reasonably fast because f_table.FeatureDistance is fast, but we
608  // may have to reconsider if we start playing with too many samples
609  // of a single char/font.
610  for (int j = 0; j < fcinfo.samples.size(); ++j) {
611  int s2 = fcinfo.samples[j];
612  if (samples_[s2]->class_id() != c ||
613  samples_[s2]->font_id() != font_id ||
614  s2 == s1)
615  continue;
616  GenericVector<int> features2 = samples_[s2]->indexed_features();
617  double dist = f_table.FeatureDistance(features2);
618  if (dist > max_dist) {
619  max_dist = dist;
620  if (dist > max_max_dist) {
621  max_s1 = s1;
622  max_s2 = s2;
623  }
624  }
625  }
626  // Using Set(..., false) is far faster than re initializing, due to
627  // the sparseness of the feature space.
628  f_table.Set(features1, features1.size(), false);
629  samples_[s1]->set_max_dist(max_dist);
630  ++samples_found;
631  if (max_dist < min_max_dist) {
632  fcinfo.canonical_sample = s1;
633  fcinfo.canonical_dist = max_dist;
634  }
635  UpdateRange(max_dist, &min_max_dist, &max_max_dist);
636  }
637  if (max_max_dist > global_worst_dist) {
638  // Keep a record of the worst pair over all characters/fonts too.
639  global_worst_dist = max_max_dist;
640  worst_s1 = max_s1;
641  worst_s2 = max_s2;
642  }
643  if (debug) {
644  tprintf("Found %d samples of class %d=%s, font %d, "
645  "dist range [%g, %g], worst pair= %s, %s\n",
646  samples_found, c, unicharset_.debug_str(c).string(),
647  font_index, min_max_dist, max_max_dist,
648  SampleToString(*samples_[max_s1]).string(),
649  SampleToString(*samples_[max_s2]).string());
650  }
651  }
652  }
653  if (debug) {
654  tprintf("Global worst dist = %g, between sample %d and %d\n",
655  global_worst_dist, worst_s1, worst_s2);
656  }
657 }
658 
659 // Replicates the samples to a minimum frequency defined by
660 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
661 // After replication, the replicated samples are perturbed slightly, but
662 // in a predictable and repeatable way.
663 // Use after OrganizeByFontAndClass().
665  ASSERT_HOST(font_class_array_ != nullptr);
666  int font_size = font_id_map_.CompactSize();
667  for (int font_index = 0; font_index < font_size; ++font_index) {
668  for (int c = 0; c < unicharset_size_; ++c) {
669  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
670  int sample_count = fcinfo.samples.size();
671  int min_samples = 2 * std::max(kSampleRandomSize, sample_count);
672  if (sample_count > 0 && sample_count < min_samples) {
673  int base_count = sample_count;
674  for (int base_index = 0; sample_count < min_samples; ++sample_count) {
675  int src_index = fcinfo.samples[base_index++];
676  if (base_index >= base_count) base_index = 0;
677  TrainingSample* sample = samples_[src_index]->RandomizedCopy(
678  sample_count % kSampleRandomSize);
679  int sample_index = samples_.size();
680  sample->set_sample_index(sample_index);
681  samples_.push_back(sample);
682  fcinfo.samples.push_back(sample_index);
683  }
684  }
685  }
686  }
687 }
688 
689 // Caches the indexed features of the canonical samples.
690 // ComputeCanonicalSamples must have been already called.
691 // TODO(rays) see note on ReliablySeparable and try restricting the
692 // canonical features to those that truly represent all samples.
694  ASSERT_HOST(font_class_array_ != nullptr);
695  const int font_size = font_id_map_.CompactSize();
696  for (int font_index = 0; font_index < font_size; ++font_index) {
697  const int font_id = font_id_map_.CompactToSparse(font_index);
698  for (int c = 0; c < unicharset_size_; ++c) {
699  int num_samples = NumClassSamples(font_id, c, false);
700  if (num_samples == 0)
701  continue;
702  const TrainingSample* sample = GetCanonicalSample(font_id, c);
703  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
704  fcinfo.canonical_features = sample->indexed_features();
705  }
706  }
707 }
708 
709 // Computes the combined set of features used by all the samples of each
710 // font/class combination. Use after ReplicateAndRandomizeSamples.
711 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
712  ASSERT_HOST(font_class_array_ != nullptr);
713  int font_size = font_id_map_.CompactSize();
714  for (int font_index = 0; font_index < font_size; ++font_index) {
715  int font_id = font_id_map_.CompactToSparse(font_index);
716  for (int c = 0; c < unicharset_size_; ++c) {
717  int num_samples = NumClassSamples(font_id, c, false);
718  if (num_samples == 0)
719  continue;
720  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
721  fcinfo.cloud_features.Init(feature_space_size);
722  for (int s = 0; s < num_samples; ++s) {
723  const TrainingSample* sample = GetSample(font_id, c, s);
724  const GenericVector<int>& sample_features = sample->indexed_features();
725  for (int i = 0; i < sample_features.size(); ++i)
726  fcinfo.cloud_features.SetBit(sample_features[i]);
727  }
728  }
729  }
730 }
731 
732 // Adds all fonts of the given class to the shape.
733 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
734  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
735  const int font_id = font_id_map_.CompactToSparse(f);
736  shape->AddToShape(class_id, font_id);
737  }
738 }
739 
740 // Display the samples with the given indexed feature that also match
741 // the given shape.
743  const Shape& shape,
744  const IntFeatureSpace& space,
745  ScrollView::Color color,
746  ScrollView* window) const {
747  for (int s = 0; s < num_raw_samples(); ++s) {
748  const TrainingSample* sample = GetSample(s);
749  if (shape.ContainsUnichar(sample->class_id())) {
750  GenericVector<int> indexed_features;
751  space.IndexAndSortFeatures(sample->features(), sample->num_features(),
752  &indexed_features);
753  for (int f = 0; f < indexed_features.size(); ++f) {
754  if (indexed_features[f] == f_index) {
755  sample->DisplayFeatures(color, window);
756  }
757  }
758  }
759  }
760 }
761 
762 
763 } // namespace tesseract.
const int kSquareLimit
void KillSample(TrainingSample *sample)
int UNICHAR_ID
Definition: unichar.h:35
void LoadUnicharset(const char *filename)
const int kPrime2
int size() const
Definition: genericvector.h:71
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
Definition: cluster.h:32
void SetMap(int sparse_index, bool mapped)
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:463
void ComputeCloudFeatures(int feature_space_size)
const char * string() const
Definition: strngs.cpp:196
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
float GetCanonicalDist(int font_id, int class_id) const
int size() const
Definition: bitvector.h:56
virtual int SparseSize() const
Definition: indexmapbidi.h:142
int AddSample(const char *unichar, TrainingSample *sample)
int size() const
Definition: unicharset.h:336
bool Serialize(FILE *fp) const
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:178
void AddAllFontsForClass(int class_id, Shape *shape) const
TrainingSample * MutableSample(int font_id, int class_id, int index)
T & get(int index) const
bool DeSerialize(bool swap, FILE *fp)
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
int GlobalSampleIndex(int font_id, int class_id, int index) const
bool DeleteableSample(const TrainingSample *sample)
const int kTestChar
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
bool DeSerialize(bool swap, FILE *fp)
int NumClassSamples(int font_id, int class_id, bool randomize) const
void IndexFeatures(const IntFeatureSpace &feature_space)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int push_back(T object)
const TrainingSample * GetSample(int index) const
Definition: strngs.h:45
bool Serialize(FILE *fp) const
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
void clear()
Definition: unicharset.h:301
STRING SampleToString(const TrainingSample &sample) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
double FeatureDistance(const GenericVector< int > &features) const
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
Definition: boxread.cpp:233
bool SerializeClasses(FILE *fp) const
Definition: matrix.h:182
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: matrix.h:195
const int kPrime1
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, GenericVector< int > *sorted_features) const
void Init(const IntFeatureMap *feature_map)
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27
GenericVector< int32_t > font_ids
Definition: shapetable.h:175
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
int CompactSize() const
Definition: indexmapbidi.h:61
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
void Init(int size, bool all_mapped)
int OffsetFeature(int index_feature, int dir) const
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:121
#define ASSERT_HOST(x)
Definition: errcode.h:84