tesseract  5.0.0-alpha-619-ge9db
mastertrainer.cpp
Go to the documentation of this file.
1 // File: mastertrainer.cpp
3 // Description: Trainer to build the MasterClassifier.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2010, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "mastertrainer.h"
25 #include <cmath>
26 #include <ctime>
27 #include "allheaders.h"
28 #include "boxread.h"
29 #include "classify.h"
30 #include "errorcounter.h"
31 #include "featdefs.h"
32 #include "sampleiterator.h"
33 #include "shapeclassifier.h"
34 #include "shapetable.h"
35 #include "svmnode.h"
36 
37 #include "scanutils.h"
38 
39 namespace tesseract {
40 
// Tuning knobs for shape clustering. When kMinClusteredShapes is small and
// kMaxUnicharsPerCluster is large, kFontMergeDistance is the only thing that
// limits how far merging goes.
// Lower bound on the number of shapes left in the output.
constexpr int kMinClusteredShapes = 1;
// Upper bound on the number of unichars in any single cluster.
constexpr int kMaxUnicharsPerCluster = 2000;
// Fonts and unichars are merged while their mean distance is below this.
constexpr float kFontMergeDistance = 0.025;
49 
// NOTE(review): listing line 50 is absent from this extract, so the parameter
// list below starts mid-declaration; the `norm_mode` parameter used in the
// initializer list is presumably declared on that missing line - confirm
// against the full source.
// The initializer list hands fontinfo_table_ to all three sample sets, starts
// with charsetsize_ of 0, no fragments_ array and no previous unichar (-1),
// and copies the three arguments into the matching members.
51  bool shape_analysis,
52  bool replicate_samples,
53  int debug_level)
54  : norm_mode_(norm_mode), samples_(fontinfo_table_),
55  junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
56  charsetsize_(0),
57  enable_shape_analysis_(shape_analysis),
58  enable_replication_(replicate_samples),
59  fragments_(nullptr), prev_unichar_id_(-1), debug_level_(debug_level) {
60 }
61 
// NOTE(review): listing line 62 (presumably the destructor's opening line) is
// absent from this extract - confirm against the full source.
// Releases the fragments_ array and destroys every image in page_images_.
63  delete [] fragments_;
64  for (int p = 0; p < page_images_.size(); ++p)
65  pixDestroy(&page_images_[p]);
66 }
67 
68 // WARNING! Serialize/DeSerialize are only partial, providing
69 // enough data to get the samples back and display them.
70 // Writes to the given file. Returns false in case of error.
71 bool MasterTrainer::Serialize(FILE* fp) const {
72  uint32_t value = norm_mode_;
73  if (!tesseract::Serialize(fp, &value)) return false;
74  if (!unicharset_.save_to_file(fp)) return false;
75  if (!feature_space_.Serialize(fp)) return false;
76  if (!samples_.Serialize(fp)) return false;
77  if (!junk_samples_.Serialize(fp)) return false;
78  if (!verify_samples_.Serialize(fp)) return false;
79  if (!master_shapes_.Serialize(fp)) return false;
80  if (!flat_shapes_.Serialize(fp)) return false;
81  if (!fontinfo_table_.Serialize(fp)) return false;
82  if (!xheights_.Serialize(fp)) return false;
83  return true;
84 }
85 
86 // Load an initial unicharset, or set one up if the file cannot be read.
87 void MasterTrainer::LoadUnicharset(const char* filename) {
88  if (!unicharset_.load_from_file(filename)) {
89  tprintf("Failed to load unicharset from file %s\n"
90  "Building unicharset for training from scratch...\n",
91  filename);
92  unicharset_.clear();
93  UNICHARSET initialized;
94  // Add special characters, as they were removed by the clear, but the
95  // default constructor puts them in.
96  unicharset_.AppendOtherUnicharset(initialized);
97  }
98  charsetsize_ = unicharset_.size();
99  delete [] fragments_;
100  fragments_ = new int[charsetsize_];
101  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
102  samples_.LoadUnicharset(filename);
103  junk_samples_.LoadUnicharset(filename);
104  verify_samples_.LoadUnicharset(filename);
105 }
106 
107 // Reads the samples and their features from the given .tr format file,
108 // adding them to the trainer with the font_id from the content of the file.
109 // See mftraining.cpp for a description of the file format.
110 // If verification, then these are verification samples, not training.
// If the file cannot be opened, a message is logged and nothing is added.
// Header lines that cannot be parsed are logged and skipped.
// NOTE(review): listing lines 112 and 117 are absent from this extract; the
// feature_defs parameter and the tail of the micro_feature_type initializer
// presumably live there - confirm against the full source.
111 void MasterTrainer::ReadTrainingSamples(const char* page_name,
113  bool verification) {
114  char buffer[2048];
115  const int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
116  const int micro_feature_type = ShortNameToFeatureType(feature_defs,
118  const int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
119  const int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
120 
121  FILE* fp = fopen(page_name, "rb");
122  if (fp == nullptr) {
123  tprintf("Failed to open tr file: %s\n", page_name);
124  return;
125  }
126  tr_filenames_.push_back(STRING(page_name));
127  while (fgets(buffer, sizeof(buffer), fp) != nullptr) {
128  if (buffer[0] == '\n')
129  continue;
130 
// The text before the first space is used as the font name; everything after
// it is handed to ParseBoxFileStr.
131  char* space = strchr(buffer, ' ');
132  if (space == nullptr) {
133  tprintf("Bad format in tr file, reading fontname, unichar\n");
134  continue;
135  }
136  *space++ = '\0';
137  int font_id = GetFontInfoId(buffer);
// A font name that is not in the table falls back to font id 0.
138  if (font_id < 0) font_id = 0;
139  int page_number;
140  STRING unichar;
141  TBOX bounding_box;
142  if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
143  tprintf("Bad format in tr file, reading box coords\n");
144  continue;
145  }
146  CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
147  auto* sample = new TrainingSample;
148  sample->set_font_id(font_id);
// Page numbers are offset by the number of page images already loaded.
149  sample->set_page_num(page_number + page_images_.size());
150  sample->set_bounding_box(bounding_box);
151  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
152  cn_feature_type, geo_feature_type, char_desc);
153  AddSample(verification, unichar.c_str(), sample);
154  FreeCharDescription(char_desc);
155  }
// Refresh the cached charset size once the whole file has been read.
156  charsetsize_ = unicharset_.size();
157  fclose(fp);
158 }
159 
160 // Adds the given single sample to the trainer, setting the classid
161 // appropriately from the given unichar_str.
// Verification samples go to verify_samples_; samples whose unichar is in
// unicharset_ go to samples_ (and get a flat_shapes_ entry for their
// unichar/font pair if there is none yet); everything else goes to
// junk_samples_.
// NOTE(review): listing lines 163 and 176 are absent from this extract; the
// `sample` parameter and the declaration of `frag` presumably live there -
// confirm against the full source.
162 void MasterTrainer::AddSample(bool verification, const char* unichar,
164  if (verification) {
165  verify_samples_.AddSample(unichar, sample);
166  prev_unichar_id_ = -1;
167  } else if (unicharset_.contains_unichar(unichar)) {
// The previous regular sample was followed by another regular sample rather
// than by junk, so its fragments_ entry is set to -1.
168  if (prev_unichar_id_ >= 0)
169  fragments_[prev_unichar_id_] = -1;
170  prev_unichar_id_ = samples_.AddSample(unichar, sample);
171  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
172  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
173  } else {
174  const int junk_id = junk_samples_.AddSample(unichar, sample);
175  if (prev_unichar_id_ >= 0) {
// For a natural fragment: an entry that is still 0 records this junk id; an
// entry holding a different junk id is set to -1.
177  if (frag != nullptr && frag->is_natural()) {
178  if (fragments_[prev_unichar_id_] == 0)
179  fragments_[prev_unichar_id_] = junk_id;
180  else if (fragments_[prev_unichar_id_] != junk_id)
181  fragments_[prev_unichar_id_] = -1;
182  }
183  delete frag;
184  }
185  prev_unichar_id_ = -1;
186  }
187 }
188 
189 // Loads all pages from the given tif filename and append to page_images_.
190 // Must be called after ReadTrainingSamples, as the current number of images
191 // is used as an offset for page numbers in the samples.
192 void MasterTrainer::LoadPageImages(const char* filename) {
193  size_t offset = 0;
194  int page;
195  Pix* pix;
196  for (page = 0;; page++) {
197  pix = pixReadFromMultipageTiff(filename, &offset);
198  if (!pix) break;
199  page_images_.push_back(pix);
200  if (!offset) break;
201  }
202  tprintf("Loaded %d page images from %s\n", page, filename);
203 }
204 
205 // Cleans up the samples after initial load from the tr files, and prior to
206 // saving the MasterTrainer:
207 // Remaps fragmented chars if running shape analysis.
208 // Sets up the samples appropriately for class/fontwise access.
209 // Deletes outlier samples.
// NOTE(review): outlier deletion is currently commented out in the body (see
// the TODO below), so the last bullet above does not happen at present.
// NOTE(review): listing line 210 (presumably the function's opening line) is
// absent from this extract - confirm against the full source.
211  if (debug_level_ > 0)
212  tprintf("PostLoadCleanup...\n");
213  if (enable_shape_analysis_)
214  ReplaceFragmentedSamples();
// The verification samples are normalized through an iterator and then
// organized; they are not feature-indexed here.
215  SampleIterator sample_it;
216  sample_it.Init(nullptr, nullptr, true, &verify_samples_);
217  sample_it.NormalizeSamples();
218  verify_samples_.OrganizeByFontAndClass();
219 
220  samples_.IndexFeatures(feature_space_);
221  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
222  // against current training.
223  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
224  samples_.OrganizeByFontAndClass();
225  if (debug_level_ > 0)
226  tprintf("ComputeCanonicalSamples...\n");
227  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
228 }
229 
230 // Gets the samples ready for training. Use after both
231 // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
232 // Re-indexes the features and computes canonical and cloud features.
// Progress messages are printed only when debug_level_ is positive.
// NOTE(review): listing line 233 (presumably the function's opening line) is
// absent from this extract - confirm against the full source.
234  if (debug_level_ > 0)
235  tprintf("PreTrainingSetup...\n");
236  samples_.IndexFeatures(feature_space_);
237  samples_.ComputeCanonicalFeatures();
238  if (debug_level_ > 0)
239  tprintf("ComputeCloudFeatures...\n");
// The cloud features are sized by the number of features in feature_space_.
240  samples_.ComputeCloudFeatures(feature_space_.Size());
241 }
242 
243 // Sets up the master_shapes_ table, which tells which fonts should stay
244 // together until they get to a leaf node classifier.
// For every class a per-font shape table is built and clustered; the result
// is appended to one of three tables depending on whether the class is a
// beginning fragment, an ending fragment, or anything else.
// NOTE(review): listing lines 245, 271, 274 and 277 are absent from this
// extract. Line 245 is presumably the function's opening line, and the other
// three presumably start the calls whose trailing arguments appear on listing
// lines 272, 275 and 278 - confirm against the full source.
246  tprintf("Building master shape table\n");
247  const int num_fonts = samples_.NumFonts();
248 
249  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
250  ShapeTable char_shapes_end_fragment(samples_.unicharset());
251  ShapeTable char_shapes(samples_.unicharset());
252  for (int c = 0; c < samples_.charsetsize(); ++c) {
253  ShapeTable shapes(samples_.unicharset());
// One shape per font that has at least one sample of class c.
254  for (int f = 0; f < num_fonts; ++f) {
255  if (samples_.NumClassSamples(f, c, true) > 0)
256  shapes.AddShape(c, f);
257  }
258  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
259 
260  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
261 
// Non-fragments and fragments that are neither beginning nor ending both end
// up in char_shapes.
262  if (fragment == nullptr)
263  char_shapes.AppendMasterShapes(shapes, nullptr);
264  else if (fragment->is_beginning())
265  char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);
266  else if (fragment->is_ending())
267  char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);
268  else
269  char_shapes.AppendMasterShapes(shapes, nullptr);
270  }
272  kFontMergeDistance, &char_shapes_begin_fragment);
273  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);
275  kFontMergeDistance, &char_shapes_end_fragment);
276  char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);
278  kFontMergeDistance, &char_shapes);
279  master_shapes_.AppendMasterShapes(char_shapes, nullptr);
280  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().c_str());
281 }
282 
283 // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
284 // fragments and n-grams (all incorrectly segmented characters).
285 // Various training functions may result in incorrectly segmented characters
286 // being added to the unicharset of the main samples, perhaps because they
287 // form a "radical" decomposition of some (Indic) grapheme, or because they
288 // just look the same as a real character (like rn/m)
289 // This function moves all the junk samples, to the main samples_ set, but
290 // desirable junk, being any sample for which the unichar already exists in
291 // the samples_ unicharset gets the unichar-ids re-indexed to match, but
292 // anything else gets re-marked as unichar_id 0 (space character) to identify
293 // it as junk to the error counter.
// NOTE(review): listing line 294 (presumably the function's opening line) is
// absent from this extract - confirm against the full source.
295  // Get ids of fragments in junk_samples_ that replace the dead chars.
296  const UNICHARSET& junk_set = junk_samples_.unicharset();
297  const UNICHARSET& sample_set = samples_.unicharset();
298  int num_junks = junk_samples_.num_samples();
299  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
300  for (int s = 0; s < num_junks; ++s) {
301  TrainingSample* sample = junk_samples_.mutable_sample(s);
// Translate the class id through the UTF-8 string, because the junk and main
// sample sets each have their own unicharset.
302  int junk_id = sample->class_id();
303  const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
304  int sample_id = sample_set.unichar_to_id(junk_utf8);
305  if (sample_id == INVALID_UNICHAR_ID)
306  sample_id = 0;
307  sample->set_class_id(sample_id);
308  junk_samples_.extract_sample(s);
309  samples_.AddSample(sample_id, sample);
310  }
311  junk_samples_.DeleteDeadSamples();
312  samples_.OrganizeByFontAndClass();
313 }
314 
315 // Replicates the samples and perturbs them if the enable_replication_ flag
316 // is set. MUST be used after the last call to OrganizeByFontAndClass on
317 // the training samples, ie after IncludeJunk if it is going to be used, as
318 // OrganizeByFontAndClass will eat the replicated samples into the regular
319 // samples.
// Does nothing when enable_replication_ is false.
// NOTE(review): listing line 320 (presumably the function's opening line) is
// absent from this extract - confirm against the full source.
321  if (enable_replication_) {
322  if (debug_level_ > 0)
323  tprintf("ReplicateAndRandomize...\n");
324  verify_samples_.ReplicateAndRandomizeSamples();
325  samples_.ReplicateAndRandomizeSamples();
// Only the training samples are re-indexed after replication.
326  samples_.IndexFeatures(feature_space_);
327  }
328 }
329 
330 // Loads the basic font properties file into fontinfo_table_.
331 // Returns false on failure.
332 bool MasterTrainer::LoadFontInfo(const char* filename) {
333  FILE* fp = fopen(filename, "rb");
334  if (fp == nullptr) {
335  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
336  return false;
337  }
338  int italic, bold, fixed, serif, fraktur;
339  while (!feof(fp)) {
340  FontInfo fontinfo;
341  char* font_name = new char[1024];
342  fontinfo.name = font_name;
343  fontinfo.properties = 0;
344  fontinfo.universal_id = 0;
345  if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
346  &fixed, &serif, &fraktur) != 6) {
347  delete[] font_name;
348  continue;
349  }
350  fontinfo.properties =
351  (italic << 0) +
352  (bold << 1) +
353  (fixed << 2) +
354  (serif << 3) +
355  (fraktur << 4);
356  if (!fontinfo_table_.contains(fontinfo)) {
357  fontinfo_table_.push_back(fontinfo);
358  } else {
359  delete[] font_name;
360  }
361  }
362  fclose(fp);
363  return true;
364 }
365 
366 // Loads the xheight font properties file into xheights_.
367 // Returns false on failure.
368 bool MasterTrainer::LoadXHeights(const char* filename) {
369  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
370  xheights_.init_to_size(fontinfo_table_.size(), -1);
371  if (filename == nullptr) return true;
372  FILE *f = fopen(filename, "rb");
373  if (f == nullptr) {
374  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
375  return false;
376  }
377  tprintf("Reading x-heights from %s ...\n", filename);
378  FontInfo fontinfo;
379  fontinfo.properties = 0; // Not used to lookup in the table.
380  fontinfo.universal_id = 0;
381  char buffer[1024];
382  int xht;
383  int total_xheight = 0;
384  int xheight_count = 0;
385  while (!feof(f)) {
386  if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2)
387  continue;
388  buffer[1023] = '\0';
389  fontinfo.name = buffer;
390  if (!fontinfo_table_.contains(fontinfo)) continue;
391  int fontinfo_id = fontinfo_table_.get_index(fontinfo);
392  xheights_[fontinfo_id] = xht;
393  total_xheight += xht;
394  ++xheight_count;
395  }
396  if (xheight_count == 0) {
397  fprintf(stderr, "No valid xheights in %s!\n", filename);
398  fclose(f);
399  return false;
400  }
401  int mean_xheight = DivRounded(total_xheight, xheight_count);
402  for (int i = 0; i < fontinfo_table_.size(); ++i) {
403  if (xheights_[i] < 0)
404  xheights_[i] = mean_xheight;
405  }
406  fclose(f);
407  return true;
408 } // LoadXHeights
409 
410 // Reads spacing stats from filename and adds them to fontinfo_table.
411 bool MasterTrainer::AddSpacingInfo(const char *filename) {
412  FILE* fontinfo_file = fopen(filename, "rb");
413  if (fontinfo_file == nullptr)
414  return true; // We silently ignore missing files!
415  // Find the fontinfo_id.
416  int fontinfo_id = GetBestMatchingFontInfoId(filename);
417  if (fontinfo_id < 0) {
418  tprintf("No font found matching fontinfo filename %s\n", filename);
419  fclose(fontinfo_file);
420  return false;
421  }
422  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
423  // TODO(rays) scale should probably be a double, but keep as an int for now
424  // to duplicate current behavior.
425  int scale = kBlnXHeight / xheights_[fontinfo_id];
426  int num_unichars;
427  char uch[UNICHAR_LEN];
428  char kerned_uch[UNICHAR_LEN];
429  int x_gap, x_gap_before, x_gap_after, num_kerned;
430  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
431  FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
432  fi->init_spacing(unicharset_.size());
433  FontSpacingInfo *spacing = nullptr;
434  for (int l = 0; l < num_unichars; ++l) {
435  if (tfscanf(fontinfo_file, "%s %d %d %d",
436  uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
437  tprintf("Bad format of font spacing file %s\n", filename);
438  fclose(fontinfo_file);
439  return false;
440  }
441  bool valid = unicharset_.contains_unichar(uch);
442  if (valid) {
443  spacing = new FontSpacingInfo();
444  spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);
445  spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);
446  }
447  for (int k = 0; k < num_kerned; ++k) {
448  if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
449  tprintf("Bad format of font spacing file %s\n", filename);
450  fclose(fontinfo_file);
451  delete spacing;
452  return false;
453  }
454  if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
455  spacing->kerned_unichar_ids.push_back(
456  unicharset_.unichar_to_id(kerned_uch));
457  spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));
458  }
459  if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
460  }
461  fclose(fontinfo_file);
462  return true;
463 }
464 
465 // Returns the font id corresponding to the given font name.
466 // Returns -1 if the font cannot be found.
467 int MasterTrainer::GetFontInfoId(const char* font_name) {
468  FontInfo fontinfo;
469  // We are only borrowing the string, so it is OK to const cast it.
470  fontinfo.name = const_cast<char*>(font_name);
471  fontinfo.properties = 0; // Not used to lookup in the table
472  fontinfo.universal_id = 0;
473  return fontinfo_table_.get_index(fontinfo);
474 }
475 // Returns the font_id of the closest matching font name to the given
476 // filename. It is assumed that a substring of the filename will match
477 // one of the fonts. If more than one is matched, the longest is returned.
478 int MasterTrainer::GetBestMatchingFontInfoId(const char* filename) {
479  int fontinfo_id = -1;
480  int best_len = 0;
481  for (int f = 0; f < fontinfo_table_.size(); ++f) {
482  if (strstr(filename, fontinfo_table_.get(f).name) != nullptr) {
483  int len = strlen(fontinfo_table_.get(f).name);
484  // Use the longest matching length in case a substring of a font matched.
485  if (len > best_len) {
486  best_len = len;
487  fontinfo_id = f;
488  }
489  }
490  }
491  return fontinfo_id;
492 }
493 
494 // Sets up a flat shapetable with one shape per class/font combination.
// The shapes are copied from flat_shapes_ into shape_table, grouped by font
// in the order the fonts first appear, and in reverse shape order within each
// font.
// NOTE(review): listing line 495 (presumably the function's opening line,
// declaring shape_table) is absent from this extract - confirm against the
// full source.
496  // To exactly mimic the results of the previous implementation, the shapes
497  // must be clustered in order the fonts arrived, and reverse order of the
498  // characters within each font.
499  // Get a list of the fonts in the order they appeared.
500  GenericVector<int> active_fonts;
501  int num_shapes = flat_shapes_.NumShapes();
502  for (int s = 0; s < num_shapes; ++s) {
// Only the first font id of the first entry of each shape is looked at.
503  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
504  int f = 0;
// Linear search; the font is appended only if it has not been seen before.
505  for (f = 0; f < active_fonts.size(); ++f) {
506  if (active_fonts[f] == font)
507  break;
508  }
509  if (f == active_fonts.size())
510  active_fonts.push_back(font);
511  }
512  // For each font in order, add all the shapes with that font in reverse order.
513  int num_fonts = active_fonts.size();
514  for (int f = 0; f < num_fonts; ++f) {
515  for (int s = num_shapes - 1; s >= 0; --s) {
516  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
517  if (font == active_fonts[f]) {
518  shape_table->AddShape(flat_shapes_.GetShape(s));
519  }
520  }
521  }
522 }
523 
524 // Sets up a Clusterer for mftraining on a single shape_id.
525 // Call FreeClusterer on the return value after use.
// On return *num_samples holds the number of training samples whose micro
// features were fed to the clusterer.
// NOTE(review): listing lines 526, 528, 532 and 544 are absent from this
// extract; the opening line, the feature_defs parameter and the declarations
// of desc_index and sample_ptrs presumably live there - confirm against the
// full source.
527  const ShapeTable& shape_table,
529  int shape_id,
530  int* num_samples) {
531 
533  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
534  ASSERT_HOST(num_params == MFCount);
535  CLUSTERER* clusterer = MakeClusterer(
536  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
537 
538  // We want to iterate over the samples of just the one shape.
539  IndexMapBiDi shape_map;
540  shape_map.Init(shape_table.NumShapes(), false);
541  shape_map.SetMap(shape_id, true);
542  shape_map.Setup();
543  // Reverse the order of the samples to match the previous behavior.
545  SampleIterator it;
546  it.Init(&shape_map, &shape_table, false, &samples_);
547  for (it.Begin(); !it.AtEnd(); it.Next()) {
548  sample_ptrs.push_back(&it.GetSample());
549  }
// Walk the collected samples backwards; sample_id counts samples, so every
// micro feature of one sample is registered under the same id.
550  int sample_id = 0;
551  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
552  const TrainingSample* sample = sample_ptrs[i];
553  uint32_t num_features = sample->num_micro_features();
554  for (uint32_t f = 0; f < num_features; ++f)
555  MakeSample(clusterer, sample->micro_features()[f], sample_id);
556  ++sample_id;
557  }
558  *num_samples = sample_id;
559  return clusterer;
560 }
561 
562 // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
563 // to the given inttemp_file, and the corresponding pffmtable.
564 // The unicharset is the original encoding of graphemes, and shape_set should
565 // match the size of the shape_table, and may possibly be totally fake.
// A failure to open either output file is logged and that file is skipped;
// nothing is reported back to the caller.
// NOTE(review): fontinfo_table_ is moved into a temporary Classify object
// that is deleted at the end - check what callers expect of fontinfo_table_
// after this call.
// NOTE(review): listing line 566 (presumably the opening line, declaring the
// `unicharset` parameter) is absent from this extract - confirm against the
// full source.
567  const UNICHARSET& shape_set,
568  const ShapeTable& shape_table,
569  CLASS_STRUCT* float_classes,
570  const char* inttemp_file,
571  const char* pffmtable_file) {
572  auto *classify = new tesseract::Classify();
573  // Move the fontinfo table to classify.
574  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
575  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
576  shape_set);
577  FILE* fp = fopen(inttemp_file, "wb");
578  if (fp == nullptr) {
579  tprintf("Error, failed to open file \"%s\"\n", inttemp_file);
580  } else {
581  classify->WriteIntTemplates(fp, int_templates, shape_set);
582  fclose(fp);
583  }
584  // Now write pffmtable. This is complicated by the fact that the adaptive
585  // classifier still wants one indexed by unichar-id, but the static
586  // classifier needs one indexed by its shape class id.
587  // We put the shapetable_cutoffs in a GenericVector, and compute the
588  // unicharset cutoffs along the way.
589  GenericVector<uint16_t> shapetable_cutoffs;
590  GenericVector<uint16_t> unichar_cutoffs;
// Every unichar starts with a cutoff of 0.
591  for (int c = 0; c < unicharset.size(); ++c)
592  unichar_cutoffs.push_back(0);
593  /* then write out each class */
594  for (int i = 0; i < int_templates->NumClasses; ++i) {
595  INT_CLASS Class = ClassForClassId(int_templates, i);
596  // Todo: Test with min instead of max
597  // int MaxLength = LengthForConfigId(Class, 0);
598  uint16_t max_length = 0;
599  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
600  // Todo: Test with min instead of max
601  // if (LengthForConfigId (Class, config_id) < MaxLength)
602  uint16_t length = Class->ConfigLengths[config_id];
603  if (length > max_length)
604  max_length = Class->ConfigLengths[config_id];
// Each unichar in this config's shape keeps the largest config length seen
// for it.
605  int shape_id = float_classes[i].font_set.get(config_id);
606  const Shape& shape = shape_table.GetShape(shape_id);
607  for (int c = 0; c < shape.size(); ++c) {
608  int unichar_id = shape[c].unichar_id;
609  if (length > unichar_cutoffs[unichar_id])
610  unichar_cutoffs[unichar_id] = length;
611  }
612  }
613  shapetable_cutoffs.push_back(max_length);
614  }
615  fp = fopen(pffmtable_file, "wb");
616  if (fp == nullptr) {
617  tprintf("Error, failed to open file \"%s\"\n", pffmtable_file);
618  } else {
// The shape cutoffs are serialized first, followed by one text line per
// unichar.
619  shapetable_cutoffs.Serialize(fp);
620  for (int c = 0; c < unicharset.size(); ++c) {
621  const char *unichar = unicharset.id_to_unichar(c);
// The space unichar is written out as the word NULL.
622  if (strcmp(unichar, " ") == 0) {
623  unichar = "NULL";
624  }
625  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
626  }
627  fclose(fp);
628  }
629  free_int_templates(int_templates);
630  delete classify;
631 }
632 
633 // Generate debug output relating to the canonical distance between the
634 // two given UTF8 grapheme strings.
635 void MasterTrainer::DebugCanonical(const char* unichar_str1,
636  const char* unichar_str2) {
637  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
638  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
639  if (class_id2 == INVALID_UNICHAR_ID)
640  class_id2 = class_id1;
641  if (class_id1 == INVALID_UNICHAR_ID) {
642  tprintf("No unicharset entry found for %s\n", unichar_str1);
643  return;
644  } else {
645  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
646  class_id1, unichar_str1, class_id2, unichar_str2);
647  }
648  int num_fonts = samples_.NumFonts();
649  const IntFeatureMap& feature_map = feature_map_;
650  // Iterate the fonts to get the similarity with other fonst of the same
651  // class.
652  tprintf(" ");
653  for (int f = 0; f < num_fonts; ++f) {
654  if (samples_.NumClassSamples(f, class_id2, false) == 0)
655  continue;
656  tprintf("%6d", f);
657  }
658  tprintf("\n");
659  for (int f1 = 0; f1 < num_fonts; ++f1) {
660  // Map the features of the canonical_sample.
661  if (samples_.NumClassSamples(f1, class_id1, false) == 0)
662  continue;
663  tprintf("%4d ", f1);
664  for (int f2 = 0; f2 < num_fonts; ++f2) {
665  if (samples_.NumClassSamples(f2, class_id2, false) == 0)
666  continue;
667  float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
668  feature_map);
669  tprintf(" %5.3f", dist);
670  }
671  tprintf("\n");
672  }
673  // Build a fake ShapeTable containing all the sample types.
674  ShapeTable shapes(unicharset_);
675  for (int f = 0; f < num_fonts; ++f) {
676  if (samples_.NumClassSamples(f, class_id1, true) > 0)
677  shapes.AddShape(class_id1, f);
678  if (class_id1 != class_id2 &&
679  samples_.NumClassSamples(f, class_id2, true) > 0)
680  shapes.AddShape(class_id2, f);
681  }
682 }
683 
684 #ifndef GRAPHICS_DISABLED
685 // Debugging for cloud/canonical features.
686 // Displays a Features window containing:
687 // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
688 // displays the canonical features of the char/font combination in red.
689 // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
690 // displays the cloud feature of the char/font combination in green.
691 // The canonical features are drawn first to show which ones have no
692 // matches in the cloud features.
693 // Until the features window is destroyed, each click in the features window
694 // will display the samples that have that feature in a separate window.
// This function returns only after an SVET_DESTROY event has been received
// from the Features window.
// NOTE(review): listing line 701 is absent from this extract; it presumably
// starts the call whose last argument (f_window) appears on listing line 702
// - confirm against the full source.
695 void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
696  const char* unichar_str2,
697  int canonical_font) {
698  const IntFeatureMap& feature_map = feature_map_;
699  const IntFeatureSpace& feature_space = feature_map.feature_space();
700  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
702  f_window);
703  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
704  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
705  const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
706  class_id2);
707  for (uint32_t f = 0; f < sample->num_features(); ++f) {
708  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
709  }
710  }
711  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
712  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
713  const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
// Each set bit is mapped back to a feature before it is drawn.
714  for (int f = 0; f < cloud.size(); ++f) {
715  if (cloud[f]) {
716  INT_FEATURE_STRUCT feature =
717  feature_map.InverseIndexFeature(f);
718  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
719  }
720  }
721  }
722  f_window->Update();
723  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
724  SVEventType ev_type;
725  do {
726  SVEvent* ev;
727  // Wait until a click or popup event.
728  ev = f_window->AwaitEvent(SVET_ANY);
729  ev_type = ev->type;
730  if (ev_type == SVET_CLICK) {
731  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
732  if (feature_index >= 0) {
733  // Iterate samples and display those with the feature.
// NOTE(review): class_id1 and cloud_font are used here without the validity
// checks applied above - confirm that this is intended.
734  Shape shape;
735  shape.AddToShape(class_id1, cloud_font);
736  s_window->Clear();
737  samples_.DisplaySamplesWithFeature(feature_index, shape,
738  feature_space, ScrollView::GREEN,
739  s_window);
740  s_window->Update();
741  }
742  }
// Each event returned by AwaitEvent is deleted here once handled.
743  delete ev;
744  } while (ev_type != SVET_DESTROY);
745 }
746 #endif // GRAPHICS_DISABLED
747 
748 void MasterTrainer::TestClassifierVOld(bool replicate_samples,
749  ShapeClassifier* test_classifier,
750  ShapeClassifier* old_classifier) {
751  SampleIterator sample_it;
752  sample_it.Init(nullptr, nullptr, replicate_samples, &samples_);
753  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
754  CT_UNICHAR_TOPN_ERR, fontinfo_table_,
755  page_images_, &sample_it);
756 }
757 
758 // Tests the given test_classifier on the internal samples.
759 // See TestClassifier for details.
// Forwards every argument to TestClassifier, with samples_ as the sample set.
// NOTE(review): listing line 760 (presumably the opening line, declaring the
// error_mode parameter) is absent from this extract - confirm against the
// full source.
761  int report_level,
762  bool replicate_samples,
763  ShapeClassifier* test_classifier,
764  STRING* report_string) {
765  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
766  test_classifier, report_string);
767 }
768 
769 // Tests the given test_classifier on the given samples.
770 // error_mode indicates what counts as an error.
771 // report_levels:
772 // 0 = no output.
773 // 1 = bottom-line error rate.
774 // 2 = bottom-line error rate + time.
775 // 3 = font-level error rate + time.
776 // 4 = list of all errors + short classifier debug output on 16 errors.
777 // 5 = list of all errors + short classifier debug output on 25 errors.
778 // If replicate_samples is true, then the test is run on an extended test
779 // sample including replicated and systematically perturbed samples.
780 // If report_string is non-nullptr, a summary of the results for each font
781 // is appended to the report_string.
// Returns the unichar error value filled in by
// ErrorCounter::ComputeErrorRate.
// NOTE(review): listing line 782 (presumably the opening line, declaring the
// return type and the error_mode parameter) is absent from this extract -
// confirm against the full source.
783  int report_level,
784  bool replicate_samples,
785  TrainingSampleSet* samples,
786  ShapeClassifier* test_classifier,
787  STRING* report_string) {
788  SampleIterator sample_it;
789  sample_it.Init(nullptr, nullptr, replicate_samples, samples);
790  if (report_level > 0) {
// The samples are counted with a full pass over the iterator, purely for the
// log line below.
791  int num_samples = 0;
792  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
793  ++num_samples;
794  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
795  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
796  test_classifier->GetShapeTable()->NumShapes(), num_samples);
797  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
798  }
799  double unichar_error = 0.0;
800  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
801  error_mode, fontinfo_table_,
802  page_images_, &sample_it, &unichar_error,
803  nullptr, report_string);
804  return unichar_error;
805 }
806 
807 // Returns the average (in some sense) distance between the two given
808 // shapes, which may contain multiple fonts and/or unichars.
809 float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
810  const IntFeatureMap& feature_map = feature_map_;
811  const Shape& shape1 = shapes.GetShape(s1);
812  const Shape& shape2 = shapes.GetShape(s2);
813  int num_chars1 = shape1.size();
814  int num_chars2 = shape2.size();
815  float dist_sum = 0.0f;
816  int dist_count = 0;
817  if (num_chars1 > 1 || num_chars2 > 1) {
818  // In the multi-char case try to optimize the calculation by computing
819  // distances between characters of matching font where possible.
820  for (int c1 = 0; c1 < num_chars1; ++c1) {
821  for (int c2 = 0; c2 < num_chars2; ++c2) {
822  dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
823  true, feature_map);
824  ++dist_count;
825  }
826  }
827  } else {
828  // In the single unichar case, there is little alternative, but to compute
829  // the squared-order distance between pairs of fonts.
830  dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
831  false, feature_map);
832  ++dist_count;
833  }
834  return dist_sum / dist_count;
835 }
836 
837 // Replaces samples that are always fragmented with the corresponding
838 // fragment samples.
839 void MasterTrainer::ReplaceFragmentedSamples() {
840  if (fragments_ == nullptr) return;
841  // Remove samples that are replaced by fragments. Each class that was
842  // always naturally fragmented should be replaced by its fragments.
843  int num_samples = samples_.num_samples();
844  for (int s = 0; s < num_samples; ++s) {
845  TrainingSample* sample = samples_.mutable_sample(s);
846  if (fragments_[sample->class_id()] > 0)
847  samples_.KillSample(sample);
848  }
849  samples_.DeleteDeadSamples();
850 
851  // Get ids of fragments in junk_samples_ that replace the dead chars.
852  const UNICHARSET& frag_set = junk_samples_.unicharset();
853 #if 0
854  // TODO(rays) The original idea was to replace only graphemes that were
855  // always naturally fragmented, but that left a lot of the Indic graphemes
856  // out. Determine whether we can go back to that idea now that spacing
857  // is fixed in the training images, or whether this code is obsolete.
858  bool* good_junk = new bool[frag_set.size()];
859  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
860  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
861  int frag_ch = fragments_[dead_ch];
862  if (frag_ch <= 0) continue;
863  const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
865  // Mark the chars for all parts of the fragment as good in good_junk.
866  for (int part = 0; part < frag->get_total(); ++part) {
867  frag->set_pos(part);
868  int good_ch = frag_set.unichar_to_id(frag->to_string().c_str());
869  if (good_ch != INVALID_UNICHAR_ID)
870  good_junk[good_ch] = true; // We want this one.
871  }
872  delete frag;
873  }
874 #endif
875  // For now just use all the junk that was from natural fragments.
876  // Get samples of fragments in junk_samples_ that replace the dead chars.
877  int num_junks = junk_samples_.num_samples();
878  for (int s = 0; s < num_junks; ++s) {
879  TrainingSample* sample = junk_samples_.mutable_sample(s);
880  int junk_id = sample->class_id();
881  const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
883  if (frag != nullptr && frag->is_natural()) {
884  junk_samples_.extract_sample(s);
885  samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
886  }
887  delete frag;
888  }
889  junk_samples_.DeleteDeadSamples();
890  junk_samples_.OrganizeByFontAndClass();
891  samples_.OrganizeByFontAndClass();
892  unicharset_.clear();
893  unicharset_.AppendOtherUnicharset(samples_.unicharset());
894  // delete [] good_junk;
895  // Fragments_ no longer needed?
896  delete [] fragments_;
897  fragments_ = nullptr;
898 }
899 
900 // Runs a hierarchical agglomerative clustering to merge shapes in the given
901 // shape_table, while satisfying the given constraints:
902 // * End with at least min_shapes left in shape_table,
903 // * No shape shall have more than max_shape_unichars in it,
904 // * Don't merge shapes where the distance between them exceeds max_dist.
905 const float kInfiniteDist = 999.0f;
906 void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
907  float max_dist, ShapeTable* shapes) {
908  int num_shapes = shapes->NumShapes();
909  int max_merges = num_shapes - min_shapes;
910  auto* shape_dists =
911  new GenericVector<ShapeDist>[num_shapes];
912  float min_dist = kInfiniteDist;
913  int min_s1 = 0;
914  int min_s2 = 0;
915  tprintf("Computing shape distances...");
916  for (int s1 = 0; s1 < num_shapes; ++s1) {
917  for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
918  ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
919  shape_dists[s1].push_back(dist);
920  if (dist.distance < min_dist) {
921  min_dist = dist.distance;
922  min_s1 = s1;
923  min_s2 = s2;
924  }
925  }
926  tprintf(" %d", s1);
927  }
928  tprintf("\n");
929  int num_merged = 0;
930  while (num_merged < max_merges && min_dist < max_dist) {
931  tprintf("Distance = %f: ", min_dist);
932  int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
933  shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
934  if (num_unichars > max_shape_unichars) {
935  tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
936  min_s1, min_s2, num_unichars, max_shape_unichars);
937  } else {
938  shapes->MergeShapes(min_s1, min_s2);
939  shape_dists[min_s2].clear();
940  ++num_merged;
941 
942  for (int s = 0; s < min_s1; ++s) {
943  if (!shape_dists[s].empty()) {
944  shape_dists[s][min_s1 - s - 1].distance =
945  ShapeDistance(*shapes, s, min_s1);
946  shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
947  }
948  }
949  for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
950  if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
951  shape_dists[min_s1][s2 - min_s1 - 1].distance =
952  ShapeDistance(*shapes, min_s1, s2);
953  }
954  for (int s = min_s1 + 1; s < min_s2; ++s) {
955  if (!shape_dists[s].empty()) {
956  shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
957  }
958  }
959  }
960  min_dist = kInfiniteDist;
961  for (int s1 = 0; s1 < num_shapes; ++s1) {
962  for (int i = 0; i < shape_dists[s1].size(); ++i) {
963  if (shape_dists[s1][i].distance < min_dist) {
964  min_dist = shape_dists[s1][i].distance;
965  min_s1 = s1;
966  min_s2 = s1 + 1 + i;
967  }
968  }
969  }
970  }
971  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
972  delete [] shape_dists;
973  if (debug_level_ > 1) {
974  for (int s1 = 0; s1 < num_shapes; ++s1) {
975  if (shapes->MasterDestinationIndex(s1) == s1) {
976  tprintf("Master shape:%s\n", shapes->DebugStr(s1).c_str());
977  }
978  }
979  }
980 }
981 
982 
983 } // namespace tesseract.
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
INT_TEMPLATES_STRUCT
Definition: intproto.h:117
MFCount
Definition: mf.h:43
INT_CLASS_STRUCT::ConfigLengths
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:110
ScrollView
Definition: scrollview.h:97
tesseract::ShapeTable::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
tesseract::TrainingSampleSet::extract_sample
TrainingSample * extract_sample(int index)
Definition: trainingsampleset.h:165
tesseract::FontInfo::add_spacing
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:80
SVET_DESTROY
Definition: scrollview.h:45
tesseract::MasterTrainer::SetupForClustering
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
Definition: mastertrainer.cpp:526
SVEventType
SVEventType
Definition: scrollview.h:44
tesseract::TrainingSampleSet::ClusterDistance
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:296
UNICHARSET::AppendOtherUnicharset
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:463
tesseract::FontSpacingInfo::x_gap_after
int16_t x_gap_after
Definition: fontinfo.h:53
SVET_CLICK
Definition: scrollview.h:47
kGeoFeatureType
const char *const kGeoFeatureType
Definition: featdefs.cpp:34
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
tfscanf
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:181
tesseract::MasterTrainer::Serialize
bool Serialize(FILE *fp) const
Definition: mastertrainer.cpp:71
boxread.h
tesseract::ErrorCounter::DebugNewErrors
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)
Definition: errorcounter.cpp:106
tesseract::Shape
Definition: shapetable.h:184
tesseract::NM_BASELINE
Definition: normalis.h:42
tesseract::FontSpacingInfo::kerned_unichar_ids
GenericVector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:54
ShortNameToFeatureType
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:269
tesseract::ShapeDist
Definition: mastertrainer.h:50
CHAR_FRAGMENT::to_string
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:1044
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
INT_CLASS_STRUCT
Definition: intproto.h:104
tesseract::FontSpacingInfo::x_gap_before
int16_t x_gap_before
Definition: fontinfo.h:52
baseline
Definition: mfoutline.h:62
tesseract::MasterTrainer::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: mastertrainer.cpp:87
tesseract::MasterTrainer::AddSample
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
Definition: mastertrainer.cpp:162
tesseract::ShapeTable::AppendMasterShapes
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:656
tesseract::TrainingSampleSet::mutable_sample
TrainingSample * mutable_sample(int index)
Definition: trainingsampleset.h:161
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
tesseract::MasterTrainer::MasterTrainer
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
Definition: mastertrainer.cpp:50
tesseract::TrainingSampleSet::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: trainingsampleset.cpp:113
errorcounter.h
tesseract::IntFeatureMap::InverseIndexFeature
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
Definition: intfeaturemap.cpp:56
tesseract::FontSpacingInfo
Definition: fontinfo.h:51
tesseract::TrainingSampleSet::NumClassSamples
int NumClassSamples(int font_id, int class_id, bool randomize) const
Definition: trainingsampleset.cpp:156
STRING
Definition: strngs.h:45
tesseract::Classify
Definition: classify.h:103
ScrollView::Clear
void Clear()
Definition: scrollview.cpp:588
tesseract::SampleIterator::Next
void Next()
Definition: sampleiterator.cpp:156
tesseract::ShapeClassifier::GetShapeTable
virtual const ShapeTable * GetShapeTable() const =0
GenericVector::contains
bool contains(const T &object) const
Definition: genericvector.h:793
GenericVector::Serialize
bool Serialize(FILE *fp) const
Definition: genericvector.h:929
mastertrainer.h
tesseract::FontInfoTable::Serialize
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:49
tesseract::SampleIterator::NormalizeSamples
double NormalizeSamples()
Definition: sampleiterator.cpp:233
tesseract::SampleIterator
Definition: sampleiterator.h:92
tesseract::kMinClusteredShapes
const int kMinClusteredShapes
Definition: mastertrainer.cpp:44
tesseract::IntFeatureSpace::Size
int Size() const
Definition: intfeaturespace.h:51
tesseract::MasterTrainer::~MasterTrainer
~MasterTrainer()
Definition: mastertrainer.cpp:62
tesseract::FontInfo::universal_id
int32_t universal_id
Definition: fontinfo.h:123
tesseract::TrainingSampleSet::Serialize
bool Serialize(FILE *fp) const
Definition: trainingsampleset.cpp:80
tesseract::CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76
CHAR_FRAGMENT::is_natural
bool is_natural() const
Definition: unicharset.h:113
UNICHARSET::clear
void clear()
Definition: unicharset.h:306
kIntFeatureType
const char *const kIntFeatureType
Definition: featdefs.cpp:33
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
tesseract::TrainingSampleSet::DeleteDeadSamples
void DeleteDeadSamples()
Definition: trainingsampleset.cpp:497
tesseract::SampleIterator::Init
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
Definition: sampleiterator.cpp:47
tesseract::TrainingSampleSet::num_samples
int num_samples() const
Definition: trainingsampleset.h:55
SVEvent::y
int y
Definition: scrollview.h:67
tesseract::TrainingSampleSet::GetCloudFeatures
const BitVector & GetCloudFeatures(int font_id, int class_id) const
Definition: trainingsampleset.cpp:211
tesseract::CountTypes
CountTypes
Definition: errorcounter.h:69
FEATURE_DEFS_STRUCT::FeatureDesc
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:46
tesseract::MasterTrainer::TestClassifier
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
Definition: mastertrainer.cpp:782
tesseract::MasterTrainer::WriteInttempAndPFFMTable
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
Definition: mastertrainer.cpp:566
CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1057
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
svmnode.h
ParseBoxFileStr
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:181
tesseract::MasterTrainer::DisplaySamples
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
Definition: mastertrainer.cpp:695
tesseract::MasterTrainer::AddSpacingInfo
bool AddSpacingInfo(const char *filename)
Definition: mastertrainer.cpp:411
tesseract::MasterTrainer::GetFontInfoId
int GetFontInfoId(const char *font_name)
Definition: mastertrainer.cpp:467
tesseract::ShapeClassifier
Definition: shapeclassifier.h:43
tesseract::TrainingSampleSet::ComputeCanonicalFeatures
void ComputeCanonicalFeatures()
Definition: trainingsampleset.cpp:694
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TrainingSampleSet::UnicharDistance
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:230
tesseract::MasterTrainer::LoadFontInfo
bool LoadFontInfo(const char *filename)
Definition: mastertrainer.cpp:332
tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:186
tesseract::FontSpacingInfo::kerned_x_gaps
GenericVector< int16_t > kerned_x_gaps
Definition: fontinfo.h:55
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::MasterTrainer::GetBestMatchingFontInfoId
int GetBestMatchingFontInfoId(const char *filename)
Definition: mastertrainer.cpp:478
tesseract::FontInfo::properties
uint32_t properties
Definition: fontinfo.h:118
shapetable.h
tesseract::SampleIterator::GetSample
const TrainingSample & GetSample() const
Definition: sampleiterator.cpp:103
tesseract::MasterTrainer::ReadTrainingSamples
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
Definition: mastertrainer.cpp:111
tesseract::FontInfoTable::MoveTo
void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:107
FEATURE_DEFS_STRUCT
Definition: featdefs.h:44
UNICHARSET
Definition: unicharset.h:145
CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:108
tesseract::ShapeTable::DebugStr
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
INT_TEMPLATES_STRUCT::NumClasses
int NumClasses
Definition: intproto.h:118
FEATURE_DESC_STRUCT::NumParams
uint16_t NumParams
Definition: ocrfeatures.h:52
CLASS_STRUCT::font_set
UnicityTableEqEq< int > font_set
Definition: protos.h:59
tesseract::ShapeTable::GetShape
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
feature_defs
FEATURE_DEFS_STRUCT feature_defs
Definition: commontraining.cpp:89
CreateFeatureSpaceWindow
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1764
tesseract::IndexMapBiDi::Setup
void Setup()
Definition: indexmapbidi.cpp:102
CLASS_STRUCT
Definition: protos.h:45
MakeClusterer
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:376
GenericVector::get_index
int get_index(const T &object) const
Definition: genericvector.h:781
UnicityTable::get
const T & get(int id) const
Return the object from an id.
Definition: unicity_table.h:140
character
Definition: mfoutline.h:62
tesseract::MasterTrainer::ShapeDistance
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
Definition: mastertrainer.cpp:809
tesseract::TrainingSampleSet::OrganizeByFontAndClass
void OrganizeByFontAndClass()
Definition: trainingsampleset.cpp:511
tesseract
Definition: baseapi.h:65
DivRounded
int DivRounded(int a, int b)
Definition: helpers.h:165
distance
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
FEATURE_DESC_STRUCT::ParamDesc
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:72
tesseract::ShapeTable::MasterDestinationIndex
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:531
tesseract::ErrorCounter::ComputeErrorRate
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
Definition: errorcounter.cpp:39
kMicroFeatureType
const char *const kMicroFeatureType
Definition: featdefs.cpp:31
tesseract::MasterTrainer::LoadXHeights
bool LoadXHeights(const char *filename)
Definition: mastertrainer.cpp:368
SVEvent::type
SVEventType type
Definition: scrollview.h:63
ScrollView::RED
Definition: scrollview.h:104
tesseract::TrainingSampleSet::ComputeCloudFeatures
void ComputeCloudFeatures(int feature_space_size)
Definition: trainingsampleset.cpp:712
tesseract::FontInfo
Definition: fontinfo.h:62
tesseract::FontInfo::init_spacing
void init_spacing(int unicharset_size)
Definition: fontinfo.h:73
tesseract::BitVector
Definition: bitvector.h:30
sample
Definition: cluster.h:31
GenericVector< int >
tesseract::FontInfo::name
char * name
Definition: fontinfo.h:117
tesseract::IntFeatureSpace
Definition: intfeaturespace.h:38
tesseract::ShapeTable::MergedUnicharCount
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:503
tesseract::IntFeatureMap::feature_space
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
tesseract::TrainingSampleSet
Definition: trainingsampleset.h:43
tesseract::MasterTrainer::TestClassifierVOld
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
Definition: mastertrainer.cpp:748
SVET_ANY
Definition: scrollview.h:55
shapeclassifier.h
CHAR_DESC_STRUCT
Definition: featdefs.h:38
CHAR_FRAGMENT
Definition: unicharset.h:48
tesseract::MasterTrainer::SetupMasterShapes
void SetupMasterShapes()
Definition: mastertrainer.cpp:245
tesseract::NormalizationMode
NormalizationMode
Definition: normalis.h:41
tesseract::kMaxUnicharsPerCluster
const int kMaxUnicharsPerCluster
Definition: mastertrainer.cpp:46
ScrollView::AwaitEvent
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
tesseract::TrainingSampleSet::AddSample
int AddSample(const char *unichar, TrainingSample *sample)
Definition: trainingsampleset.cpp:129
tesseract::Shape::AddToShape
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
tesseract::kFontMergeDistance
const float kFontMergeDistance
Definition: mastertrainer.cpp:48
INT_FEATURE_STRUCT
Definition: intproto.h:131
tesseract::kInfiniteDist
const float kInfiniteDist
Definition: mastertrainer.cpp:905
tesseract::MasterTrainer::PreTrainingSetup
void PreTrainingSetup()
Definition: mastertrainer.cpp:233
tesseract::IndexMapBiDi
Definition: indexmapbidi.h:102
tesseract::TrainingSample
Definition: trainingsample.h:53
scanutils.h
tesseract::IntFeatureMap
Definition: intfeaturemap.h:48
featdefs.h
CLUSTERER
Definition: cluster.h:81
tesseract::ShapeDist::distance
float distance
Definition: mastertrainer.h:62
GenericVector::get
T & get(int index) const
Definition: genericvector.h:716
SVEvent
Definition: scrollview.h:60
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
ScrollView::GREEN
Definition: scrollview.h:106
tesseract::IndexMapBiDi::Init
void Init(int size, bool all_mapped)
Definition: indexmapbidi.cpp:86
tesseract::MasterTrainer::IncludeJunk
void IncludeJunk()
Definition: mastertrainer.cpp:294
tesseract::Shape::size
int size() const
Definition: shapetable.h:199
tesseract::TrainingSampleSet::NumFonts
int NumFonts() const
Definition: trainingsampleset.h:61
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
kCNFeatureType
const char *const kCNFeatureType
Definition: featdefs.cpp:32
tesseract::MasterTrainer::DebugCanonical
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
Definition: mastertrainer.cpp:635
tesseract::ShapeTable::FindShape
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:386
tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired
void ReplicateAndRandomizeSamplesIfRequired()
Definition: mastertrainer.cpp:320
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ShapeTable
Definition: shapetable.h:261
tesseract::BitVector::size
int size() const
Definition: bitvector.h:53
tesseract::TrainingSampleSet::charsetsize
int charsetsize() const
Definition: trainingsampleset.h:67
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
SVEvent::x
int x
Definition: scrollview.h:66
tesseract::ShapeTable::SummaryStr
STRING SummaryStr() const
Definition: shapetable.cpp:313
free_int_templates
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:697
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
tesseract::ShapeTable::AddShape
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
tesseract::MasterTrainer::SetupFlatShapeTable
void SetupFlatShapeTable(ShapeTable *shape_table)
Definition: mastertrainer.cpp:495
tesseract::TrainingSampleSet::IndexFeatures
void IndexFeatures(const IntFeatureSpace &feature_space)
Definition: trainingsampleset.cpp:485
tesseract::IndexMapBiDi::SetMap
void SetMap(int sparse_index, bool mapped)
Definition: indexmapbidi.cpp:95
classify.h
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:105
tesseract::TrainingSampleSet::DisplaySamplesWithFeature
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
Definition: trainingsampleset.cpp:743
INT_CLASS_STRUCT::NumConfigs
uint8_t NumConfigs
Definition: intproto.h:107
tesseract::IntFeatureSpace::Serialize
bool Serialize(FILE *fp) const
Definition: intfeaturespace.cpp:38
tesseract::TrainingSampleSet::KillSample
void KillSample(TrainingSample *sample)
Definition: trainingsampleset.cpp:492
tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples
void ReplicateAndRandomizeSamples()
Definition: trainingsampleset.cpp:665
CHAR_FRAGMENT::set_pos
void set_pos(int p)
Definition: unicharset.h:68
tesseract::SampleIterator::Begin
void Begin()
Definition: sampleiterator.cpp:87
tesseract::IntFeatureSpace::XYToFeatureIndex
int XYToFeatureIndex(int x, int y) const
Definition: intfeaturespace.cpp:79
tesseract::MasterTrainer::LoadPageImages
void LoadPageImages(const char *filename)
Definition: mastertrainer.cpp:192
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
ClassForClassId
#define ClassForClassId(T, c)
Definition: intproto.h:177
tesseract::MasterTrainer::PostLoadCleanup
void PostLoadCleanup()
Definition: mastertrainer.cpp:210
tesseract::SampleIterator::SparseCharsetSize
int SparseCharsetSize() const
Definition: sampleiterator.cpp:202
tesseract::TrainingSampleSet::unicharset
const UNICHARSET & unicharset() const
Definition: trainingsampleset.h:64
tesseract::ClearFeatureSpaceWindow
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
MakeSample
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:429
ReadCharDescription
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:235
tesseract::SampleIterator::AtEnd
bool AtEnd() const
Definition: sampleiterator.cpp:99
tesseract::ShapeTable::MergeShapes
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:513
UNICHARSET::size
int size() const
Definition: unicharset.h:341
RenderIntFeature
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1603
FreeCharDescription
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:128
tesseract::SampleIterator::CompactCharsetSize
int CompactCharsetSize() const
Definition: sampleiterator.cpp:196
sampleiterator.h
tesseract::MasterTrainer::TestClassifierOnSamples
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
Definition: mastertrainer.cpp:760
TBOX
Definition: rect.h:33
tesseract::TrainingSampleSet::GetCanonicalSample
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: trainingsampleset.cpp:462
tesseract::TrainingSampleSet::ComputeCanonicalSamples
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
Definition: trainingsampleset.cpp:568