tesseract  4.0.0-1-g2a2b
mastertrainer.cpp
Go to the documentation of this file.
1 // File: mastertrainer.cpp
3 // Description: Trainer to build the MasterClassifier.
4 // Author: Ray Smith
5 // Created: Wed Nov 03 18:10:01 PDT 2010
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "mastertrainer.h"
26 #include <cmath>
27 #include <ctime>
28 #include "allheaders.h"
29 #include "boxread.h"
30 #include "classify.h"
31 #include "errorcounter.h"
32 #include "featdefs.h"
33 #include "sampleiterator.h"
34 #include "shapeclassifier.h"
35 #include "shapetable.h"
36 #include "svmnode.h"
37 
38 #include "scanutils.h"
39 
40 namespace tesseract {
41 
// Clustering controls. When kMinClusteredShapes is low and
// kMaxUnicharsPerCluster is high, kFontMergeDistance is the only limit that
// actually constrains the clustering.

// Smallest number of shapes allowed in the clustered output.
constexpr int kMinClusteredShapes = 1;
// Largest number of unichars allowed in any single cluster.
constexpr int kMaxUnicharsPerCluster = 2000;
// Fonts and unichars are merged when their mean font distance is below this.
constexpr float kFontMergeDistance = 0.025;
50 
52  bool shape_analysis,
53  bool replicate_samples,
54  int debug_level)
55  : norm_mode_(norm_mode), samples_(fontinfo_table_),
56  junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
57  charsetsize_(0),
58  enable_shape_analysis_(shape_analysis),
59  enable_replication_(replicate_samples),
60  fragments_(nullptr), prev_unichar_id_(-1), debug_level_(debug_level) {
61 }
62 
64  delete [] fragments_;
65  for (int p = 0; p < page_images_.size(); ++p)
66  pixDestroy(&page_images_[p]);
67 }
68 
69 // WARNING! Serialize/DeSerialize are only partial, providing
70 // enough data to get the samples back and display them.
71 // Writes to the given file. Returns false in case of error.
72 bool MasterTrainer::Serialize(FILE* fp) const {
73  uint32_t value = norm_mode_;
74  if (!tesseract::Serialize(fp, &value)) return false;
75  if (!unicharset_.save_to_file(fp)) return false;
76  if (!feature_space_.Serialize(fp)) return false;
77  if (!samples_.Serialize(fp)) return false;
78  if (!junk_samples_.Serialize(fp)) return false;
79  if (!verify_samples_.Serialize(fp)) return false;
80  if (!master_shapes_.Serialize(fp)) return false;
81  if (!flat_shapes_.Serialize(fp)) return false;
82  if (!fontinfo_table_.Serialize(fp)) return false;
83  if (!xheights_.Serialize(fp)) return false;
84  return true;
85 }
86 
87 // Load an initial unicharset, or set one up if the file cannot be read.
88 void MasterTrainer::LoadUnicharset(const char* filename) {
89  if (!unicharset_.load_from_file(filename)) {
90  tprintf("Failed to load unicharset from file %s\n"
91  "Building unicharset for training from scratch...\n",
92  filename);
93  unicharset_.clear();
94  UNICHARSET initialized;
95  // Add special characters, as they were removed by the clear, but the
96  // default constructor puts them in.
97  unicharset_.AppendOtherUnicharset(initialized);
98  }
99  charsetsize_ = unicharset_.size();
100  delete [] fragments_;
101  fragments_ = new int[charsetsize_];
102  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
103  samples_.LoadUnicharset(filename);
104  junk_samples_.LoadUnicharset(filename);
105  verify_samples_.LoadUnicharset(filename);
106 }
107 
108 // Reads the samples and their features from the given .tr format file,
109 // adding them to the trainer with the font_id from the content of the file.
110 // See mftraining.cpp for a description of the file format.
111 // If verification, then these are verification samples, not training.
112 void MasterTrainer::ReadTrainingSamples(const char* page_name,
114  bool verification) {
115  char buffer[2048];
116  const int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
117  const int micro_feature_type = ShortNameToFeatureType(feature_defs,
119  const int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
120  const int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
121 
122  FILE* fp = fopen(page_name, "rb");
123  if (fp == nullptr) {
124  tprintf("Failed to open tr file: %s\n", page_name);
125  return;
126  }
127  tr_filenames_.push_back(STRING(page_name));
128  while (fgets(buffer, sizeof(buffer), fp) != nullptr) {
129  if (buffer[0] == '\n')
130  continue;
131 
132  char* space = strchr(buffer, ' ');
133  if (space == nullptr) {
134  tprintf("Bad format in tr file, reading fontname, unichar\n");
135  continue;
136  }
137  *space++ = '\0';
138  int font_id = GetFontInfoId(buffer);
139  if (font_id < 0) font_id = 0;
140  int page_number;
141  STRING unichar;
142  TBOX bounding_box;
143  if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
144  tprintf("Bad format in tr file, reading box coords\n");
145  continue;
146  }
147  CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
149  sample->set_font_id(font_id);
150  sample->set_page_num(page_number + page_images_.size());
151  sample->set_bounding_box(bounding_box);
152  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
153  cn_feature_type, geo_feature_type, char_desc);
154  AddSample(verification, unichar.string(), sample);
155  FreeCharDescription(char_desc);
156  }
157  charsetsize_ = unicharset_.size();
158  fclose(fp);
159 }
160 
161 // Adds the given single sample to the trainer, setting the classid
162 // appropriately from the given unichar_str.
163 void MasterTrainer::AddSample(bool verification, const char* unichar,
165  if (verification) {
166  verify_samples_.AddSample(unichar, sample);
167  prev_unichar_id_ = -1;
168  } else if (unicharset_.contains_unichar(unichar)) {
169  if (prev_unichar_id_ >= 0)
170  fragments_[prev_unichar_id_] = -1;
171  prev_unichar_id_ = samples_.AddSample(unichar, sample);
172  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
173  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
174  } else {
175  const int junk_id = junk_samples_.AddSample(unichar, sample);
176  if (prev_unichar_id_ >= 0) {
178  if (frag != nullptr && frag->is_natural()) {
179  if (fragments_[prev_unichar_id_] == 0)
180  fragments_[prev_unichar_id_] = junk_id;
181  else if (fragments_[prev_unichar_id_] != junk_id)
182  fragments_[prev_unichar_id_] = -1;
183  }
184  delete frag;
185  }
186  prev_unichar_id_ = -1;
187  }
188 }
189 
190 // Loads all pages from the given tif filename and append to page_images_.
191 // Must be called after ReadTrainingSamples, as the current number of images
192 // is used as an offset for page numbers in the samples.
193 void MasterTrainer::LoadPageImages(const char* filename) {
194  size_t offset = 0;
195  int page;
196  Pix* pix;
197  for (page = 0;; page++) {
198  pix = pixReadFromMultipageTiff(filename, &offset);
199  if (!pix) break;
200  page_images_.push_back(pix);
201  if (!offset) break;
202  }
203  tprintf("Loaded %d page images from %s\n", page, filename);
204 }
205 
206 // Cleans up the samples after initial load from the tr files, and prior to
207 // saving the MasterTrainer:
208 // Remaps fragmented chars if running shape analysis.
209 // Sets up the samples appropriately for class/fontwise access.
210 // Deletes outlier samples.
212  if (debug_level_ > 0)
213  tprintf("PostLoadCleanup...\n");
214  if (enable_shape_analysis_)
215  ReplaceFragmentedSamples();
216  SampleIterator sample_it;
217  sample_it.Init(nullptr, nullptr, true, &verify_samples_);
218  sample_it.NormalizeSamples();
219  verify_samples_.OrganizeByFontAndClass();
220 
221  samples_.IndexFeatures(feature_space_);
222  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
223  // against current training.
224  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
225  samples_.OrganizeByFontAndClass();
226  if (debug_level_ > 0)
227  tprintf("ComputeCanonicalSamples...\n");
228  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
229 }
230 
231 // Gets the samples ready for training. Use after both
232 // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
233 // Re-indexes the features and computes canonical and cloud features.
235  if (debug_level_ > 0)
236  tprintf("PreTrainingSetup...\n");
237  samples_.IndexFeatures(feature_space_);
238  samples_.ComputeCanonicalFeatures();
239  if (debug_level_ > 0)
240  tprintf("ComputeCloudFeatures...\n");
241  samples_.ComputeCloudFeatures(feature_space_.Size());
242 }
243 
244 // Sets up the master_shapes_ table, which tells which fonts should stay
245 // together until they get to a leaf node classifier.
247  tprintf("Building master shape table\n");
248  const int num_fonts = samples_.NumFonts();
249 
250  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
251  ShapeTable char_shapes_end_fragment(samples_.unicharset());
252  ShapeTable char_shapes(samples_.unicharset());
253  for (int c = 0; c < samples_.charsetsize(); ++c) {
254  ShapeTable shapes(samples_.unicharset());
255  for (int f = 0; f < num_fonts; ++f) {
256  if (samples_.NumClassSamples(f, c, true) > 0)
257  shapes.AddShape(c, f);
258  }
259  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
260 
261  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
262 
263  if (fragment == nullptr)
264  char_shapes.AppendMasterShapes(shapes, nullptr);
265  else if (fragment->is_beginning())
266  char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);
267  else if (fragment->is_ending())
268  char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);
269  else
270  char_shapes.AppendMasterShapes(shapes, nullptr);
271  }
273  kFontMergeDistance, &char_shapes_begin_fragment);
274  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);
276  kFontMergeDistance, &char_shapes_end_fragment);
277  char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);
279  kFontMergeDistance, &char_shapes);
280  master_shapes_.AppendMasterShapes(char_shapes, nullptr);
281  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
282 }
283 
284 // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
285 // fragments and n-grams (all incorrectly segmented characters).
286 // Various training functions may result in incorrectly segmented characters
287 // being added to the unicharset of the main samples, perhaps because they
288 // form a "radical" decomposition of some (Indic) grapheme, or because they
289 // just look the same as a real character (like rn/m)
290 // This function moves all the junk samples, to the main samples_ set, but
291 // desirable junk, being any sample for which the unichar already exists in
292 // the samples_ unicharset gets the unichar-ids re-indexed to match, but
293 // anything else gets re-marked as unichar_id 0 (space character) to identify
294 // it as junk to the error counter.
296  // Get ids of fragments in junk_samples_ that replace the dead chars.
297  const UNICHARSET& junk_set = junk_samples_.unicharset();
298  const UNICHARSET& sample_set = samples_.unicharset();
299  int num_junks = junk_samples_.num_samples();
300  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
301  for (int s = 0; s < num_junks; ++s) {
302  TrainingSample* sample = junk_samples_.mutable_sample(s);
303  int junk_id = sample->class_id();
304  const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
305  int sample_id = sample_set.unichar_to_id(junk_utf8);
306  if (sample_id == INVALID_UNICHAR_ID)
307  sample_id = 0;
308  sample->set_class_id(sample_id);
309  junk_samples_.extract_sample(s);
310  samples_.AddSample(sample_id, sample);
311  }
312  junk_samples_.DeleteDeadSamples();
313  samples_.OrganizeByFontAndClass();
314 }
315 
316 // Replicates the samples and perturbs them if the enable_replication_ flag
317 // is set. MUST be used after the last call to OrganizeByFontAndClass on
318 // the training samples, ie after IncludeJunk if it is going to be used, as
319 // OrganizeByFontAndClass will eat the replicated samples into the regular
320 // samples.
322  if (enable_replication_) {
323  if (debug_level_ > 0)
324  tprintf("ReplicateAndRandomize...\n");
325  verify_samples_.ReplicateAndRandomizeSamples();
326  samples_.ReplicateAndRandomizeSamples();
327  samples_.IndexFeatures(feature_space_);
328  }
329 }
330 
331 // Loads the basic font properties file into fontinfo_table_.
332 // Returns false on failure.
333 bool MasterTrainer::LoadFontInfo(const char* filename) {
334  FILE* fp = fopen(filename, "rb");
335  if (fp == nullptr) {
336  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
337  return false;
338  }
339  int italic, bold, fixed, serif, fraktur;
340  while (!feof(fp)) {
341  FontInfo fontinfo;
342  char* font_name = new char[1024];
343  fontinfo.name = font_name;
344  fontinfo.properties = 0;
345  fontinfo.universal_id = 0;
346  if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
347  &fixed, &serif, &fraktur) != 6) {
348  delete[] font_name;
349  continue;
350  }
351  fontinfo.properties =
352  (italic << 0) +
353  (bold << 1) +
354  (fixed << 2) +
355  (serif << 3) +
356  (fraktur << 4);
357  if (!fontinfo_table_.contains(fontinfo)) {
358  fontinfo_table_.push_back(fontinfo);
359  } else {
360  delete[] font_name;
361  }
362  }
363  fclose(fp);
364  return true;
365 }
366 
367 // Loads the xheight font properties file into xheights_.
368 // Returns false on failure.
369 bool MasterTrainer::LoadXHeights(const char* filename) {
370  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
371  xheights_.init_to_size(fontinfo_table_.size(), -1);
372  if (filename == nullptr) return true;
373  FILE *f = fopen(filename, "rb");
374  if (f == nullptr) {
375  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
376  return false;
377  }
378  tprintf("Reading x-heights from %s ...\n", filename);
379  FontInfo fontinfo;
380  fontinfo.properties = 0; // Not used to lookup in the table.
381  fontinfo.universal_id = 0;
382  char buffer[1024];
383  int xht;
384  int total_xheight = 0;
385  int xheight_count = 0;
386  while (!feof(f)) {
387  if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2)
388  continue;
389  buffer[1023] = '\0';
390  fontinfo.name = buffer;
391  if (!fontinfo_table_.contains(fontinfo)) continue;
392  int fontinfo_id = fontinfo_table_.get_index(fontinfo);
393  xheights_[fontinfo_id] = xht;
394  total_xheight += xht;
395  ++xheight_count;
396  }
397  if (xheight_count == 0) {
398  fprintf(stderr, "No valid xheights in %s!\n", filename);
399  fclose(f);
400  return false;
401  }
402  int mean_xheight = DivRounded(total_xheight, xheight_count);
403  for (int i = 0; i < fontinfo_table_.size(); ++i) {
404  if (xheights_[i] < 0)
405  xheights_[i] = mean_xheight;
406  }
407  fclose(f);
408  return true;
409 } // LoadXHeights
410 
411 // Reads spacing stats from filename and adds them to fontinfo_table.
412 bool MasterTrainer::AddSpacingInfo(const char *filename) {
413  FILE* fontinfo_file = fopen(filename, "rb");
414  if (fontinfo_file == nullptr)
415  return true; // We silently ignore missing files!
416  // Find the fontinfo_id.
417  int fontinfo_id = GetBestMatchingFontInfoId(filename);
418  if (fontinfo_id < 0) {
419  tprintf("No font found matching fontinfo filename %s\n", filename);
420  fclose(fontinfo_file);
421  return false;
422  }
423  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
424  // TODO(rays) scale should probably be a double, but keep as an int for now
425  // to duplicate current behavior.
426  int scale = kBlnXHeight / xheights_[fontinfo_id];
427  int num_unichars;
428  char uch[UNICHAR_LEN];
429  char kerned_uch[UNICHAR_LEN];
430  int x_gap, x_gap_before, x_gap_after, num_kerned;
431  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
432  FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
433  fi->init_spacing(unicharset_.size());
434  FontSpacingInfo *spacing = nullptr;
435  for (int l = 0; l < num_unichars; ++l) {
436  if (tfscanf(fontinfo_file, "%s %d %d %d",
437  uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
438  tprintf("Bad format of font spacing file %s\n", filename);
439  fclose(fontinfo_file);
440  return false;
441  }
442  bool valid = unicharset_.contains_unichar(uch);
443  if (valid) {
444  spacing = new FontSpacingInfo();
445  spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);
446  spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);
447  }
448  for (int k = 0; k < num_kerned; ++k) {
449  if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
450  tprintf("Bad format of font spacing file %s\n", filename);
451  fclose(fontinfo_file);
452  delete spacing;
453  return false;
454  }
455  if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
456  spacing->kerned_unichar_ids.push_back(
457  unicharset_.unichar_to_id(kerned_uch));
458  spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));
459  }
460  if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
461  }
462  fclose(fontinfo_file);
463  return true;
464 }
465 
466 // Returns the font id corresponding to the given font name.
467 // Returns -1 if the font cannot be found.
468 int MasterTrainer::GetFontInfoId(const char* font_name) {
469  FontInfo fontinfo;
470  // We are only borrowing the string, so it is OK to const cast it.
471  fontinfo.name = const_cast<char*>(font_name);
472  fontinfo.properties = 0; // Not used to lookup in the table
473  fontinfo.universal_id = 0;
474  return fontinfo_table_.get_index(fontinfo);
475 }
476 // Returns the font_id of the closest matching font name to the given
477 // filename. It is assumed that a substring of the filename will match
478 // one of the fonts. If more than one is matched, the longest is returned.
479 int MasterTrainer::GetBestMatchingFontInfoId(const char* filename) {
480  int fontinfo_id = -1;
481  int best_len = 0;
482  for (int f = 0; f < fontinfo_table_.size(); ++f) {
483  if (strstr(filename, fontinfo_table_.get(f).name) != nullptr) {
484  int len = strlen(fontinfo_table_.get(f).name);
485  // Use the longest matching length in case a substring of a font matched.
486  if (len > best_len) {
487  best_len = len;
488  fontinfo_id = f;
489  }
490  }
491  }
492  return fontinfo_id;
493 }
494 
495 // Sets up a flat shapetable with one shape per class/font combination.
497  // To exactly mimic the results of the previous implementation, the shapes
498  // must be clustered in order the fonts arrived, and reverse order of the
499  // characters within each font.
500  // Get a list of the fonts in the order they appeared.
501  GenericVector<int> active_fonts;
502  int num_shapes = flat_shapes_.NumShapes();
503  for (int s = 0; s < num_shapes; ++s) {
504  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
505  int f = 0;
506  for (f = 0; f < active_fonts.size(); ++f) {
507  if (active_fonts[f] == font)
508  break;
509  }
510  if (f == active_fonts.size())
511  active_fonts.push_back(font);
512  }
513  // For each font in order, add all the shapes with that font in reverse order.
514  int num_fonts = active_fonts.size();
515  for (int f = 0; f < num_fonts; ++f) {
516  for (int s = num_shapes - 1; s >= 0; --s) {
517  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
518  if (font == active_fonts[f]) {
519  shape_table->AddShape(flat_shapes_.GetShape(s));
520  }
521  }
522  }
523 }
524 
525 // Sets up a Clusterer for mftraining on a single shape_id.
526 // Call FreeClusterer on the return value after use.
528  const ShapeTable& shape_table,
530  int shape_id,
531  int* num_samples) {
532 
534  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
535  ASSERT_HOST(num_params == MFCount);
536  CLUSTERER* clusterer = MakeClusterer(
537  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
538 
539  // We want to iterate over the samples of just the one shape.
540  IndexMapBiDi shape_map;
541  shape_map.Init(shape_table.NumShapes(), false);
542  shape_map.SetMap(shape_id, true);
543  shape_map.Setup();
544  // Reverse the order of the samples to match the previous behavior.
546  SampleIterator it;
547  it.Init(&shape_map, &shape_table, false, &samples_);
548  for (it.Begin(); !it.AtEnd(); it.Next()) {
549  sample_ptrs.push_back(&it.GetSample());
550  }
551  int sample_id = 0;
552  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
553  const TrainingSample* sample = sample_ptrs[i];
554  uint32_t num_features = sample->num_micro_features();
555  for (uint32_t f = 0; f < num_features; ++f)
556  MakeSample(clusterer, sample->micro_features()[f], sample_id);
557  ++sample_id;
558  }
559  *num_samples = sample_id;
560  return clusterer;
561 }
562 
563 // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
564 // to the given inttemp_file, and the corresponding pffmtable.
565 // The unicharset is the original encoding of graphemes, and shape_set should
566 // match the size of the shape_table, and may possibly be totally fake.
568  const UNICHARSET& shape_set,
569  const ShapeTable& shape_table,
570  CLASS_STRUCT* float_classes,
571  const char* inttemp_file,
572  const char* pffmtable_file) {
573  tesseract::Classify *classify = new tesseract::Classify();
574  // Move the fontinfo table to classify.
575  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
576  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
577  shape_set);
578  FILE* fp = fopen(inttemp_file, "wb");
579  if (fp == nullptr) {
580  tprintf("Error, failed to open file \"%s\"\n", inttemp_file);
581  } else {
582  classify->WriteIntTemplates(fp, int_templates, shape_set);
583  fclose(fp);
584  }
585  // Now write pffmtable. This is complicated by the fact that the adaptive
586  // classifier still wants one indexed by unichar-id, but the static
587  // classifier needs one indexed by its shape class id.
588  // We put the shapetable_cutoffs in a GenericVector, and compute the
589  // unicharset cutoffs along the way.
590  GenericVector<uint16_t> shapetable_cutoffs;
591  GenericVector<uint16_t> unichar_cutoffs;
592  for (int c = 0; c < unicharset.size(); ++c)
593  unichar_cutoffs.push_back(0);
594  /* then write out each class */
595  for (int i = 0; i < int_templates->NumClasses; ++i) {
596  INT_CLASS Class = ClassForClassId(int_templates, i);
597  // Todo: Test with min instead of max
598  // int MaxLength = LengthForConfigId(Class, 0);
599  uint16_t max_length = 0;
600  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
601  // Todo: Test with min instead of max
602  // if (LengthForConfigId (Class, config_id) < MaxLength)
603  uint16_t length = Class->ConfigLengths[config_id];
604  if (length > max_length)
605  max_length = Class->ConfigLengths[config_id];
606  int shape_id = float_classes[i].font_set.get(config_id);
607  const Shape& shape = shape_table.GetShape(shape_id);
608  for (int c = 0; c < shape.size(); ++c) {
609  int unichar_id = shape[c].unichar_id;
610  if (length > unichar_cutoffs[unichar_id])
611  unichar_cutoffs[unichar_id] = length;
612  }
613  }
614  shapetable_cutoffs.push_back(max_length);
615  }
616  fp = fopen(pffmtable_file, "wb");
617  if (fp == nullptr) {
618  tprintf("Error, failed to open file \"%s\"\n", pffmtable_file);
619  } else {
620  shapetable_cutoffs.Serialize(fp);
621  for (int c = 0; c < unicharset.size(); ++c) {
622  const char *unichar = unicharset.id_to_unichar(c);
623  if (strcmp(unichar, " ") == 0) {
624  unichar = "NULL";
625  }
626  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
627  }
628  fclose(fp);
629  }
630  free_int_templates(int_templates);
631  delete classify;
632 }
633 
634 // Generate debug output relating to the canonical distance between the
635 // two given UTF8 grapheme strings.
636 void MasterTrainer::DebugCanonical(const char* unichar_str1,
637  const char* unichar_str2) {
638  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
639  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
640  if (class_id2 == INVALID_UNICHAR_ID)
641  class_id2 = class_id1;
642  if (class_id1 == INVALID_UNICHAR_ID) {
643  tprintf("No unicharset entry found for %s\n", unichar_str1);
644  return;
645  } else {
646  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
647  class_id1, unichar_str1, class_id2, unichar_str2);
648  }
649  int num_fonts = samples_.NumFonts();
650  const IntFeatureMap& feature_map = feature_map_;
651  // Iterate the fonts to get the similarity with other fonst of the same
652  // class.
653  tprintf(" ");
654  for (int f = 0; f < num_fonts; ++f) {
655  if (samples_.NumClassSamples(f, class_id2, false) == 0)
656  continue;
657  tprintf("%6d", f);
658  }
659  tprintf("\n");
660  for (int f1 = 0; f1 < num_fonts; ++f1) {
661  // Map the features of the canonical_sample.
662  if (samples_.NumClassSamples(f1, class_id1, false) == 0)
663  continue;
664  tprintf("%4d ", f1);
665  for (int f2 = 0; f2 < num_fonts; ++f2) {
666  if (samples_.NumClassSamples(f2, class_id2, false) == 0)
667  continue;
668  float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
669  feature_map);
670  tprintf(" %5.3f", dist);
671  }
672  tprintf("\n");
673  }
674  // Build a fake ShapeTable containing all the sample types.
675  ShapeTable shapes(unicharset_);
676  for (int f = 0; f < num_fonts; ++f) {
677  if (samples_.NumClassSamples(f, class_id1, true) > 0)
678  shapes.AddShape(class_id1, f);
679  if (class_id1 != class_id2 &&
680  samples_.NumClassSamples(f, class_id2, true) > 0)
681  shapes.AddShape(class_id2, f);
682  }
683 }
684 
685 #ifndef GRAPHICS_DISABLED
686 // Debugging for cloud/canonical features.
687 // Displays a Features window containing:
688 // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
689 // displays the canonical features of the char/font combination in red.
690 // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
691 // displays the cloud feature of the char/font combination in green.
692 // The canonical features are drawn first to show which ones have no
693 // matches in the cloud features.
694 // Until the features window is destroyed, each click in the features window
695 // will display the samples that have that feature in a separate window.
696 void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
697  const char* unichar_str2,
698  int canonical_font) {
699  const IntFeatureMap& feature_map = feature_map_;
700  const IntFeatureSpace& feature_space = feature_map.feature_space();
701  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
703  f_window);
704  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
705  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
706  const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
707  class_id2);
708  for (uint32_t f = 0; f < sample->num_features(); ++f) {
709  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
710  }
711  }
712  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
713  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
714  const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
715  for (int f = 0; f < cloud.size(); ++f) {
716  if (cloud[f]) {
717  INT_FEATURE_STRUCT feature =
718  feature_map.InverseIndexFeature(f);
719  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
720  }
721  }
722  }
723  f_window->Update();
724  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
725  SVEventType ev_type;
726  do {
727  SVEvent* ev;
728  // Wait until a click or popup event.
729  ev = f_window->AwaitEvent(SVET_ANY);
730  ev_type = ev->type;
731  if (ev_type == SVET_CLICK) {
732  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
733  if (feature_index >= 0) {
734  // Iterate samples and display those with the feature.
735  Shape shape;
736  shape.AddToShape(class_id1, cloud_font);
737  s_window->Clear();
738  samples_.DisplaySamplesWithFeature(feature_index, shape,
739  feature_space, ScrollView::GREEN,
740  s_window);
741  s_window->Update();
742  }
743  }
744  delete ev;
745  } while (ev_type != SVET_DESTROY);
746 }
747 #endif // GRAPHICS_DISABLED
748 
749 void MasterTrainer::TestClassifierVOld(bool replicate_samples,
750  ShapeClassifier* test_classifier,
751  ShapeClassifier* old_classifier) {
752  SampleIterator sample_it;
753  sample_it.Init(nullptr, nullptr, replicate_samples, &samples_);
754  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
755  CT_UNICHAR_TOPN_ERR, fontinfo_table_,
756  page_images_, &sample_it);
757 }
758 
759 // Tests the given test_classifier on the internal samples.
760 // See TestClassifier for details.
762  int report_level,
763  bool replicate_samples,
764  ShapeClassifier* test_classifier,
765  STRING* report_string) {
766  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
767  test_classifier, report_string);
768 }
769 
770 // Tests the given test_classifier on the given samples.
771 // error_mode indicates what counts as an error.
772 // report_levels:
773 // 0 = no output.
774 // 1 = bottom-line error rate.
775 // 2 = bottom-line error rate + time.
776 // 3 = font-level error rate + time.
777 // 4 = list of all errors + short classifier debug output on 16 errors.
778 // 5 = list of all errors + short classifier debug output on 25 errors.
779 // If replicate_samples is true, then the test is run on an extended test
780 // sample including replicated and systematically perturbed samples.
781 // If report_string is non-nullptr, a summary of the results for each font
782 // is appended to the report_string.
784  int report_level,
785  bool replicate_samples,
786  TrainingSampleSet* samples,
787  ShapeClassifier* test_classifier,
788  STRING* report_string) {
789  SampleIterator sample_it;
790  sample_it.Init(nullptr, nullptr, replicate_samples, samples);
791  if (report_level > 0) {
792  int num_samples = 0;
793  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
794  ++num_samples;
795  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
796  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
797  test_classifier->GetShapeTable()->NumShapes(), num_samples);
798  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
799  }
800  double unichar_error = 0.0;
801  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
802  error_mode, fontinfo_table_,
803  page_images_, &sample_it, &unichar_error,
804  nullptr, report_string);
805  return unichar_error;
806 }
807 
808 // Returns the average (in some sense) distance between the two given
809 // shapes, which may contain multiple fonts and/or unichars.
810 float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
811  const IntFeatureMap& feature_map = feature_map_;
812  const Shape& shape1 = shapes.GetShape(s1);
813  const Shape& shape2 = shapes.GetShape(s2);
814  int num_chars1 = shape1.size();
815  int num_chars2 = shape2.size();
816  float dist_sum = 0.0f;
817  int dist_count = 0;
818  if (num_chars1 > 1 || num_chars2 > 1) {
819  // In the multi-char case try to optimize the calculation by computing
820  // distances between characters of matching font where possible.
821  for (int c1 = 0; c1 < num_chars1; ++c1) {
822  for (int c2 = 0; c2 < num_chars2; ++c2) {
823  dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
824  true, feature_map);
825  ++dist_count;
826  }
827  }
828  } else {
829  // In the single unichar case, there is little alternative, but to compute
830  // the squared-order distance between pairs of fonts.
831  dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
832  false, feature_map);
833  ++dist_count;
834  }
835  return dist_sum / dist_count;
836 }
837 
838 // Replaces samples that are always fragmented with the corresponding
839 // fragment samples.
840 void MasterTrainer::ReplaceFragmentedSamples() {
841  if (fragments_ == nullptr) return;
842  // Remove samples that are replaced by fragments. Each class that was
843  // always naturally fragmented should be replaced by its fragments.
844  int num_samples = samples_.num_samples();
845  for (int s = 0; s < num_samples; ++s) {
846  TrainingSample* sample = samples_.mutable_sample(s);
847  if (fragments_[sample->class_id()] > 0)
848  samples_.KillSample(sample);
849  }
850  samples_.DeleteDeadSamples();
851 
852  // Get ids of fragments in junk_samples_ that replace the dead chars.
853  const UNICHARSET& frag_set = junk_samples_.unicharset();
854 #if 0
855  // TODO(rays) The original idea was to replace only graphemes that were
856  // always naturally fragmented, but that left a lot of the Indic graphemes
857  // out. Determine whether we can go back to that idea now that spacing
858  // is fixed in the training images, or whether this code is obsolete.
859  bool* good_junk = new bool[frag_set.size()];
860  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
861  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
862  int frag_ch = fragments_[dead_ch];
863  if (frag_ch <= 0) continue;
864  const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
866  // Mark the chars for all parts of the fragment as good in good_junk.
867  for (int part = 0; part < frag->get_total(); ++part) {
868  frag->set_pos(part);
869  int good_ch = frag_set.unichar_to_id(frag->to_string().string());
870  if (good_ch != INVALID_UNICHAR_ID)
871  good_junk[good_ch] = true; // We want this one.
872  }
873  delete frag;
874  }
875 #endif
876  // For now just use all the junk that was from natural fragments.
877  // Get samples of fragments in junk_samples_ that replace the dead chars.
878  int num_junks = junk_samples_.num_samples();
879  for (int s = 0; s < num_junks; ++s) {
880  TrainingSample* sample = junk_samples_.mutable_sample(s);
881  int junk_id = sample->class_id();
882  const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
884  if (frag != nullptr && frag->is_natural()) {
885  junk_samples_.extract_sample(s);
886  samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
887  }
888  delete frag;
889  }
890  junk_samples_.DeleteDeadSamples();
891  junk_samples_.OrganizeByFontAndClass();
892  samples_.OrganizeByFontAndClass();
893  unicharset_.clear();
894  unicharset_.AppendOtherUnicharset(samples_.unicharset());
895  // delete [] good_junk;
896  // Fragments_ no longer needed?
897  delete [] fragments_;
898  fragments_ = nullptr;
899 }
900 
901 // Runs a hierarchical agglomerative clustering to merge shapes in the given
902 // shape table (shapes), while satisfying the given constraints:
903 // * End with at least min_shapes left in the shape table,
904 // * No shape shall have more than max_shape_unichars in it,
905 // * Don't merge shapes unless the distance between them is below max_dist.
// Pairwise distances are kept in a triangular table: shape_dists[s1] holds one
// entry for each s2 > s1, in increasing s2 order, so the pair (s1, s2) is
// found at shape_dists[s1][s2 - s1 - 1].
// kInfiniteDist is the value stored for a pair that is no longer a merge
// candidate, and the starting value of every minimum search.
906 const float kInfiniteDist = 999.0f;
907 void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
908  float max_dist, ShapeTable* shapes) {
909  int num_shapes = shapes->NumShapes();
// Each successful merge is counted in num_merged; the loop stops at max_merges.
910  int max_merges = num_shapes - min_shapes;
911  GenericVector<ShapeDist>* shape_dists =
912  new GenericVector<ShapeDist>[num_shapes];
913  float min_dist = kInfiniteDist;
914  int min_s1 = 0;
915  int min_s2 = 0;
916  tprintf("Computing shape distances...");
// Fill the triangular table and remember the closest pair seen so far.
917  for (int s1 = 0; s1 < num_shapes; ++s1) {
918  for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
919  ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
920  shape_dists[s1].push_back(dist);
921  if (dist.distance < min_dist) {
922  min_dist = dist.distance;
923  min_s1 = s1;
924  min_s2 = s2;
925  }
926  }
927  tprintf(" %d", s1);
928  }
929  tprintf("\n");
930  int num_merged = 0;
// Repeatedly take the closest remaining pair (min_s1 < min_s2) until the merge
// budget is used up or the closest pair is no longer below max_dist.
931  while (num_merged < max_merges && min_dist < max_dist) {
932  tprintf("Distance = %f: ", min_dist);
933  int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
// Whether the merge happens or is rejected, this pair is retired here so the
// next minimum search cannot select the same entry again.
934  shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
935  if (num_unichars > max_shape_unichars) {
936  tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
937  min_s1, min_s2, num_unichars, max_shape_unichars);
938  } else {
939  shapes->MergeShapes(min_s1, min_s2);
// Row min_s2 is emptied after the merge; the !empty() checks below skip rows
// that have been emptied this way.
940  shape_dists[min_s2].clear();
941  ++num_merged;
942 
// Rows above min_s1: recompute the distance to min_s1 and retire the entry
// for min_s2.
943  for (int s = 0; s < min_s1; ++s) {
944  if (!shape_dists[s].empty()) {
945  shape_dists[s][min_s1 - s - 1].distance =
946  ShapeDistance(*shapes, s, min_s1);
947  shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
948  }
949  }
// Row min_s1 itself: recompute only the entries that are still live; entries
// already at kInfiniteDist are left as they are.
950  for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
951  if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
952  shape_dists[min_s1][s2 - min_s1 - 1].distance =
953  ShapeDistance(*shapes, min_s1, s2);
954  }
// Rows strictly between min_s1 and min_s2: retire their entry for min_s2.
955  for (int s = min_s1 + 1; s < min_s2; ++s) {
956  if (!shape_dists[s].empty()) {
957  shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
958  }
959  }
960  }
// Rescan the whole table for the new closest pair; the column index i in row
// s1 corresponds to shape s1 + 1 + i.
961  min_dist = kInfiniteDist;
962  for (int s1 = 0; s1 < num_shapes; ++s1) {
963  for (int i = 0; i < shape_dists[s1].size(); ++i) {
964  if (shape_dists[s1][i].distance < min_dist) {
965  min_dist = shape_dists[s1][i].distance;
966  min_s1 = s1;
967  min_s2 = s1 + 1 + i;
968  }
969  }
970  }
971  }
972  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
973  delete [] shape_dists;
// At debug_level_ > 1, print every shape whose MasterDestinationIndex is its
// own index.
974  if (debug_level_ > 1) {
975  for (int s1 = 0; s1 < num_shapes; ++s1) {
976  if (shapes->MasterDestinationIndex(s1) == s1) {
977  tprintf("Master shape:%s\n", shapes->DebugStr(s1).string());
978  }
979  }
980  }
981 }
982 
983 
984 } // namespace tesseract.
void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:106
bool is_beginning() const
Definition: unicharset.h:106
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:80
bool LoadXHeights(const char *filename)
void KillSample(TrainingSample *sample)
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:192
void LoadUnicharset(const char *filename)
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
STRING SummaryStr() const
Definition: shapetable.cpp:313
int size() const
Definition: genericvector.h:71
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:998
Definition: cluster.h:32
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:708
const UNICHARSET & unicharset() const
void SetMap(int sparse_index, bool mapped)
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
bool AddSpacingInfo(const char *filename)
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
void LoadUnicharset(const char *filename)
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:463
void ComputeCloudFeatures(int feature_space_size)
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:236
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
const char * string() const
Definition: strngs.cpp:196
Definition: rect.h:34
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
const int kBlnXHeight
Definition: normalis.h:24
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
SVEventType
Definition: scrollview.h:45
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
GenericVector< int16_t > kerned_x_gaps
Definition: fontinfo.h:55
static STRING to_string(const char *unichar, int pos, int total, bool natural)
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
int x
Definition: scrollview.h:66
int get_total() const
Definition: unicharset.h:73
int size() const
Definition: bitvector.h:56
#define UNICHAR_LEN
Definition: unichar.h:31
uint32_t properties
Definition: fontinfo.h:118
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:452
int AddSample(const char *unichar, TrainingSample *sample)
bool contains(const T &object) const
bool Serialize(FILE *fp) const
TrainingSample * extract_sample(int index)
static void Update()
Definition: scrollview.cpp:711
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:535
int size() const
Definition: shapetable.h:200
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
UnicityTableEqEq< int > font_set
Definition: protos.h:67
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
void LoadPageImages(const char *filename)
FEATURE_DEFS_STRUCT feature_defs
int size() const
Definition: unicharset.h:336
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:386
const char * kGeoFeatureType
Definition: featdefs.cpp:35
bool Serialize(FILE *fp) const
GenericVector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:54
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:56
int get_index(const T &object) const
void init_spacing(int unicharset_size)
Definition: fontinfo.h:73
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
uint8_t NumConfigs
Definition: intproto.h:108
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it)
T & get(int index) const
void ReplicateAndRandomizeSamplesIfRequired()
NormalizationMode
Definition: normalis.h:42
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:656
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445
const int kMinClusteredShapes
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
const int kMaxUnicharsPerCluster
void init_to_size(int size, const T &t)
const float kInfiniteDist
int GetFontInfoId(const char *font_name)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
bool LoadFontInfo(const char *filename)
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1030
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
void set_pos(int p)
Definition: unicharset.h:69
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1628
int NumClassSamples(int font_id, int class_id, bool randomize) const
SVEventType type
Definition: scrollview.h:64
void IndexFeatures(const IntFeatureSpace &feature_space)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:531
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
const char * kMicroFeatureType
Definition: featdefs.cpp:32
bool is_ending() const
Definition: unicharset.h:109
int push_back(T object)
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
const UNICHARSET & unicharset() const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
virtual const ShapeTable * GetShapeTable() const =0
Definition: strngs.h:45
const T & get(int id) const
Return the object from an id.
bool Serialize(FILE *fp) const
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
Definition: mf.h:30
bool Serialize(FILE *fp) const
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1789
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:48
const float kFontMergeDistance
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:49
void clear()
Definition: unicharset.h:301
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
void Clear()
Definition: scrollview.cpp:591
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
void SetupFlatShapeTable(ShapeTable *shape_table)
int32_t universal_id
Definition: fontinfo.h:123
const char * kCNFeatureType
Definition: featdefs.cpp:33
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:513
const TrainingSample & GetSample() const
TrainingSample * mutable_sample(int index)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
int DivRounded(int a, int b)
Definition: helpers.h:162
#define ClassForClassId(T, c)
Definition: intproto.h:176
int y
Definition: scrollview.h:67
bool is_natural() const
Definition: unicharset.h:114
int NumShapes() const
Definition: shapetable.h:275
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:503
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
const char * kIntFeatureType
Definition: featdefs.cpp:34
int GetBestMatchingFontInfoId(const char *filename)
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
void Init(int size, bool all_mapped)
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:173
int XYToFeatureIndex(int x, int y) const
#define ASSERT_HOST(x)
Definition: errcode.h:84