tesseract  4.0.0-1-g2a2b
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #include "commontraining.h"
15 
16 #ifdef DISABLED_LEGACY_ENGINE
17 
18 #include <algorithm>
19 #include <cmath>
20 
21 #include "params.h"
22 #include "tessopt.h"
23 #include "tprintf.h"
24 
25 
26 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
27 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
28 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
29 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
30 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
31 STRING_PARAM_FLAG(X, "", "File listing font xheights");
32 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
33 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
34 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
35 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
36 
37 
50 void ParseArguments(int* argc, char ***argv) {
51  STRING usage;
52  if (*argc) {
53  usage += (*argv)[0];
54  usage += " -v | --version | ";
55  usage += (*argv)[0];
56  }
57  usage += " [.tr files ...]";
58  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
59 }
60 
61 #else
62 
63 #include <algorithm>
64 #include <cmath>
65 
66 #include "allheaders.h"
67 #include "ccutil.h"
68 #include "classify.h"
69 #include "cluster.h"
70 #include "clusttool.h"
71 #include "emalloc.h"
72 #include "featdefs.h"
73 #include "fontinfo.h"
74 #include "globals.h"
75 #include "intfeaturespace.h"
76 #include "mastertrainer.h"
77 #include "mf.h"
78 #include "oldlist.h"
79 #include "params.h"
80 #include "shapetable.h"
81 #include "tessdatamanager.h"
82 #include "tessopt.h"
83 #include "tprintf.h"
84 #include "unicity_table.h"
85 
86 using tesseract::CCUtil;
90 
91 // Global Variables.
92 
93 // global variable to hold configuration parameters to control clustering
94 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
95 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
98 
99 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
100 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
101 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
102 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
103 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
104 STRING_PARAM_FLAG(X, "", "File listing font xheights");
105 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
106 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
107 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
108 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
109 DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
110  "Min number of samples per proto as % of total");
111 DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
112  "Max percentage of samples in a cluster which have more"
113  " than 1 feature in that cluster");
114 DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
115  "Desired independence between dimensions");
116 DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
117  "Desired confidence in prototypes created");
118 
130 void ParseArguments(int* argc, char ***argv) {
131  STRING usage;
132  if (*argc) {
133  usage += (*argv)[0];
134  usage += " -v | --version | ";
135  usage += (*argv)[0];
136  }
137  usage += " [.tr files ...]";
138  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
139  // Record the index of the first non-flag argument to 1, since we set
140  // remove_flags to true when parsing the flags.
141  tessoptind = 1;
142  // Set some global values based on the flags.
144  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
146  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
148  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
150  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
151  // Set additional parameters from config file if specified.
152  if (!FLAGS_configfile.empty()) {
154  FLAGS_configfile.c_str(),
156  ccutil.params());
157  }
158 }
159 
160 namespace tesseract {
161 // Helper loads shape table from the given file.
162 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
163  ShapeTable* shape_table = nullptr;
164  STRING shape_table_file = file_prefix;
165  shape_table_file += kShapeTableFileSuffix;
166  TFile shape_fp;
167  if (shape_fp.Open(shape_table_file.string(), nullptr)) {
168  shape_table = new ShapeTable;
169  if (!shape_table->DeSerialize(&shape_fp)) {
170  delete shape_table;
171  shape_table = nullptr;
172  tprintf("Error: Failed to read shape table %s\n",
173  shape_table_file.string());
174  } else {
175  int num_shapes = shape_table->NumShapes();
176  tprintf("Read shape table %s of %d shapes\n",
177  shape_table_file.string(), num_shapes);
178  }
179  } else {
180  tprintf("Warning: No shape table file present: %s\n",
181  shape_table_file.string());
182  }
183  return shape_table;
184 }
185 
186 // Helper to write the shape_table.
187 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
188  STRING shape_table_file = file_prefix;
189  shape_table_file += kShapeTableFileSuffix;
190  FILE* fp = fopen(shape_table_file.string(), "wb");
191  if (fp != nullptr) {
192  if (!shape_table.Serialize(fp)) {
193  fprintf(stderr, "Error writing shape table: %s\n",
194  shape_table_file.string());
195  }
196  fclose(fp);
197  } else {
198  fprintf(stderr, "Error creating shape table: %s\n",
199  shape_table_file.string());
200  }
201 }
202 
219 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
220  bool replication,
221  ShapeTable** shape_table,
222  STRING* file_prefix) {
224  InitIntegerFX();
225  *file_prefix = "";
226  if (!FLAGS_D.empty()) {
227  *file_prefix += FLAGS_D.c_str();
228  *file_prefix += "/";
229  }
230  // If we are shape clustering (nullptr shape_table) or we successfully load
231  // a shape_table written by a previous shape clustering, then
232  // shape_analysis will be true, meaning that the MasterTrainer will replace
233  // some members of the unicharset with their fragments.
234  bool shape_analysis = false;
235  if (shape_table != nullptr) {
236  *shape_table = LoadShapeTable(*file_prefix);
237  if (*shape_table != nullptr) shape_analysis = true;
238  } else {
239  shape_analysis = true;
240  }
242  shape_analysis,
243  replication,
244  FLAGS_debug_level);
245  IntFeatureSpace fs;
247  trainer->LoadUnicharset(FLAGS_U.c_str());
248  // Get basic font information from font_properties.
249  if (!FLAGS_F.empty()) {
250  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
251  delete trainer;
252  return nullptr;
253  }
254  }
255  if (!FLAGS_X.empty()) {
256  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
257  delete trainer;
258  return nullptr;
259  }
260  }
261  trainer->SetFeatureSpace(fs);
262  const char* page_name;
263  // Load training data from .tr files on the command line.
264  while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
265  tprintf("Reading %s ...\n", page_name);
266  trainer->ReadTrainingSamples(page_name, feature_defs, false);
267 
268  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
269  // read font spacing information in to fontinfo_table.
270  int pagename_len = strlen(page_name);
271  char* fontinfo_file_name = new char[pagename_len + 7];
272  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
273  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
274  trainer->AddSpacingInfo(fontinfo_file_name);
275  delete[] fontinfo_file_name;
276 
277  // Load the images into memory if required by the classifier.
278  if (FLAGS_load_images) {
279  STRING image_name = page_name;
280  // Chop off the tr and replace with tif. Extension must be tif!
281  image_name.truncate_at(image_name.length() - 2);
282  image_name += "tif";
283  trainer->LoadPageImages(image_name.string());
284  }
285  }
286  trainer->PostLoadCleanup();
287  // Write the master trainer if required.
288  if (!FLAGS_output_trainer.empty()) {
289  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
290  if (fp == nullptr) {
291  tprintf("Can't create saved trainer data!\n");
292  } else {
293  trainer->Serialize(fp);
294  fclose(fp);
295  }
296  }
297  trainer->PreTrainingSetup();
298  if (!FLAGS_O.empty() &&
299  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
300  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
301  delete trainer;
302  return nullptr;
303  }
304  if (shape_table != nullptr) {
305  // If we previously failed to load a shapetable, then shape clustering
306  // wasn't run so make a flat one now.
307  if (*shape_table == nullptr) {
308  *shape_table = new ShapeTable;
309  trainer->SetupFlatShapeTable(*shape_table);
310  tprintf("Flat shape table summary: %s\n",
311  (*shape_table)->SummaryStr().string());
312  }
313  (*shape_table)->set_unicharset(trainer->unicharset());
314  }
315  return trainer;
316 }
317 
318 } // namespace tesseract.
319 
320 /*---------------------------------------------------------------------------*/
331 const char *GetNextFilename(int argc, const char* const * argv) {
332  if (tessoptind < argc)
333  return argv[tessoptind++];
334  else
335  return nullptr;
336 } /* GetNextFilename */
337 
338 /*---------------------------------------------------------------------------*/
348 LABELEDLIST FindList(LIST List, char* Label) {
349  LABELEDLIST LabeledList;
350 
351  iterate (List)
352  {
353  LabeledList = (LABELEDLIST) first_node (List);
354  if (strcmp (LabeledList->Label, Label) == 0)
355  return (LabeledList);
356  }
357  return (nullptr);
358 
359 } /* FindList */
360 
361 /*---------------------------------------------------------------------------*/
369 LABELEDLIST NewLabeledList(const char* Label) {
370  LABELEDLIST LabeledList;
371 
372  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
373  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
374  strcpy (LabeledList->Label, Label);
375  LabeledList->List = NIL_LIST;
376  LabeledList->SampleCount = 0;
377  LabeledList->font_sample_count = 0;
378  return (LabeledList);
379 
380 } /* NewLabeledList */
381 
382 /*---------------------------------------------------------------------------*/
383 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
384 // the new method or get rid of it entirely.
399 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_definitions,
400  const char *feature_name, int max_samples,
401  UNICHARSET* unicharset,
402  FILE* file, LIST* training_samples) {
403  char buffer[2048];
404  char unichar[UNICHAR_LEN + 1];
405  LABELEDLIST char_sample;
406  FEATURE_SET feature_samples;
407  CHAR_DESC char_desc;
408  uint32_t feature_type =
409  ShortNameToFeatureType(feature_definitions, feature_name);
410 
411  // Zero out the font_sample_count for all the classes.
412  LIST it = *training_samples;
413  iterate(it) {
414  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
415  char_sample->font_sample_count = 0;
416  }
417 
418  while (fgets(buffer, 2048, file) != nullptr) {
419  if (buffer[0] == '\n')
420  continue;
421 
422  sscanf(buffer, "%*s %s", unichar);
423  if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
424  unicharset->unichar_insert(unichar);
425  if (unicharset->size() > MAX_NUM_CLASSES) {
426  tprintf("Error: Size of unicharset in training is "
427  "greater than MAX_NUM_CLASSES\n");
428  exit(1);
429  }
430  }
431  char_sample = FindList(*training_samples, unichar);
432  if (char_sample == nullptr) {
433  char_sample = NewLabeledList(unichar);
434  *training_samples = push(*training_samples, char_sample);
435  }
436  char_desc = ReadCharDescription(feature_definitions, file);
437  feature_samples = char_desc->FeatureSets[feature_type];
438  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
439  char_sample->List = push(char_sample->List, feature_samples);
440  char_sample->SampleCount++;
441  char_sample->font_sample_count++;
442  } else {
443  FreeFeatureSet(feature_samples);
444  }
445  for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
446  if (feature_type != i)
447  FreeFeatureSet(char_desc->FeatureSets[i]);
448  }
449  free(char_desc);
450  }
451 } // ReadTrainingSamples
452 
453 
454 /*---------------------------------------------------------------------------*/
462 void FreeTrainingSamples(LIST CharList) {
463  LABELEDLIST char_sample;
464  FEATURE_SET FeatureSet;
465  LIST FeatureList;
466 
467  LIST nodes = CharList;
468  iterate(CharList) { /* iterate through all of the fonts */
469  char_sample = (LABELEDLIST) first_node(CharList);
470  FeatureList = char_sample->List;
471  iterate(FeatureList) { /* iterate through all of the classes */
472  FeatureSet = (FEATURE_SET) first_node(FeatureList);
473  FreeFeatureSet(FeatureSet);
474  }
475  FreeLabeledList(char_sample);
476  }
477  destroy(nodes);
478 } /* FreeTrainingSamples */
479 
480 /*---------------------------------------------------------------------------*/
489 void FreeLabeledList(LABELEDLIST LabeledList) {
490  destroy(LabeledList->List);
491  free(LabeledList->Label);
492  free(LabeledList);
493 } /* FreeLabeledList */
494 
495 /*---------------------------------------------------------------------------*/
508  LABELEDLIST char_sample,
509  const char* program_feature_type) {
510  uint16_t N;
511  int i, j;
512  float* Sample = nullptr;
513  CLUSTERER *Clusterer;
514  int32_t CharID;
515  LIST FeatureList = nullptr;
516  FEATURE_SET FeatureSet = nullptr;
517 
518  int32_t desc_index =
519  ShortNameToFeatureType(FeatureDefs, program_feature_type);
520  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
521  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
522 
523  FeatureList = char_sample->List;
524  CharID = 0;
525  iterate(FeatureList) {
526  FeatureSet = (FEATURE_SET) first_node(FeatureList);
527  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
528  if (Sample == nullptr) Sample = (float*)Emalloc(N * sizeof(float));
529  for (j = 0; j < N; j++)
530  Sample[j] = FeatureSet->Features[i]->Params[j];
531  MakeSample (Clusterer, Sample, CharID);
532  }
533  CharID++;
534  }
535  free(Sample);
536  return Clusterer;
537 
538 } /* SetUpForClustering */
539 
540 /*------------------------------------------------------------------------*/
541 void MergeInsignificantProtos(LIST ProtoList, const char* label,
542  CLUSTERER* Clusterer,
543  CLUSTERCONFIG* clusterconfig) {
544  PROTOTYPE* Prototype;
545  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
546 
547  LIST pProtoList = ProtoList;
548  iterate(pProtoList) {
549  Prototype = (PROTOTYPE *) first_node (pProtoList);
550  if (Prototype->Significant || Prototype->Merged)
551  continue;
552  float best_dist = 0.125;
553  PROTOTYPE* best_match = nullptr;
554  // Find the nearest alive prototype.
555  LIST list_it = ProtoList;
556  iterate(list_it) {
557  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
558  if (test_p != Prototype && !test_p->Merged) {
559  float dist = ComputeDistance(Clusterer->SampleSize,
560  Clusterer->ParamDesc,
561  Prototype->Mean, test_p->Mean);
562  if (dist < best_dist) {
563  best_match = test_p;
564  best_dist = dist;
565  }
566  }
567  }
568  if (best_match != nullptr && !best_match->Significant) {
569  if (debug)
570  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
571  best_match->NumSamples, Prototype->NumSamples,
572  best_match->Mean[0], best_match->Mean[1],
573  Prototype->Mean[0], Prototype->Mean[1]);
574  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
575  Clusterer->ParamDesc,
576  best_match->NumSamples,
577  Prototype->NumSamples,
578  best_match->Mean,
579  best_match->Mean, Prototype->Mean);
580  Prototype->NumSamples = 0;
581  Prototype->Merged = 1;
582  } else if (best_match != nullptr) {
583  if (debug)
584  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
585  Prototype->Mean[0], Prototype->Mean[1],
586  best_match->Mean[0], best_match->Mean[1]);
587  Prototype->Merged = 1;
588  }
589  }
590  // Mark significant those that now have enough samples.
591  int min_samples =
592  static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
593  pProtoList = ProtoList;
594  iterate(pProtoList) {
595  Prototype = (PROTOTYPE *) first_node (pProtoList);
596  // Process insignificant protos that do not match a green one
597  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
598  !Prototype->Merged) {
599  if (debug)
600  tprintf("Red proto at %g,%g becoming green\n",
601  Prototype->Mean[0], Prototype->Mean[1]);
602  Prototype->Significant = true;
603  }
604  }
605 } /* MergeInsignificantProtos */
606 
607 /*-----------------------------------------------------------------------------*/
609  LIST ProtoList)
610 {
611  PROTOTYPE* Prototype;
612 
613  iterate(ProtoList)
614  {
615  Prototype = (PROTOTYPE *) first_node (ProtoList);
616  free(Prototype->Variance.Elliptical);
617  Prototype->Variance.Elliptical = nullptr;
618  free(Prototype->Magnitude.Elliptical);
619  Prototype->Magnitude.Elliptical = nullptr;
620  free(Prototype->Weight.Elliptical);
621  Prototype->Weight.Elliptical = nullptr;
622  }
623 }
624 
625 /*------------------------------------------------------------------------*/
627  LIST ProtoList,
628  bool KeepSigProtos,
629  bool KeepInsigProtos,
630  int N)
631 
632 {
633  LIST NewProtoList = NIL_LIST;
634  LIST pProtoList;
635  PROTOTYPE* Proto;
636  PROTOTYPE* NewProto;
637  int i;
638 
639  pProtoList = ProtoList;
640  iterate(pProtoList)
641  {
642  Proto = (PROTOTYPE *) first_node (pProtoList);
643  if ((Proto->Significant && KeepSigProtos) ||
644  (!Proto->Significant && KeepInsigProtos))
645  {
646  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
647 
648  NewProto->Mean = (float *)Emalloc(N * sizeof(float));
649  NewProto->Significant = Proto->Significant;
650  NewProto->Style = Proto->Style;
651  NewProto->NumSamples = Proto->NumSamples;
652  NewProto->Cluster = nullptr;
653  NewProto->Distrib = nullptr;
654 
655  for (i=0; i < N; i++)
656  NewProto->Mean[i] = Proto->Mean[i];
657  if (Proto->Variance.Elliptical != nullptr) {
658  NewProto->Variance.Elliptical = (float *)Emalloc(N * sizeof(float));
659  for (i=0; i < N; i++)
660  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
661  }
662  else
663  NewProto->Variance.Elliptical = nullptr;
664  //---------------------------------------------
665  if (Proto->Magnitude.Elliptical != nullptr) {
666  NewProto->Magnitude.Elliptical = (float *)Emalloc(N * sizeof(float));
667  for (i=0; i < N; i++)
668  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
669  }
670  else
671  NewProto->Magnitude.Elliptical = nullptr;
672  //------------------------------------------------
673  if (Proto->Weight.Elliptical != nullptr) {
674  NewProto->Weight.Elliptical = (float *)Emalloc(N * sizeof(float));
675  for (i=0; i < N; i++)
676  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
677  }
678  else
679  NewProto->Weight.Elliptical = nullptr;
680 
681  NewProto->TotalMagnitude = Proto->TotalMagnitude;
682  NewProto->LogMagnitude = Proto->LogMagnitude;
683  NewProtoList = push_last(NewProtoList, NewProto);
684  }
685  }
686  FreeProtoList(&ProtoList);
687  return (NewProtoList);
688 } /* RemoveInsignificantProtos */
689 
690 /*----------------------------------------------------------------------------*/
691 MERGE_CLASS FindClass(LIST List, const char* Label) {
692  MERGE_CLASS MergeClass;
693 
694  iterate (List)
695  {
696  MergeClass = (MERGE_CLASS) first_node (List);
697  if (strcmp (MergeClass->Label, Label) == 0)
698  return (MergeClass);
699  }
700  return (nullptr);
701 
702 } /* FindClass */
703 
704 /*---------------------------------------------------------------------------*/
705 MERGE_CLASS NewLabeledClass(const char* Label) {
706  MERGE_CLASS MergeClass;
707 
708  MergeClass = new MERGE_CLASS_NODE;
709  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
710  strcpy (MergeClass->Label, Label);
711  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
712  return (MergeClass);
713 
714 } /* NewLabeledClass */
715 
716 /*-----------------------------------------------------------------------------*/
724 void FreeLabeledClassList(LIST ClassList) {
725  MERGE_CLASS MergeClass;
726 
727  LIST nodes = ClassList;
728  iterate(ClassList) /* iterate through all of the fonts */
729  {
730  MergeClass = (MERGE_CLASS) first_node (ClassList);
731  free (MergeClass->Label);
732  FreeClass(MergeClass->Class);
733  delete MergeClass;
734  }
735  destroy(nodes);
736 
737 } /* FreeLabeledClassList */
738 
739 /* SetUpForFloat2Int */
741  LIST LabeledClassList) {
742  MERGE_CLASS MergeClass;
743  CLASS_TYPE Class;
744  int NumProtos;
745  int NumConfigs;
746  int NumWords;
747  int i, j;
748  float Values[3];
749  PROTO NewProto;
750  PROTO OldProto;
751  BIT_VECTOR NewConfig;
752  BIT_VECTOR OldConfig;
753 
754  // printf("Float2Int ...\n");
755 
756  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
757  iterate(LabeledClassList)
758  {
759  UnicityTableEqEq<int> font_set;
760  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
761  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
762  NumProtos = MergeClass->Class->NumProtos;
763  NumConfigs = MergeClass->Class->NumConfigs;
764  font_set.move(&MergeClass->Class->font_set);
765  Class->NumProtos = NumProtos;
766  Class->MaxNumProtos = NumProtos;
767  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
768  for(i=0; i < NumProtos; i++)
769  {
770  NewProto = ProtoIn(Class, i);
771  OldProto = ProtoIn(MergeClass->Class, i);
772  Values[0] = OldProto->X;
773  Values[1] = OldProto->Y;
774  Values[2] = OldProto->Angle;
775  Normalize(Values);
776  NewProto->X = OldProto->X;
777  NewProto->Y = OldProto->Y;
778  NewProto->Length = OldProto->Length;
779  NewProto->Angle = OldProto->Angle;
780  NewProto->A = Values[0];
781  NewProto->B = Values[1];
782  NewProto->C = Values[2];
783  }
784 
785  Class->NumConfigs = NumConfigs;
786  Class->MaxNumConfigs = NumConfigs;
787  Class->font_set.move(&font_set);
788  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
789  NumWords = WordsInVectorOfSize(NumProtos);
790  for(i=0; i < NumConfigs; i++)
791  {
792  NewConfig = NewBitVector(NumProtos);
793  OldConfig = MergeClass->Class->Configurations[i];
794  for(j=0; j < NumWords; j++)
795  NewConfig[j] = OldConfig[j];
796  Class->Configurations[i] = NewConfig;
797  }
798  }
799  return float_classes;
800 } // SetUpForFloat2Int
801 
802 /*--------------------------------------------------------------------------*/
// Rewrites Values in place.
// On entry Values holds (x, y, angle); the slope is tan(angle * 2 * pi).
// On exit Values holds (slope * k, -k, intercept * k), where
// intercept = y - slope * x and k = 1 / sqrt(slope^2 + 1).
void Normalize(float *Values) {
  const float slope = tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  const float scale = 1 / sqrt(slope * slope + 1.0);

  Values[0] = slope * scale;
  Values[1] = -scale;
  Values[2] = intercept * scale;
} // Normalize
818 
819 /*-------------------------------------------------------------------------*/
820 void FreeNormProtoList(LIST CharList)
821 
822 {
823  LABELEDLIST char_sample;
824 
825  LIST nodes = CharList;
826  iterate(CharList) /* iterate through all of the fonts */
827  {
828  char_sample = (LABELEDLIST) first_node (CharList);
829  FreeLabeledList (char_sample);
830  }
831  destroy(nodes);
832 
833 } // FreeNormProtoList
834 
835 /*---------------------------------------------------------------------------*/
837  LIST* NormProtoList,
838  LIST ProtoList,
839  char* CharName)
840 {
841  PROTOTYPE* Proto;
842  LABELEDLIST LabeledProtoList;
843 
844  LabeledProtoList = NewLabeledList(CharName);
845  iterate(ProtoList)
846  {
847  Proto = (PROTOTYPE *) first_node (ProtoList);
848  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
849  }
850  *NormProtoList = push(*NormProtoList, LabeledProtoList);
851 }
852 
853 /*---------------------------------------------------------------------------*/
854 int NumberOfProtos(LIST ProtoList, bool CountSigProtos,
855  bool CountInsigProtos) {
856  int N = 0;
857  iterate(ProtoList)
858  {
859  PROTOTYPE* Proto = (PROTOTYPE*)first_node(ProtoList);
860  if ((Proto->Significant && CountSigProtos) ||
861  (!Proto->Significant && CountInsigProtos))
862  N++;
863  }
864  return(N);
865 }
866 
867 #endif // def DISABLED_LEGACY_ENGINE
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
int16_t NumProtos
Definition: protos.h:61
bool LoadXHeights(const char *filename)
CLUSTERCONFIG Config
float * Mean
Definition: cluster.h:78
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
void move(UnicityTable< T > *from)
void FreeLabeledList(LABELEDLIST LabeledList)
float X
Definition: protos.h:46
const UNICHARSET & unicharset() const
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples, "Min number of samples per proto as % of total")
const int kBoostXYBuckets
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
float MinSamples
Definition: cluster.h:50
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
float B
Definition: protos.h:44
bool AddSpacingInfo(const char *filename)
void LoadUnicharset(const char *filename)
PARAM_DESC * ParamDesc
Definition: cluster.h:88
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:236
const char * string() const
Definition: strngs.cpp:196
PROTO_STRUCT * PROTO
Definition: protos.h:51
void InitIntegerFX()
Definition: intfx.cpp:53
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
LABELEDLIST NewLabeledList(const char *Label)
void ParseArguments(int *argc, char ***argv)
void * Emalloc(int Size)
Definition: emalloc.cpp:31
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
float TotalMagnitude
Definition: cluster.h:79
const int kBoostDirBuckets
unsigned Merged
Definition: cluster.h:69
#define UNICHAR_LEN
Definition: unichar.h:31
PROTO Prototypes
Definition: protos.h:63
LIST destroy(LIST list)
Definition: oldlist.cpp:170
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:452
float * Elliptical
Definition: cluster.h:64
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
struct LABELEDLISTNODE * LABELEDLIST
int16_t MaxNumProtos
Definition: protos.h:62
UnicityTableEqEq< int > font_set
Definition: protos.h:67
FLOATUNION Weight
Definition: cluster.h:83
#define ProtoIn(Class, Pid)
Definition: protos.h:121
void LoadPageImages(const char *filename)
int16_t NumConfigs
Definition: protos.h:64
float Params[1]
Definition: ocrfeatures.h:62
FEATURE_DEFS_STRUCT feature_defs
int size() const
Definition: unicharset.h:336
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
float Y
Definition: protos.h:47
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:56
float Length
Definition: protos.h:49
const char * c_str() const
Definition: strngs.cpp:207
uint16_t MaxNumFeatures
Definition: ocrfeatures.h:68
FEATURE Features[1]
Definition: ocrfeatures.h:69
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:212
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
MERGE_CLASS FindClass(LIST List, const char *Label)
#define MAX_NUM_PROTOS
Definition: intproto.h:48
float MaxIllegal
Definition: cluster.h:51
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
DISTRIBUTION * Distrib
Definition: cluster.h:77
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:82
unsigned Style
Definition: cluster.h:74
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
MERGE_CLASS NewLabeledClass(const char *Label)
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
void FreeNormProtoList(LIST CharList)
int16_t MaxNumConfigs
Definition: protos.h:65
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:82
unsigned Significant
Definition: cluster.h:68
float C
Definition: protos.h:45
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
bool LoadFontInfo(const char *filename)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:297
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:42
ParamsVectors * params()
Definition: ccutil.h:62
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:852
double Confidence
Definition: cluster.h:54
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:71
FLOATUNION Magnitude
Definition: cluster.h:82
float Independence
Definition: cluster.h:53
void truncate_at(int32_t index)
Definition: strngs.cpp:267
STRING_PARAM_FLAG(configfile, "", "File to load more configs from")
int32_t NumChar
Definition: cluster.h:93
void FreeTrainingSamples(LIST CharList)
void FreeLabeledClassList(LIST ClassList)
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:196
#define first_node(l)
Definition: oldlist.h:141
LABELEDLIST FindList(LIST List, char *Label)
#define NIL_LIST
Definition: oldlist.h:127
CLUSTER * Cluster
Definition: cluster.h:76
uint32_t NumFeatureSets
Definition: featdefs.h:41
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
CLASS_TYPE Class
float Angle
Definition: protos.h:48
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:65
Definition: strngs.h:45
bool Serialize(FILE *fp) const
#define iterate(l)
Definition: oldlist.h:161
unsigned NumSamples
Definition: cluster.h:75
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:48
void CleanUpUnusedData(LIST ProtoList)
const char * GetNextFilename(int argc, const char *const *argv)
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
float LogMagnitude
Definition: cluster.h:80
MERGE_CLASS_NODE * MERGE_CLASS
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:244
void SetupFlatShapeTable(ShapeTable *shape_table)
CONFIGS Configurations
Definition: protos.h:66
void Normalize(float *Values)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int16_t SampleSize
Definition: cluster.h:87
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:450
int tessoptind
Definition: tessopt.cpp:24
int NumShapes() const
Definition: shapetable.h:275
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
CCUtil ccutil
int32_t length() const
Definition: strngs.cpp:191
float A
Definition: protos.h:43
FLOATUNION Variance
Definition: cluster.h:81
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
ShapeTable * LoadShapeTable(const STRING &file_prefix)