tesseract  5.0.0-alpha-619-ge9db
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #define _USE_MATH_DEFINES // for M_PI
15 #include "commontraining.h"
16 #include <algorithm>
17 #include <cmath> // for M_PI
18 
19 #ifdef DISABLED_LEGACY_ENGINE
20 
21 #include "params.h"
22 #include "tessopt.h"
23 #include "tprintf.h"
24 
25 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
26 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
27 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
28 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
29 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
30 STRING_PARAM_FLAG(X, "", "File listing font xheights");
31 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
32 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
33 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
34 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
35 
47 void ParseArguments(int* argc, char ***argv) {
48  STRING usage;
49  if (*argc) {
50  usage += (*argv)[0];
51  usage += " -v | --version | ";
52  usage += (*argv)[0];
53  }
54  usage += " [.tr files ...]";
55  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
56 }
57 
58 #else
59 
60 #include "allheaders.h"
61 #include "ccutil.h"
62 #include "classify.h"
63 #include "cluster.h"
64 #include "clusttool.h"
65 #include "emalloc.h"
66 #include "featdefs.h"
67 #include "fontinfo.h"
68 #include "intfeaturespace.h"
69 #include "mastertrainer.h"
70 #include "mf.h"
71 #include "oldlist.h"
72 #include "params.h"
73 #include "shapetable.h"
74 #include "tessdatamanager.h"
75 #include "tessopt.h"
76 #include "tprintf.h"
77 #include "unicity_table.h"
78 
79 using tesseract::CCUtil;
83 
84 // Global Variables.
85 
86 // global variable to hold configuration parameters to control clustering
87 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
88 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
90 static CCUtil ccutil;
91 
92 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
93 static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
94 static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
95 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
96 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
97 STRING_PARAM_FLAG(X, "", "File listing font xheights");
98 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
99 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
100 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
101 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
102 static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
103  "Min number of samples per proto as % of total");
104 static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
105  "Max percentage of samples in a cluster which have more"
106  " than 1 feature in that cluster");
107 static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
108  "Desired independence between dimensions");
109 static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
110  "Desired confidence in prototypes created");
111 
122 void ParseArguments(int* argc, char ***argv) {
123  STRING usage;
124  if (*argc) {
125  usage += (*argv)[0];
126  usage += " -v | --version | ";
127  usage += (*argv)[0];
128  }
129  usage += " [.tr files ...]";
130  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
131  // Record the index of the first non-flag argument to 1, since we set
132  // remove_flags to true when parsing the flags.
133  tessoptind = 1;
134  // Set some global values based on the flags.
136  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
138  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
140  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
142  std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
143  // Set additional parameters from config file if specified.
144  if (!FLAGS_configfile.empty()) {
146  FLAGS_configfile.c_str(),
148  ccutil.params());
149  }
150 }
151 
152 namespace tesseract {
153 // Helper loads shape table from the given file.
154 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
155  ShapeTable* shape_table = nullptr;
156  STRING shape_table_file = file_prefix;
157  shape_table_file += kShapeTableFileSuffix;
158  TFile shape_fp;
159  if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {
160  shape_table = new ShapeTable;
161  if (!shape_table->DeSerialize(&shape_fp)) {
162  delete shape_table;
163  shape_table = nullptr;
164  tprintf("Error: Failed to read shape table %s\n",
165  shape_table_file.c_str());
166  } else {
167  int num_shapes = shape_table->NumShapes();
168  tprintf("Read shape table %s of %d shapes\n",
169  shape_table_file.c_str(), num_shapes);
170  }
171  } else {
172  tprintf("Warning: No shape table file present: %s\n",
173  shape_table_file.c_str());
174  }
175  return shape_table;
176 }
177 
178 // Helper to write the shape_table.
179 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
180  STRING shape_table_file = file_prefix;
181  shape_table_file += kShapeTableFileSuffix;
182  FILE* fp = fopen(shape_table_file.c_str(), "wb");
183  if (fp != nullptr) {
184  if (!shape_table.Serialize(fp)) {
185  fprintf(stderr, "Error writing shape table: %s\n",
186  shape_table_file.c_str());
187  }
188  fclose(fp);
189  } else {
190  fprintf(stderr, "Error creating shape table: %s\n",
191  shape_table_file.c_str());
192  }
193 }
194 
211 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
212  bool replication,
213  ShapeTable** shape_table,
214  STRING* file_prefix) {
216  InitIntegerFX();
217  *file_prefix = "";
218  if (!FLAGS_D.empty()) {
219  *file_prefix += FLAGS_D.c_str();
220  *file_prefix += "/";
221  }
222  // If we are shape clustering (nullptr shape_table) or we successfully load
223  // a shape_table written by a previous shape clustering, then
224  // shape_analysis will be true, meaning that the MasterTrainer will replace
225  // some members of the unicharset with their fragments.
226  bool shape_analysis = false;
227  if (shape_table != nullptr) {
228  *shape_table = LoadShapeTable(*file_prefix);
229  if (*shape_table != nullptr) shape_analysis = true;
230  } else {
231  shape_analysis = true;
232  }
234  shape_analysis,
235  replication,
236  FLAGS_debug_level);
237  IntFeatureSpace fs;
239  trainer->LoadUnicharset(FLAGS_U.c_str());
240  // Get basic font information from font_properties.
241  if (!FLAGS_F.empty()) {
242  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
243  delete trainer;
244  return nullptr;
245  }
246  }
247  if (!FLAGS_X.empty()) {
248  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
249  delete trainer;
250  return nullptr;
251  }
252  }
253  trainer->SetFeatureSpace(fs);
254  const char* page_name;
255  // Load training data from .tr files on the command line.
256  while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
257  tprintf("Reading %s ...\n", page_name);
258  trainer->ReadTrainingSamples(page_name, feature_defs, false);
259 
260  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
261  // read font spacing information in to fontinfo_table.
262  int pagename_len = strlen(page_name);
263  char* fontinfo_file_name = new char[pagename_len + 7];
264  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
265  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
266  trainer->AddSpacingInfo(fontinfo_file_name);
267  delete[] fontinfo_file_name;
268 
269  // Load the images into memory if required by the classifier.
270  if (FLAGS_load_images) {
271  STRING image_name = page_name;
272  // Chop off the tr and replace with tif. Extension must be tif!
273  image_name.truncate_at(image_name.length() - 2);
274  image_name += "tif";
275  trainer->LoadPageImages(image_name.c_str());
276  }
277  }
278  trainer->PostLoadCleanup();
279  // Write the master trainer if required.
280  if (!FLAGS_output_trainer.empty()) {
281  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
282  if (fp == nullptr) {
283  tprintf("Can't create saved trainer data!\n");
284  } else {
285  trainer->Serialize(fp);
286  fclose(fp);
287  }
288  }
289  trainer->PreTrainingSetup();
290  if (!FLAGS_O.empty() &&
291  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
292  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
293  delete trainer;
294  return nullptr;
295  }
296  if (shape_table != nullptr) {
297  // If we previously failed to load a shapetable, then shape clustering
298  // wasn't run so make a flat one now.
299  if (*shape_table == nullptr) {
300  *shape_table = new ShapeTable;
301  trainer->SetupFlatShapeTable(*shape_table);
302  tprintf("Flat shape table summary: %s\n",
303  (*shape_table)->SummaryStr().c_str());
304  }
305  (*shape_table)->set_unicharset(trainer->unicharset());
306  }
307  return trainer;
308 }
309 
310 } // namespace tesseract.
311 
312 /*---------------------------------------------------------------------------*/
323 const char *GetNextFilename(int argc, const char* const * argv) {
324  if (tessoptind < argc)
325  return argv[tessoptind++];
326  else
327  return nullptr;
328 } /* GetNextFilename */
329 
330 /*---------------------------------------------------------------------------*/
340 LABELEDLIST FindList(LIST List, char* Label) {
341  LABELEDLIST LabeledList;
342 
343  iterate (List)
344  {
345  LabeledList = reinterpret_cast<LABELEDLIST>first_node (List);
346  if (strcmp (LabeledList->Label, Label) == 0)
347  return (LabeledList);
348  }
349  return (nullptr);
350 
351 } /* FindList */
352 
353 /*---------------------------------------------------------------------------*/
361 LABELEDLIST NewLabeledList(const char* Label) {
362  LABELEDLIST LabeledList;
363 
364  LabeledList = static_cast<LABELEDLIST>(Emalloc (sizeof (LABELEDLISTNODE)));
365  LabeledList->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
366  strcpy (LabeledList->Label, Label);
367  LabeledList->List = NIL_LIST;
368  LabeledList->SampleCount = 0;
369  LabeledList->font_sample_count = 0;
370  return (LabeledList);
371 
372 } /* NewLabeledList */
373 
374 /*---------------------------------------------------------------------------*/
375 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
376 // the new method or get rid of it entirely.
389 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_definitions,
390  const char *feature_name, int max_samples,
391  UNICHARSET* unicharset,
392  FILE* file, LIST* training_samples) {
393  char buffer[2048];
394  char unichar[UNICHAR_LEN + 1];
395  LABELEDLIST char_sample;
396  FEATURE_SET feature_samples;
397  CHAR_DESC char_desc;
398  uint32_t feature_type =
399  ShortNameToFeatureType(feature_definitions, feature_name);
400 
401  // Zero out the font_sample_count for all the classes.
402  LIST it = *training_samples;
403  iterate(it) {
404  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
405  char_sample->font_sample_count = 0;
406  }
407 
408  while (fgets(buffer, 2048, file) != nullptr) {
409  if (buffer[0] == '\n')
410  continue;
411 
412  sscanf(buffer, "%*s %s", unichar);
413  if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
414  unicharset->unichar_insert(unichar);
415  if (unicharset->size() > MAX_NUM_CLASSES) {
416  tprintf("Error: Size of unicharset in training is "
417  "greater than MAX_NUM_CLASSES\n");
418  exit(1);
419  }
420  }
421  char_sample = FindList(*training_samples, unichar);
422  if (char_sample == nullptr) {
423  char_sample = NewLabeledList(unichar);
424  *training_samples = push(*training_samples, char_sample);
425  }
426  char_desc = ReadCharDescription(feature_definitions, file);
427  feature_samples = char_desc->FeatureSets[feature_type];
428  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
429  char_sample->List = push(char_sample->List, feature_samples);
430  char_sample->SampleCount++;
431  char_sample->font_sample_count++;
432  } else {
433  FreeFeatureSet(feature_samples);
434  }
435  for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
436  if (feature_type != i)
437  FreeFeatureSet(char_desc->FeatureSets[i]);
438  }
439  free(char_desc);
440  }
441 } // ReadTrainingSamples
442 
443 
444 /*---------------------------------------------------------------------------*/
450 void FreeTrainingSamples(LIST CharList) {
451  LABELEDLIST char_sample;
452  FEATURE_SET FeatureSet;
453  LIST FeatureList;
454 
455  LIST nodes = CharList;
456  iterate(CharList) { /* iterate through all of the fonts */
457  char_sample = reinterpret_cast<LABELEDLIST>first_node(CharList);
458  FeatureList = char_sample->List;
459  iterate(FeatureList) { /* iterate through all of the classes */
460  FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
461  FreeFeatureSet(FeatureSet);
462  }
463  FreeLabeledList(char_sample);
464  }
465  destroy(nodes);
466 } /* FreeTrainingSamples */
467 
468 /*---------------------------------------------------------------------------*/
476 void FreeLabeledList(LABELEDLIST LabeledList) {
477  destroy(LabeledList->List);
478  free(LabeledList->Label);
479  free(LabeledList);
480 } /* FreeLabeledList */
481 
482 /*---------------------------------------------------------------------------*/
495  LABELEDLIST char_sample,
496  const char* program_feature_type) {
497  uint16_t N;
498  int i, j;
499  float* Sample = nullptr;
500  CLUSTERER *Clusterer;
501  int32_t CharID;
502  LIST FeatureList = nullptr;
503  FEATURE_SET FeatureSet = nullptr;
504 
505  int32_t desc_index =
506  ShortNameToFeatureType(FeatureDefs, program_feature_type);
507  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
508  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
509 
510  FeatureList = char_sample->List;
511  CharID = 0;
512  iterate(FeatureList) {
513  FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
514  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
515  if (Sample == nullptr) Sample = static_cast<float*>(Emalloc(N * sizeof(float)));
516  for (j = 0; j < N; j++)
517  Sample[j] = FeatureSet->Features[i]->Params[j];
518  MakeSample (Clusterer, Sample, CharID);
519  }
520  CharID++;
521  }
522  free(Sample);
523  return Clusterer;
524 
525 } /* SetUpForClustering */
526 
527 /*------------------------------------------------------------------------*/
528 void MergeInsignificantProtos(LIST ProtoList, const char* label,
529  CLUSTERER* Clusterer,
530  CLUSTERCONFIG* clusterconfig) {
531  PROTOTYPE* Prototype;
532  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
533 
534  LIST pProtoList = ProtoList;
535  iterate(pProtoList) {
536  Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
537  if (Prototype->Significant || Prototype->Merged)
538  continue;
539  float best_dist = 0.125;
540  PROTOTYPE* best_match = nullptr;
541  // Find the nearest alive prototype.
542  LIST list_it = ProtoList;
543  iterate(list_it) {
544  PROTOTYPE* test_p = reinterpret_cast<PROTOTYPE *>first_node (list_it);
545  if (test_p != Prototype && !test_p->Merged) {
546  float dist = ComputeDistance(Clusterer->SampleSize,
547  Clusterer->ParamDesc,
548  Prototype->Mean, test_p->Mean);
549  if (dist < best_dist) {
550  best_match = test_p;
551  best_dist = dist;
552  }
553  }
554  }
555  if (best_match != nullptr && !best_match->Significant) {
556  if (debug)
557  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
558  best_match->NumSamples, Prototype->NumSamples,
559  best_match->Mean[0], best_match->Mean[1],
560  Prototype->Mean[0], Prototype->Mean[1]);
561  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
562  Clusterer->ParamDesc,
563  best_match->NumSamples,
564  Prototype->NumSamples,
565  best_match->Mean,
566  best_match->Mean, Prototype->Mean);
567  Prototype->NumSamples = 0;
568  Prototype->Merged = true;
569  } else if (best_match != nullptr) {
570  if (debug)
571  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
572  Prototype->Mean[0], Prototype->Mean[1],
573  best_match->Mean[0], best_match->Mean[1]);
574  Prototype->Merged = true;
575  }
576  }
577  // Mark significant those that now have enough samples.
578  int min_samples =
579  static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
580  pProtoList = ProtoList;
581  iterate(pProtoList) {
582  Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
583  // Process insignificant protos that do not match a green one
584  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
585  !Prototype->Merged) {
586  if (debug)
587  tprintf("Red proto at %g,%g becoming green\n",
588  Prototype->Mean[0], Prototype->Mean[1]);
589  Prototype->Significant = true;
590  }
591  }
592 } /* MergeInsignificantProtos */
593 
594 /*-----------------------------------------------------------------------------*/
596  LIST ProtoList)
597 {
598  PROTOTYPE* Prototype;
599 
600  iterate(ProtoList)
601  {
602  Prototype = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
603  free(Prototype->Variance.Elliptical);
604  Prototype->Variance.Elliptical = nullptr;
605  free(Prototype->Magnitude.Elliptical);
606  Prototype->Magnitude.Elliptical = nullptr;
607  free(Prototype->Weight.Elliptical);
608  Prototype->Weight.Elliptical = nullptr;
609  }
610 }
611 
612 /*------------------------------------------------------------------------*/
614  LIST ProtoList,
615  bool KeepSigProtos,
616  bool KeepInsigProtos,
617  int N)
618 
619 {
620  LIST NewProtoList = NIL_LIST;
621  LIST pProtoList;
622  PROTOTYPE* Proto;
623  PROTOTYPE* NewProto;
624  int i;
625 
626  pProtoList = ProtoList;
627  iterate(pProtoList)
628  {
629  Proto = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
630  if ((Proto->Significant && KeepSigProtos) ||
631  (!Proto->Significant && KeepInsigProtos))
632  {
633  NewProto = static_cast<PROTOTYPE *>(Emalloc(sizeof(PROTOTYPE)));
634 
635  NewProto->Mean = static_cast<float *>(Emalloc(N * sizeof(float)));
636  NewProto->Significant = Proto->Significant;
637  NewProto->Style = Proto->Style;
638  NewProto->NumSamples = Proto->NumSamples;
639  NewProto->Cluster = nullptr;
640  NewProto->Distrib = nullptr;
641 
642  for (i=0; i < N; i++)
643  NewProto->Mean[i] = Proto->Mean[i];
644  if (Proto->Variance.Elliptical != nullptr) {
645  NewProto->Variance.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
646  for (i=0; i < N; i++)
647  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
648  }
649  else
650  NewProto->Variance.Elliptical = nullptr;
651  //---------------------------------------------
652  if (Proto->Magnitude.Elliptical != nullptr) {
653  NewProto->Magnitude.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
654  for (i=0; i < N; i++)
655  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
656  }
657  else
658  NewProto->Magnitude.Elliptical = nullptr;
659  //------------------------------------------------
660  if (Proto->Weight.Elliptical != nullptr) {
661  NewProto->Weight.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
662  for (i=0; i < N; i++)
663  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
664  }
665  else
666  NewProto->Weight.Elliptical = nullptr;
667 
668  NewProto->TotalMagnitude = Proto->TotalMagnitude;
669  NewProto->LogMagnitude = Proto->LogMagnitude;
670  NewProtoList = push_last(NewProtoList, NewProto);
671  }
672  }
673  FreeProtoList(&ProtoList);
674  return (NewProtoList);
675 } /* RemoveInsignificantProtos */
676 
677 /*----------------------------------------------------------------------------*/
678 MERGE_CLASS FindClass(LIST List, const char* Label) {
679  MERGE_CLASS MergeClass;
680 
681  iterate (List)
682  {
683  MergeClass = reinterpret_cast<MERGE_CLASS>first_node (List);
684  if (strcmp (MergeClass->Label, Label) == 0)
685  return (MergeClass);
686  }
687  return (nullptr);
688 
689 } /* FindClass */
690 
691 /*---------------------------------------------------------------------------*/
692 MERGE_CLASS NewLabeledClass(const char* Label) {
693  MERGE_CLASS MergeClass;
694 
695  MergeClass = new MERGE_CLASS_NODE;
696  MergeClass->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
697  strcpy (MergeClass->Label, Label);
698  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
699  return (MergeClass);
700 
701 } /* NewLabeledClass */
702 
703 /*-----------------------------------------------------------------------------*/
709 void FreeLabeledClassList(LIST ClassList) {
710  MERGE_CLASS MergeClass;
711 
712  LIST nodes = ClassList;
713  iterate(ClassList) /* iterate through all of the fonts */
714  {
715  MergeClass = reinterpret_cast<MERGE_CLASS>first_node (ClassList);
716  free (MergeClass->Label);
717  FreeClass(MergeClass->Class);
718  delete MergeClass;
719  }
720  destroy(nodes);
721 
722 } /* FreeLabeledClassList */
723 
724 /* SetUpForFloat2Int */
726  LIST LabeledClassList) {
727  MERGE_CLASS MergeClass;
728  CLASS_TYPE Class;
729  int NumProtos;
730  int NumConfigs;
731  int NumWords;
732  int i, j;
733  float Values[3];
734  PROTO NewProto;
735  PROTO OldProto;
736  BIT_VECTOR NewConfig;
737  BIT_VECTOR OldConfig;
738 
739  // printf("Float2Int ...\n");
740 
741  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
742  iterate(LabeledClassList)
743  {
744  UnicityTableEqEq<int> font_set;
745  MergeClass = reinterpret_cast<MERGE_CLASS>first_node (LabeledClassList);
746  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
747  NumProtos = MergeClass->Class->NumProtos;
748  NumConfigs = MergeClass->Class->NumConfigs;
749  font_set.move(&MergeClass->Class->font_set);
750  Class->NumProtos = NumProtos;
751  Class->MaxNumProtos = NumProtos;
752  Class->Prototypes = static_cast<PROTO>(Emalloc (sizeof(PROTO_STRUCT) * NumProtos));
753  for(i=0; i < NumProtos; i++)
754  {
755  NewProto = ProtoIn(Class, i);
756  OldProto = ProtoIn(MergeClass->Class, i);
757  Values[0] = OldProto->X;
758  Values[1] = OldProto->Y;
759  Values[2] = OldProto->Angle;
760  Normalize(Values);
761  NewProto->X = OldProto->X;
762  NewProto->Y = OldProto->Y;
763  NewProto->Length = OldProto->Length;
764  NewProto->Angle = OldProto->Angle;
765  NewProto->A = Values[0];
766  NewProto->B = Values[1];
767  NewProto->C = Values[2];
768  }
769 
770  Class->NumConfigs = NumConfigs;
771  Class->MaxNumConfigs = NumConfigs;
772  Class->font_set.move(&font_set);
773  Class->Configurations = static_cast<BIT_VECTOR*>(Emalloc (sizeof(BIT_VECTOR) * NumConfigs));
774  NumWords = WordsInVectorOfSize(NumProtos);
775  for(i=0; i < NumConfigs; i++)
776  {
777  NewConfig = NewBitVector(NumProtos);
778  OldConfig = MergeClass->Class->Configurations[i];
779  for(j=0; j < NumWords; j++)
780  NewConfig[j] = OldConfig[j];
781  Class->Configurations[i] = NewConfig;
782  }
783  }
784  return float_classes;
785 } // SetUpForFloat2Int
786 
787 /*--------------------------------------------------------------------------*/
/**
 * Rewrites a line description in place.
 *
 * On entry Values holds {x, y, angle}; the angle is multiplied by 2*pi
 * before its tangent is taken, i.e. it is expressed as a fraction of a full
 * turn. On exit Values holds
 *   { slope, -1, intercept } / sqrt(slope^2 + 1)
 * where slope = tan(2*pi*angle) and intercept = y - slope * x.
 *
 * @param Values  array of exactly three floats, read and overwritten
 */
void Normalize(float *Values) {
  // std:: qualified: <cmath> only guarantees the std:: declarations.
  const float slope = std::tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  const float normalizer = 1 / std::sqrt(slope * slope + 1.0);

  Values[0] = slope * normalizer;
  Values[1] = -normalizer;
  Values[2] = intercept * normalizer;
}  // Normalize
803 
804 /*-------------------------------------------------------------------------*/
805 void FreeNormProtoList(LIST CharList)
806 
807 {
808  LABELEDLIST char_sample;
809 
810  LIST nodes = CharList;
811  iterate(CharList) /* iterate through all of the fonts */
812  {
813  char_sample = reinterpret_cast<LABELEDLIST>first_node (CharList);
814  FreeLabeledList (char_sample);
815  }
816  destroy(nodes);
817 
818 } // FreeNormProtoList
819 
820 /*---------------------------------------------------------------------------*/
822  LIST* NormProtoList,
823  LIST ProtoList,
824  char* CharName)
825 {
826  PROTOTYPE* Proto;
827  LABELEDLIST LabeledProtoList;
828 
829  LabeledProtoList = NewLabeledList(CharName);
830  iterate(ProtoList)
831  {
832  Proto = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
833  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
834  }
835  *NormProtoList = push(*NormProtoList, LabeledProtoList);
836 }
837 
838 /*---------------------------------------------------------------------------*/
839 int NumberOfProtos(LIST ProtoList, bool CountSigProtos,
840  bool CountInsigProtos) {
841  int N = 0;
842  iterate(ProtoList)
843  {
844  PROTOTYPE* Proto = reinterpret_cast<PROTOTYPE*>first_node(ProtoList);
845  if ((Proto->Significant && CountSigProtos) ||
846  (!Proto->Significant && CountInsigProtos))
847  N++;
848  }
849  return(N);
850 }
851 
852 #endif // def DISABLED_LEGACY_ENGINE
FindList
LABELEDLIST FindList(LIST List, char *Label)
Definition: commontraining.cpp:340
PROTO_STRUCT::Length
float Length
Definition: protos.h:41
emalloc.h
tesseract::ShapeTable::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
tesseract::ParamUtils::ReadParamsFile
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
ReadTrainingSamples
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
Definition: commontraining.cpp:389
CLUSTERCONFIG
Definition: cluster.h:45
PROTOTYPE::TotalMagnitude
float TotalMagnitude
Definition: cluster.h:74
Normalize
void Normalize(float *Values)
Definition: commontraining.cpp:788
commontraining.h
InitFeatureDefs
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:111
NumberOfProtos
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
Definition: commontraining.cpp:839
CLUSTERER::NumChar
int32_t NumChar
Definition: cluster.h:88
tesseract::MasterTrainer::Serialize
bool Serialize(FILE *fp) const
Definition: mastertrainer.cpp:71
DOUBLE_PARAM_FLAG
#define DOUBLE_PARAM_FLAG(name, val, comment)
Definition: commandlineflags.h:29
tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:53
first_node
#define first_node(l)
Definition: oldlist.h:84
STRING_PARAM_FLAG
STRING_PARAM_FLAG(D, "", "Directory to write output files to")
unicity_table.h
CLASS_STRUCT::Configurations
CONFIGS Configurations
Definition: protos.h:58
Emalloc
void * Emalloc(int Size)
Definition: emalloc.cpp:31
elliptical
Definition: cluster.h:43
ShortNameToFeatureType
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:269
CLUSTERCONFIG::Independence
float Independence
Definition: cluster.h:50
PROTOTYPE::LogMagnitude
float LogMagnitude
Definition: cluster.h:75
tesseract::LoadShapeTable
ShapeTable * LoadShapeTable(const STRING &file_prefix)
Definition: commontraining.cpp:154
list_rec
Definition: oldlist.h:73
PROTO_STRUCT
Definition: protos.h:34
LABELEDLISTNODE::font_sample_count
int font_sample_count
Definition: commontraining.h:82
UnicityTableEqEq< int >
tesseract::MasterTrainer::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: mastertrainer.cpp:87
MERGE_CLASS_NODE
Definition: commontraining.h:87
FreeLabeledClassList
void FreeLabeledClassList(LIST ClassList)
Definition: commontraining.cpp:709
params.h
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
mf.h
CLASS_STRUCT::NumProtos
int16_t NumProtos
Definition: protos.h:53
PROTOTYPE::Magnitude
FLOATUNION Magnitude
Definition: cluster.h:77
intfeaturespace.h
CHAR_DESC_STRUCT::FeatureSets
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:40
Config
CLUSTERCONFIG Config
Definition: commontraining.cpp:88
STRING
Definition: strngs.h:45
CLASS_STRUCT::MaxNumProtos
int16_t MaxNumProtos
Definition: protos.h:54
STRING::truncate_at
void truncate_at(int32_t index)
Definition: strngs.cpp:258
CLUSTERER::SampleSize
int16_t SampleSize
Definition: cluster.h:82
ComputeDistance
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:447
tesseract::ParseCommandLineFlags
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
Definition: commandlineflags.cpp:166
mastertrainer.h
FindClass
MERGE_CLASS FindClass(LIST List, const char *Label)
Definition: commontraining.cpp:678
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
tesseract::MasterTrainer::SetFeatureSpace
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:82
NewLabeledList
LABELEDLIST NewLabeledList(const char *Label)
Definition: commontraining.cpp:361
CLUSTERCONFIG::MaxIllegal
float MaxIllegal
Definition: cluster.h:48
oldlist.h
SetUpForFloat2Int
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
Definition: commontraining.cpp:725
PROTO_STRUCT::B
float B
Definition: protos.h:36
tesseract::ShapeTable::DeSerialize
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
PROTOTYPE
Definition: cluster.h:62
INT_PARAM_FLAG
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
InitIntegerFX
void InitIntegerFX()
Definition: intfx.cpp:48
tesseract::IntFeatureSpace::Init
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
Definition: intfeaturespace.cpp:30
ProtoIn
#define ProtoIn(Class, Pid)
Definition: protos.h:82
tesseract::TFile::Open
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:210
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
FEATURE_DEFS_STRUCT::FeatureDesc
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:46
kBoostDirBuckets
const int kBoostDirBuckets
Definition: intfeaturespace.h:30
kBoostXYBuckets
const int kBoostXYBuckets
Definition: intfeaturespace.h:29
CLUSTERCONFIG::Confidence
double Confidence
Definition: cluster.h:51
CLUSTERER::ParamDesc
PARAM_DESC * ParamDesc
Definition: cluster.h:83
PROTOTYPE::Merged
bool Merged
Definition: cluster.h:64
NewClass
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:151
tesseract::MasterTrainer::AddSpacingInfo
bool AddSpacingInfo(const char *filename)
Definition: mastertrainer.cpp:411
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
MAX_NUM_CONFIGS
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
file
Definition: include_gunit.h:22
MAX_NUM_PROTOS
#define MAX_NUM_PROTOS
Definition: intproto.h:47
MAX_NUM_CLASSES
#define MAX_NUM_CLASSES
Definition: matchdefs.h:29
FLOATUNION::Elliptical
float * Elliptical
Definition: cluster.h:59
tesseract::MasterTrainer::LoadFontInfo
bool LoadFontInfo(const char *filename)
Definition: mastertrainer.cpp:332
tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:186
ccutil.h
MERGE_CLASS_NODE::Label
char * Label
Definition: commontraining.h:89
PROTO_STRUCT::Y
float Y
Definition: protos.h:39
CleanUpUnusedData
void CleanUpUnusedData(LIST ProtoList)
Definition: commontraining.cpp:595
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
PROTO_STRUCT::C
float C
Definition: protos.h:37
PROTOTYPE::Weight
FLOATUNION Weight
Definition: cluster.h:78
shapetable.h
CLASS_STRUCT::NumConfigs
int16_t NumConfigs
Definition: protos.h:56
tesseract::TFile
Definition: serialis.h:75
tesseract::MasterTrainer::ReadTrainingSamples
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
Definition: mastertrainer.cpp:111
FEATURE_DEFS_STRUCT
Definition: featdefs.h:44
UNICHARSET
Definition: unicharset.h:145
MergeClusters
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:824
PROTO_STRUCT::X
float X
Definition: protos.h:38
FEATURE_SET_STRUCT::MaxNumFeatures
uint16_t MaxNumFeatures
Definition: ocrfeatures.h:66
FEATURE_DESC_STRUCT::NumParams
uint16_t NumParams
Definition: ocrfeatures.h:52
CLASS_STRUCT::font_set
UnicityTableEqEq< int > font_set
Definition: protos.h:59
FEATURE_SET_STRUCT::Features
FEATURE Features[1]
Definition: ocrfeatures.h:67
feature_defs
FEATURE_DEFS_STRUCT feature_defs
Definition: commontraining.cpp:89
tesseract::WriteShapeTable
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
Definition: commontraining.cpp:179
GetNextFilename
const char * GetNextFilename(int argc, const char *const *argv)
Definition: commontraining.cpp:323
CLASS_STRUCT
Definition: protos.h:45
PROTO_STRUCT::Angle
float Angle
Definition: protos.h:40
RemoveInsignificantProtos
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
Definition: commontraining.cpp:613
MakeClusterer
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:376
BIT_VECTOR
uint32_t * BIT_VECTOR
Definition: bitvec.h:27
tesseract
Definition: baseapi.h:65
fontinfo.h
push
LIST push(LIST list, void *element)
Definition: oldlist.cpp:172
FEATURE_STRUCT::Params
float Params[1]
Definition: ocrfeatures.h:60
FEATURE_DESC_STRUCT::ParamDesc
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
CLASS_STRUCT::MaxNumConfigs
int16_t MaxNumConfigs
Definition: protos.h:57
tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:51
FreeTrainingSamples
void FreeTrainingSamples(LIST CharList)
Definition: commontraining.cpp:450
tesseract::MasterTrainer::LoadXHeights
bool LoadXHeights(const char *filename)
Definition: mastertrainer.cpp:368
tprintf.h
tesseract::MasterTrainer
Definition: mastertrainer.h:69
FEATURE_SET_STRUCT
Definition: ocrfeatures.h:64
tesseract::IntFeatureSpace
Definition: intfeaturespace.h:38
tesseract::NM_CHAR_ANISOTROPIC
Definition: normalis.h:44
LABELEDLISTNODE
Definition: commontraining.h:78
tessoptind
int tessoptind
Definition: tessopt.cpp:23
MERGE_CLASS_NODE::Class
CLASS_TYPE Class
Definition: commontraining.h:91
CHAR_DESC_STRUCT
Definition: featdefs.h:38
CLASS_STRUCT::Prototypes
PROTO Prototypes
Definition: protos.h:55
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
STRING::length
int32_t length() const
Definition: strngs.cpp:187
PROTO_STRUCT::A
float A
Definition: protos.h:35
cluster.h
tesseract::LoadTrainingData
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:211
PROTOTYPE::Significant
bool Significant
Definition: cluster.h:63
SetUpForClustering
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
Definition: commontraining.cpp:494
PROTOTYPE::Mean
float * Mean
Definition: cluster.h:73
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
tesseract::MasterTrainer::PreTrainingSetup
void PreTrainingSetup()
Definition: mastertrainer.cpp:233
MergeInsignificantProtos
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
Definition: commontraining.cpp:528
featdefs.h
CLUSTERER
Definition: cluster.h:81
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
iterate
#define iterate(l)
Definition: oldlist.h:92
FreeFeatureSet
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:61
FreeClass
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:121
destroy
LIST destroy(LIST list)
Definition: oldlist.cpp:123
PROTOTYPE::Style
unsigned Style
Definition: cluster.h:69
NewLabeledClass
MERGE_CLASS NewLabeledClass(const char *Label)
Definition: commontraining.cpp:692
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ShapeTable
Definition: shapetable.h:261
LABELEDLISTNODE::SampleCount
int SampleCount
Definition: commontraining.h:81
CHAR_DESC_STRUCT::NumFeatureSets
uint32_t NumFeatureSets
Definition: featdefs.h:39
tesseract::ParamUtils
Definition: params.h:64
FreeLabeledList
void FreeLabeledList(LABELEDLIST LabeledList)
Definition: commontraining.cpp:476
tesseract::MasterTrainer::SetupFlatShapeTable
void SetupFlatShapeTable(ShapeTable *shape_table)
Definition: mastertrainer.cpp:495
PROTOTYPE::Variance
FLOATUNION Variance
Definition: cluster.h:76
classify.h
tesseract::MasterTrainer::LoadPageImages
void LoadPageImages(const char *filename)
Definition: mastertrainer.cpp:192
tessopt.h
tesseract::MasterTrainer::PostLoadCleanup
void PostLoadCleanup()
Definition: mastertrainer.cpp:210
CLUSTERCONFIG::MinSamples
float MinSamples
Definition: cluster.h:47
PROTOTYPE::NumSamples
unsigned NumSamples
Definition: cluster.h:70
AddToNormProtosList
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
Definition: commontraining.cpp:821
push_last
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:185
tesseract::CCUtil
Definition: ccutil.h:40
PROTOTYPE::Cluster
CLUSTER * Cluster
Definition: cluster.h:71
LABELEDLISTNODE::Label
char * Label
Definition: commontraining.h:80
MakeSample
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:429
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
tessdatamanager.h
ReadCharDescription
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:235
FreeNormProtoList
void FreeNormProtoList(LIST CharList)
Definition: commontraining.cpp:805
UNICHARSET::size
int size() const
Definition: unicharset.h:341
clusttool.h
UnicityTable::move
void move(UnicityTable< T > *from)
Definition: unicity_table.h:185
LABELEDLISTNODE::List
LIST List
Definition: commontraining.h:83
PROTOTYPE::Distrib
DISTRIBUTION * Distrib
Definition: cluster.h:72