tesseract  5.0.0-alpha-619-ge9db
mftraining.cpp File Reference
#include <cmath>
#include <cstring>
#include <cstdio>
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "featdefs.h"
#include "fontinfo.h"
#include <tesseract/genericvector.h>
#include "indexmapbidi.h"
#include "intproto.h"
#include "mastertrainer.h"
#include "mergenf.h"
#include "mf.h"
#include "ocrfeatures.h"
#include "oldlist.h"
#include "protos.h"
#include "shapetable.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"

Go to the source code of this file.

Macros

#define _USE_MATH_DEFINES
 

Functions

int main (int argc, char **argv)
 

Macro Definition Documentation

◆ _USE_MATH_DEFINES

#define _USE_MATH_DEFINES

Definition at line 23 of file mftraining.cpp.

Function Documentation

◆ main()

int main (int argc, char **argv)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName UTF8-char-str ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
0 if no error occurred

Definition at line 205 of file mftraining.cpp.

206  {
207  tesseract::CheckSharedLibraryVersion();
208 
209  ParseArguments(&argc, &argv);
210 
211  ShapeTable* shape_table = nullptr;
212  STRING file_prefix;
213  // Load the training data.
214  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
215  false,
216  &shape_table,
217  &file_prefix);
218  if (trainer == nullptr) return 1; // Failed.
219 
220  // Setup an index mapping from the shapes in the shape table to the classes
221  // that will be trained. In keeping with the original design, each shape
222  // with the same list of unichars becomes a different class and the configs
223  // represent the different combinations of fonts.
224  IndexMapBiDi config_map;
225  SetupConfigMap(shape_table, &config_map);
226 
227  WriteShapeTable(file_prefix, *shape_table);
228  // If the shape_table is flat, then either we didn't run shape clustering, or
229  // it did nothing, so we just output the trainer's unicharset.
230  // Otherwise shape_set will hold a fake unicharset with an entry for each
231  // shape in the shape table, and we will output that instead.
232  UNICHARSET shape_set;
233  const UNICHARSET* unicharset = &trainer->unicharset();
234  // If we ran shapeclustering (and it worked) then at least one shape will
235  // have multiple unichars, so we have to build a fake unicharset.
236  if (shape_table->AnyMultipleUnichars()) {
237  unicharset = &shape_set;
238  // Now build a fake unicharset for the compact shape space to keep the
239  // output modules happy that we are doing things correctly.
240  int num_shapes = config_map.CompactSize();
241  for (int s = 0; s < num_shapes; ++s) {
242  char shape_label[14];
243  snprintf(shape_label, sizeof(shape_label), "sh%04d", s);
244  shape_set.unichar_insert(shape_label);
245  }
246  }
247 
248  // Now train each config separately.
249  int num_configs = shape_table->NumShapes();
250  LIST mf_classes = NIL_LIST;
251  for (int s = 0; s < num_configs; ++s) {
252  int unichar_id, font_id;
253  if (unicharset == &shape_set) {
254  // Using fake unichar_ids from the config_map/shape_set.
255  unichar_id = config_map.SparseToCompact(s);
256  } else {
257  // Get the real unichar_id from the shape table/unicharset.
258  shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
259  }
260  const char* class_label = unicharset->id_to_unichar(unichar_id);
261  mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
262  trainer);
263  }
264  STRING inttemp_file = file_prefix;
265  inttemp_file += "inttemp";
266  STRING pffmtable_file = file_prefix;
267  pffmtable_file += "pffmtable";
268  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
269  // Now write the inttemp and pffmtable.
270  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
271  *shape_table, float_classes,
272  inttemp_file.c_str(),
273  pffmtable_file.c_str());
274  for (int c = 0; c < unicharset->size(); ++c) {
275  FreeClassFields(&float_classes[c]);
276  }
277  delete [] float_classes;
278  FreeLabeledClassList(mf_classes);
279  delete trainer;
280  delete shape_table;
281  printf("Done!\n");
282  if (!FLAGS_test_ch.empty()) {
283  // If we are displaying debug window(s), wait for the user to look at them.
284  printf("Hit return to exit...\n");
285  while (getchar() != '\n');
286  }
287  return 0;
list_rec
Definition: oldlist.h:73
FreeLabeledClassList
void FreeLabeledClassList(LIST ClassList)
Definition: commontraining.cpp:709
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
STRING
Definition: strngs.h:45
tesseract::IndexMapBiDi::SparseToCompact
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
SetUpForFloat2Int
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
Definition: commontraining.cpp:725
tesseract::MasterTrainer::WriteInttempAndPFFMTable
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
Definition: mastertrainer.cpp:566
FreeClassFields
void FreeClassFields(CLASS_TYPE Class)
Definition: protos.cpp:133
tesseract::ShapeTable::GetFirstUnicharAndFont
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:186
UNICHARSET
Definition: unicharset.h:145
tesseract::WriteShapeTable
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
Definition: commontraining.cpp:179
CLASS_STRUCT
Definition: protos.h:45
tesseract::MasterTrainer
Definition: mastertrainer.h:69
tesseract::LoadTrainingData
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:211
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
tesseract::IndexMapBiDi
Definition: indexmapbidi.h:102
tesseract::ShapeTable::AnyMultipleUnichars
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:444
tesseract::ShapeTable
Definition: shapetable.h:261
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::IndexMap::CompactSize
int CompactSize() const
Definition: indexmapbidi.h:61
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341