tesseract  5.0.0-alpha-619-ge9db
mftraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: mftraining.c
3  ** Purpose: Separates training pages into files for each character.
4  ** Strips from files only the features and their parameters of
5  ** the feature type mf.
6  ** Author: Dan Johnson
7  ** Revisment: Christy Russon
8  **
9  ** (c) Copyright Hewlett-Packard Company, 1988.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19 ******************************************************************************/
20 /*----------------------------------------------------------------------------
21  Include Files and Type Defines
22 ----------------------------------------------------------------------------*/
23 
24 #define _USE_MATH_DEFINES // for M_PI
25 #ifdef HAVE_CONFIG_H
26 #include "config_auto.h"
27 #endif
28 
29 #include <cmath> // for M_PI
30 #include <cstring>
31 #include <cstdio>
32 
33 #include "classify.h"
34 #include "cluster.h"
35 #include "clusttool.h"
36 #include "commontraining.h"
37 #include "featdefs.h"
38 #include "fontinfo.h"
40 #include "indexmapbidi.h"
41 #include "intproto.h"
42 #include "mastertrainer.h"
43 #include "mergenf.h"
44 #include "mf.h"
45 #include "ocrfeatures.h"
46 #include "oldlist.h"
47 #include "protos.h"
48 #include "shapetable.h"
49 #include "tessopt.h"
50 #include "tprintf.h"
51 #include "unicity_table.h"
52 
55 using tesseract::Shape;
57 
58 /*----------------------------------------------------------------------------
59  Public Code
60 -----------------------------------------------------------------------------*/
61 #ifndef GRAPHICS_DISABLED
62 static void DisplayProtoList(const char* ch, LIST protolist) {
63  void* window = c_create_window("Char samples", 50, 200,
64  520, 520, -130.0, 130.0, -130.0, 130.0);
65  LIST proto = protolist;
66  iterate(proto) {
67  PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
68  if (prototype->Significant)
69  c_line_color_index(window, Green);
70  else if (prototype->NumSamples == 0)
71  c_line_color_index(window, Blue);
72  else if (prototype->Merged)
73  c_line_color_index(window, Magenta);
74  else
75  c_line_color_index(window, Red);
76  float x = CenterX(prototype->Mean);
77  float y = CenterY(prototype->Mean);
78  double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
79  float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
80  float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
81  c_move(window, (x - dx) * 256, (y - dy) * 256);
82  c_draw(window, (x + dx) * 256, (y + dy) * 256);
83  if (prototype->Significant)
84  tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
85  x, y, dx, dy, prototype->NumSamples);
86  else if (prototype->NumSamples > 0 && !prototype->Merged)
87  tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
88  x, y, dx, dy, prototype->NumSamples);
89  }
90  c_make_current(window);
91 }
92 #endif // GRAPHICS_DISABLED
93 
94 // Helper to run clustering on a single config.
95 // Mostly copied from the old mftraining, but with renamed variables.
96 static LIST ClusterOneConfig(int shape_id, const char* class_label,
97  LIST mf_classes,
98  const ShapeTable& shape_table,
99  MasterTrainer* trainer) {
100  int num_samples;
101  CLUSTERER *clusterer = trainer->SetupForClustering(shape_table,
102  feature_defs,
103  shape_id,
104  &num_samples);
105  Config.MagicSamples = num_samples;
106  LIST proto_list = ClusterSamples(clusterer, &Config);
107  CleanUpUnusedData(proto_list);
108 
109  // Merge protos where reasonable to make more of them significant by
110  // representing almost all samples of the class/font.
111  MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
112  #ifndef GRAPHICS_DISABLED
113  if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0)
114  DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
115  #endif // GRAPHICS_DISABLED
116  // Delete the protos that will not be used in the inttemp output file.
117  proto_list = RemoveInsignificantProtos(proto_list, true,
118  false,
119  clusterer->SampleSize);
120  FreeClusterer(clusterer);
121  MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
122  if (merge_class == nullptr) {
123  merge_class = NewLabeledClass(class_label);
124  mf_classes = push(mf_classes, merge_class);
125  }
126  int config_id = AddConfigToClass(merge_class->Class);
127  merge_class->Class->font_set.push_back(shape_id);
128  LIST proto_it = proto_list;
129  iterate(proto_it) {
130  PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE*>(first_node(proto_it));
131  // See if proto can be approximated by existing proto.
132  int p_id = FindClosestExistingProto(merge_class->Class,
133  merge_class->NumMerged, prototype);
134  if (p_id == NO_PROTO) {
135  // Need to make a new proto, as it doesn't match anything.
136  p_id = AddProtoToClass(merge_class->Class);
137  MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
138  merge_class->NumMerged[p_id] = 1;
139  } else {
140  PROTO_STRUCT dummy_proto;
141  MakeNewFromOld(&dummy_proto, prototype);
142  // Merge with the similar proto.
143  ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
144  static_cast<float>(merge_class->NumMerged[p_id]),
145  1.0,
146  ProtoIn(merge_class->Class, p_id));
147  merge_class->NumMerged[p_id]++;
148  }
149  AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
150  }
151  FreeProtoList(&proto_list);
152  return mf_classes;
153 }
154 
155 // Helper to setup the config map.
156 // Setup an index mapping from the shapes in the shape table to the classes
157 // that will be trained. In keeping with the original design, each shape
158 // with the same list of unichars becomes a different class and the configs
159 // represent the different combinations of fonts.
160 static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) {
161  int num_configs = shape_table->NumShapes();
162  config_map->Init(num_configs, true);
163  config_map->Setup();
164  for (int c1 = 0; c1 < num_configs; ++c1) {
165  // Only process ids that are not already merged.
166  if (config_map->SparseToCompact(c1) == c1) {
167  Shape* shape1 = shape_table->MutableShape(c1);
168  // Find all the subsequent shapes that are equal.
169  for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
170  if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
171  config_map->Merge(c1, c2);
172  }
173  }
174  }
175  }
176  config_map->CompleteMerges();
177 }
178 
206 int main (int argc, char **argv) {
207  tesseract::CheckSharedLibraryVersion();
208 
209  ParseArguments(&argc, &argv);
210 
211  ShapeTable* shape_table = nullptr;
212  STRING file_prefix;
213  // Load the training data.
214  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
215  false,
216  &shape_table,
217  &file_prefix);
218  if (trainer == nullptr) return 1; // Failed.
219 
220  // Setup an index mapping from the shapes in the shape table to the classes
221  // that will be trained. In keeping with the original design, each shape
222  // with the same list of unichars becomes a different class and the configs
223  // represent the different combinations of fonts.
224  IndexMapBiDi config_map;
225  SetupConfigMap(shape_table, &config_map);
226 
227  WriteShapeTable(file_prefix, *shape_table);
228  // If the shape_table is flat, then either we didn't run shape clustering, or
229  // it did nothing, so we just output the trainer's unicharset.
230  // Otherwise shape_set will hold a fake unicharset with an entry for each
231  // shape in the shape table, and we will output that instead.
232  UNICHARSET shape_set;
233  const UNICHARSET* unicharset = &trainer->unicharset();
234  // If we ran shapeclustering (and it worked) then at least one shape will
235  // have multiple unichars, so we have to build a fake unicharset.
236  if (shape_table->AnyMultipleUnichars()) {
237  unicharset = &shape_set;
238  // Now build a fake unicharset for the compact shape space to keep the
239  // output modules happy that we are doing things correctly.
240  int num_shapes = config_map.CompactSize();
241  for (int s = 0; s < num_shapes; ++s) {
242  char shape_label[14];
243  snprintf(shape_label, sizeof(shape_label), "sh%04d", s);
244  shape_set.unichar_insert(shape_label);
245  }
246  }
247 
248  // Now train each config separately.
249  int num_configs = shape_table->NumShapes();
250  LIST mf_classes = NIL_LIST;
251  for (int s = 0; s < num_configs; ++s) {
252  int unichar_id, font_id;
253  if (unicharset == &shape_set) {
254  // Using fake unichar_ids from the config_map/shape_set.
255  unichar_id = config_map.SparseToCompact(s);
256  } else {
257  // Get the real unichar_id from the shape table/unicharset.
258  shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
259  }
260  const char* class_label = unicharset->id_to_unichar(unichar_id);
261  mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
262  trainer);
263  }
264  STRING inttemp_file = file_prefix;
265  inttemp_file += "inttemp";
266  STRING pffmtable_file = file_prefix;
267  pffmtable_file += "pffmtable";
268  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
269  // Now write the inttemp and pffmtable.
270  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
271  *shape_table, float_classes,
272  inttemp_file.c_str(),
273  pffmtable_file.c_str());
274  for (int c = 0; c < unicharset->size(); ++c) {
275  FreeClassFields(&float_classes[c]);
276  }
277  delete [] float_classes;
278  FreeLabeledClassList(mf_classes);
279  delete trainer;
280  delete shape_table;
281  printf("Done!\n");
282  if (!FLAGS_test_ch.empty()) {
283  // If we are displaying debug window(s), wait for the user to look at them.
284  printf("Hit return to exit...\n");
285  while (getchar() != '\n');
286  }
287  return 0;
288 } /* main */
AddProtoToClass
int AddProtoToClass(CLASS_TYPE Class)
Definition: protos.cpp:82
FindClosestExistingProto
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype)
Definition: mergenf.cpp:155
tesseract::MasterTrainer::SetupForClustering
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
Definition: mastertrainer.cpp:526
CLUSTERCONFIG::MagicSamples
int MagicSamples
Definition: cluster.h:52
commontraining.h
FreeClusterer
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:514
first_node
#define first_node(l)
Definition: oldlist.h:84
unicity_table.h
tesseract::Shape
Definition: shapetable.h:184
AddConfigToClass
int AddConfigToClass(CLASS_TYPE Class)
Definition: protos.cpp:45
tesseract::IndexMapBiDi::CompleteMerges
void CompleteMerges()
Definition: indexmapbidi.cpp:160
CLASS_STRUCT::Configurations
CONFIGS Configurations
Definition: protos.h:58
list_rec
Definition: oldlist.h:73
ComputeMergedProto
void ComputeMergedProto(PROTO p1, PROTO p2, float w1, float w2, PROTO MergedProto)
Definition: mergenf.cpp:123
PROTO_STRUCT
Definition: protos.h:34
MERGE_CLASS_NODE
Definition: commontraining.h:87
FreeLabeledClassList
void FreeLabeledClassList(LIST ClassList)
Definition: commontraining.cpp:709
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
mf.h
Config
CLUSTERCONFIG Config
Definition: commontraining.cpp:88
STRING
Definition: strngs.h:45
CLUSTERER::SampleSize
int16_t SampleSize
Definition: cluster.h:82
tesseract::IndexMapBiDi::SparseToCompact
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138
mastertrainer.h
FindClass
MERGE_CLASS FindClass(LIST List, const char *Label)
Definition: commontraining.cpp:678
Blue
Definition: callcpp.h:33
LengthOf
#define LengthOf(M)
Definition: mergenf.h:51
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
oldlist.h
SetUpForFloat2Int
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
Definition: commontraining.cpp:725
PROTOTYPE
Definition: cluster.h:62
UnicityTable::push_back
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:168
ProtoIn
#define ProtoIn(Class, Pid)
Definition: protos.h:82
ClusterSamples
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:483
genericvector.h
c_move
void c_move(void *win, double x, double y)
Definition: callcpp.cpp:71
tesseract::MasterTrainer::WriteInttempAndPFFMTable
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
Definition: mastertrainer.cpp:566
FreeClassFields
void FreeClassFields(CLASS_TYPE Class)
Definition: protos.cpp:133
PROTOTYPE::Merged
bool Merged
Definition: cluster.h:64
tesseract::ShapeTable::MutableShape
Shape * MutableShape(int shape_id)
Definition: shapetable.h:322
tesseract::ShapeTable::GetFirstUnicharAndFont
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
AddProtoToConfig
#define AddProtoToConfig(Pid, Config)
Definition: protos.h:73
tesseract::IndexMapBiDi::Merge
bool Merge(int compact_index1, int compact_index2)
Definition: indexmapbidi.cpp:128
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
NO_PROTO
#define NO_PROTO
Definition: matchdefs.h:40
tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:186
ocrfeatures.h
CleanUpUnusedData
void CleanUpUnusedData(LIST ProtoList)
Definition: commontraining.cpp:595
Red
Definition: callcpp.h:29
shapetable.h
Magenta
Definition: callcpp.h:34
UNICHARSET
Definition: unicharset.h:145
MakeNewFromOld
void MakeNewFromOld(PROTO New, PROTOTYPE *Old)
Definition: mergenf.cpp:193
CLASS_STRUCT::font_set
UnicityTableEqEq< int > font_set
Definition: protos.h:59
feature_defs
FEATURE_DEFS_STRUCT feature_defs
Definition: commontraining.cpp:89
tesseract::WriteShapeTable
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
Definition: commontraining.cpp:179
tesseract::IndexMapBiDi::Setup
void Setup()
Definition: indexmapbidi.cpp:102
CLASS_STRUCT
Definition: protos.h:45
RemoveInsignificantProtos
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
Definition: commontraining.cpp:613
fontinfo.h
CenterY
#define CenterY(M)
Definition: mergenf.h:50
push
LIST push(LIST list, void *element)
Definition: oldlist.cpp:172
mergenf.h
tprintf.h
tesseract::MasterTrainer
Definition: mastertrainer.h:69
MERGE_CLASS_NODE::NumMerged
int NumMerged[MAX_NUM_PROTOS]
Definition: commontraining.h:90
OrientationOf
#define OrientationOf(M)
Definition: mergenf.h:52
tesseract::Shape::IsEqualUnichars
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:217
MERGE_CLASS_NODE::Class
CLASS_TYPE Class
Definition: commontraining.h:91
cluster.h
tesseract::LoadTrainingData
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:211
PROTOTYPE::Significant
bool Significant
Definition: cluster.h:63
protos.h
PROTOTYPE::Mean
float * Mean
Definition: cluster.h:73
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
tesseract::IndexMapBiDi
Definition: indexmapbidi.h:102
MergeInsignificantProtos
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
Definition: commontraining.cpp:528
featdefs.h
CLUSTERER
Definition: cluster.h:81
tesseract::IndexMapBiDi::Init
void Init(int size, bool all_mapped)
Definition: indexmapbidi.cpp:86
iterate
#define iterate(l)
Definition: oldlist.h:92
NewLabeledClass
MERGE_CLASS NewLabeledClass(const char *Label)
Definition: commontraining.cpp:692
tesseract::ShapeTable::AnyMultipleUnichars
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:444
c_create_window
ScrollView * c_create_window(const char *name, int16_t xpos, int16_t ypos, int16_t xsize, int16_t ysize, double xmin, double xmax, double ymin, double ymax)
Definition: callcpp.cpp:47
c_line_color_index
void c_line_color_index(void *win, C_COL index)
Definition: callcpp.cpp:62
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ShapeTable
Definition: shapetable.h:261
c_draw
void c_draw(void *win, double x, double y)
Definition: callcpp.cpp:80
Green
Definition: callcpp.h:31
intproto.h
classify.h
CenterX
#define CenterX(M)
Definition: mergenf.h:49
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::IndexMap::CompactSize
int CompactSize() const
Definition: indexmapbidi.h:61
tessopt.h
PROTOTYPE::NumSamples
unsigned NumSamples
Definition: cluster.h:70
indexmapbidi.h
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341
clusttool.h
c_make_current
void c_make_current(void *win)
Definition: callcpp.cpp:89
main
int main(int argc, char **argv)
Definition: mftraining.cpp:205