tesseract  4.0.0-1-g2a2b
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revised by: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 /*----------------------------------------------------------------------------
24  Include Files and Type Defines
25 ----------------------------------------------------------------------------*/
26 #include "oldlist.h"
27 #include "emalloc.h"
28 #include "featdefs.h"
29 #include "tessopt.h"
30 #include "ocrfeatures.h"
31 #include "clusttool.h"
32 #include "cluster.h"
33 #include <cstring>
34 #include <cstdio>
35 #include <cmath>
36 #include "unichar.h"
37 #include "commontraining.h"
38 
39 #define PROGRAM_FEATURE_TYPE "cn"
40 
42 
43 /*----------------------------------------------------------------------------
44  Private Function Prototypes
45 ----------------------------------------------------------------------------*/
46 
47 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
48  const FEATURE_DESC_STRUCT *feature_desc);
49 
50 static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
51  bool WriteSigProtos, bool WriteInsigProtos);
52 
53 /*----------------------------------------------------------------------------
54  Global Data Definitions and Declarations
55 ----------------------------------------------------------------------------*/
56 /* global variable to hold configuration parameters to control clustering */
57 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
59 {
60  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
61 };
62 
63 /*----------------------------------------------------------------------------
64  Public Code
65 ----------------------------------------------------------------------------*/
66 /*---------------------------------------------------------------------------*/
113 int main(int argc, char *argv[]) {
114  tesseract::CheckSharedLibraryVersion();
115 
116  // Set the global Config parameters before parsing the command line.
117  Config = CNConfig;
118 
119  const char *PageName;
120  LIST CharList = NIL_LIST;
121  CLUSTERER *Clusterer = nullptr;
122  LIST ProtoList = NIL_LIST;
123  LIST NormProtoList = NIL_LIST;
124  LIST pCharList;
125  LABELEDLIST CharSample;
126  FEATURE_DEFS_STRUCT FeatureDefs;
127  InitFeatureDefs(&FeatureDefs);
128 
129  ParseArguments(&argc, &argv);
130  int num_fonts = 0;
131  while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
132  printf("Reading %s ...\n", PageName);
133  FILE *TrainingPage = fopen(PageName, "rb");
134  ASSERT_HOST(TrainingPage);
135  if (TrainingPage) {
136  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
137  TrainingPage, &CharList);
138  fclose(TrainingPage);
139  ++num_fonts;
140  }
141  }
142  printf("Clustering ...\n");
143  // To allow an individual font to form a separate cluster,
144  // reduce the min samples:
145  // Config.MinSamples = 0.5 / num_fonts;
146  pCharList = CharList;
147  // The norm protos will count the source protos, so we keep them here in
148  // freeable_protos, so they can be freed later.
149  GenericVector<LIST> freeable_protos;
150  iterate(pCharList) {
151  //Cluster
152  CharSample = (LABELEDLIST)first_node(pCharList);
153  Clusterer =
154  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
155  if (Clusterer == nullptr) { // To avoid a SIGSEGV
156  fprintf(stderr, "Error: nullptr clusterer!\n");
157  return 1;
158  }
159  float SavedMinSamples = Config.MinSamples;
160  // To disable the tendency to produce a single cluster for all fonts,
161  // make MagicSamples an impossible to achieve number:
162  // Config.MagicSamples = CharSample->SampleCount * 10;
163  Config.MagicSamples = CharSample->SampleCount;
164  while (Config.MinSamples > 0.001) {
165  ProtoList = ClusterSamples(Clusterer, &Config);
166  if (NumberOfProtos(ProtoList, true, false) > 0) {
167  break;
168  } else {
169  Config.MinSamples *= 0.95;
170  printf("0 significant protos for %s."
171  " Retrying clustering with MinSamples = %f%%\n",
172  CharSample->Label, Config.MinSamples);
173  }
174  }
175  Config.MinSamples = SavedMinSamples;
176  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
177  freeable_protos.push_back(ProtoList);
178  FreeClusterer(Clusterer);
179  }
180  FreeTrainingSamples(CharList);
181  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
182  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
183  FeatureDefs.FeatureDesc[desc_index]);
184  FreeNormProtoList(NormProtoList);
185  for (int i = 0; i < freeable_protos.size(); ++i) {
186  FreeProtoList(&freeable_protos[i]);
187  }
188  printf ("\n");
189  return 0;
190 } // main
191 
192 /*----------------------------------------------------------------------------
193  Private Code
194 ----------------------------------------------------------------------------*/
195 
196 /*----------------------------------------------------------------------------*/
206 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
207  const FEATURE_DESC_STRUCT *feature_desc) {
208  FILE *File;
209  STRING Filename;
210  LABELEDLIST LabeledProto;
211  int N;
212 
213  Filename = "";
214  if (Directory != nullptr && Directory[0] != '\0') {
215  Filename += Directory;
216  Filename += "/";
217  }
218  Filename += "normproto";
219  printf ("\nWriting %s ...", Filename.string());
220  File = fopen(Filename.string(), "wb");
221  ASSERT_HOST(File);
222  fprintf(File, "%0d\n", feature_desc->NumParams);
223  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
224  iterate(LabeledProtoList)
225  {
226  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
227  N = NumberOfProtos(LabeledProto->List, true, false);
228  if (N < 1) {
229  printf ("\nError! Not enough protos for %s: %d protos"
230  " (%d significant protos"
231  ", %d insignificant protos)\n",
232  LabeledProto->Label, N,
233  NumberOfProtos(LabeledProto->List, true, false),
234  NumberOfProtos(LabeledProto->List, false, true));
235  exit(1);
236  }
237  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
238  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
239  }
240  fclose (File);
241 
242 } // WriteNormProtos
243 
244 /*-------------------------------------------------------------------------*/
245 
246 static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
247  bool WriteSigProtos, bool WriteInsigProtos)
248 {
249  PROTOTYPE *Proto;
250 
251  // write prototypes
252  iterate(ProtoList)
253  {
254  Proto = (PROTOTYPE*)first_node(ProtoList);
255  if ((Proto->Significant && WriteSigProtos) ||
256  (! Proto->Significant && WriteInsigProtos))
257  WritePrototype(File, N, Proto);
258  }
259 } // WriteProtos
CLUSTERCONFIG Config
int size() const
Definition: genericvector.h:71
float MinSamples
Definition: cluster.h:50
const char * string() const
Definition: strngs.cpp:196
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:39
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
struct LABELEDLISTNODE * LABELEDLIST
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:56
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:250
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
void FreeNormProtoList(LIST CharList)
unsigned Significant
Definition: cluster.h:68
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:506
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:223
int MagicSamples
Definition: cluster.h:55
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:58
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void FreeTrainingSamples(LIST CharList)
int push_back(T object)
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
Definition: strngs.h:45
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:538
#define iterate(l)
Definition: oldlist.h:161
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:48
const char * GetNextFilename(int argc, const char *const *argv)
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int main(int argc, char *argv[])
Definition: cntraining.cpp:113
DECLARE_STRING_PARAM_FLAG(D)
#define ASSERT_HOST(x)
Definition: errcode.h:84