tesseract  5.0.0-alpha-619-ge9db
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cntraining.cpp
3  ** Purpose: Generates a normproto and pffmtable.
4  ** Author: Dan Johnson
5  ** Revised by: Christy Russon
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17 ******************************************************************************/
18 
19 /*----------------------------------------------------------------------------
20  Include Files and Type Defines
21 ----------------------------------------------------------------------------*/
22 #include "oldlist.h"
23 #include "featdefs.h"
24 #include "tessopt.h"
25 #include "ocrfeatures.h"
26 #include "clusttool.h"
27 #include "cluster.h"
28 #include <cstring>
29 #include <cstdio>
30 #include <cmath>
31 #include <tesseract/unichar.h>
32 #include "commontraining.h"
33 
34 #define PROGRAM_FEATURE_TYPE "cn"
35 
36 /*----------------------------------------------------------------------------
37  Private Function Prototypes
38 ----------------------------------------------------------------------------*/
39 
40 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
41  const FEATURE_DESC_STRUCT *feature_desc);
42 
43 static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
44  bool WriteSigProtos, bool WriteInsigProtos);
45 
46 /*----------------------------------------------------------------------------
47  Global Data Definitions and Declarations
48 ----------------------------------------------------------------------------*/
49 /* global variable to hold configuration parameters to control clustering */
50 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
51 static const CLUSTERCONFIG CNConfig = {
52  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
53 };
54 
55 /*----------------------------------------------------------------------------
56  Public Code
57 ----------------------------------------------------------------------------*/
58 
104 int main(int argc, char *argv[]) {
105  tesseract::CheckSharedLibraryVersion();
106 
107  // Set the global Config parameters before parsing the command line.
108  Config = CNConfig;
109 
110  const char *PageName;
111  LIST CharList = NIL_LIST;
112  CLUSTERER *Clusterer = nullptr;
113  LIST ProtoList = NIL_LIST;
114  LIST NormProtoList = NIL_LIST;
115  LIST pCharList;
116  LABELEDLIST CharSample;
117  FEATURE_DEFS_STRUCT FeatureDefs;
118  InitFeatureDefs(&FeatureDefs);
119 
120  ParseArguments(&argc, &argv);
121  int num_fonts = 0;
122  while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
123  printf("Reading %s ...\n", PageName);
124  FILE *TrainingPage = fopen(PageName, "rb");
125  ASSERT_HOST(TrainingPage);
126  if (TrainingPage) {
127  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
128  TrainingPage, &CharList);
129  fclose(TrainingPage);
130  ++num_fonts;
131  }
132  }
133  printf("Clustering ...\n");
134  // To allow an individual font to form a separate cluster,
135  // reduce the min samples:
136  // Config.MinSamples = 0.5 / num_fonts;
137  pCharList = CharList;
138  // The norm protos will count the source protos, so we keep them here in
139  // freeable_protos, so they can be freed later.
140  GenericVector<LIST> freeable_protos;
141  iterate(pCharList) {
142  //Cluster
143  CharSample = reinterpret_cast<LABELEDLIST>first_node(pCharList);
144  Clusterer =
145  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
146  if (Clusterer == nullptr) { // To avoid a SIGSEGV
147  fprintf(stderr, "Error: nullptr clusterer!\n");
148  return 1;
149  }
150  float SavedMinSamples = Config.MinSamples;
151  // To disable the tendency to produce a single cluster for all fonts,
152  // make MagicSamples an impossible to achieve number:
153  // Config.MagicSamples = CharSample->SampleCount * 10;
154  Config.MagicSamples = CharSample->SampleCount;
155  while (Config.MinSamples > 0.001) {
156  ProtoList = ClusterSamples(Clusterer, &Config);
157  if (NumberOfProtos(ProtoList, true, false) > 0) {
158  break;
159  } else {
160  Config.MinSamples *= 0.95;
161  printf("0 significant protos for %s."
162  " Retrying clustering with MinSamples = %f%%\n",
163  CharSample->Label, Config.MinSamples);
164  }
165  }
166  Config.MinSamples = SavedMinSamples;
167  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
168  freeable_protos.push_back(ProtoList);
169  FreeClusterer(Clusterer);
170  }
171  FreeTrainingSamples(CharList);
172  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
173  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
174  FeatureDefs.FeatureDesc[desc_index]);
175  FreeNormProtoList(NormProtoList);
176  for (int i = 0; i < freeable_protos.size(); ++i) {
177  FreeProtoList(&freeable_protos[i]);
178  }
179  printf ("\n");
180  return 0;
181 } // main
182 
183 /*----------------------------------------------------------------------------
184  Private Code
185 ----------------------------------------------------------------------------*/
186 
187 /*----------------------------------------------------------------------------*/
196 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
197  const FEATURE_DESC_STRUCT *feature_desc) {
198  FILE *File;
199  STRING Filename;
200  LABELEDLIST LabeledProto;
201  int N;
202 
203  Filename = "";
204  if (Directory != nullptr && Directory[0] != '\0') {
205  Filename += Directory;
206  Filename += "/";
207  }
208  Filename += "normproto";
209  printf ("\nWriting %s ...", Filename.c_str());
210  File = fopen(Filename.c_str(), "wb");
211  ASSERT_HOST(File);
212  fprintf(File, "%0d\n", feature_desc->NumParams);
213  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
214  iterate(LabeledProtoList)
215  {
216  LabeledProto = reinterpret_cast<LABELEDLIST>first_node (LabeledProtoList);
217  N = NumberOfProtos(LabeledProto->List, true, false);
218  if (N < 1) {
219  printf ("\nError! Not enough protos for %s: %d protos"
220  " (%d significant protos"
221  ", %d insignificant protos)\n",
222  LabeledProto->Label, N,
223  NumberOfProtos(LabeledProto->List, true, false),
224  NumberOfProtos(LabeledProto->List, false, true));
225  exit(1);
226  }
227  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
228  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
229  }
230  fclose (File);
231 
232 } // WriteNormProtos
233 
234 /*-------------------------------------------------------------------------*/
235 
236 static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
237  bool WriteSigProtos, bool WriteInsigProtos)
238 {
239  PROTOTYPE *Proto;
240 
241  // write prototypes
242  iterate(ProtoList)
243  {
244  Proto = reinterpret_cast<PROTOTYPE*>first_node(ProtoList);
245  if ((Proto->Significant && WriteSigProtos) ||
246  (! Proto->Significant && WriteInsigProtos))
247  WritePrototype(File, N, Proto);
248  }
249 } // WriteProtos
ReadTrainingSamples
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
Definition: commontraining.cpp:389
CLUSTERCONFIG
Definition: cluster.h:45
CLUSTERCONFIG::MagicSamples
int MagicSamples
Definition: cluster.h:52
commontraining.h
InitFeatureDefs
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:111
NumberOfProtos
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
Definition: commontraining.cpp:839
FreeClusterer
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:514
first_node
#define first_node(l)
Definition: oldlist.h:84
elliptical
Definition: cluster.h:43
ShortNameToFeatureType
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:269
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
list_rec
Definition: oldlist.h:73
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
Config
CLUSTERCONFIG Config
Definition: commontraining.cpp:88
STRING
Definition: strngs.h:45
main
int main(int argc, char *argv[])
Definition: cntraining.cpp:104
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
oldlist.h
PROTOTYPE
Definition: cluster.h:62
FEATURE_DEFS_STRUCT::FeatureDesc
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:46
ClusterSamples
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:483
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
WriteParamDesc
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:255
ocrfeatures.h
FEATURE_DESC_STRUCT
Definition: ocrfeatures.h:51
FEATURE_DEFS_STRUCT
Definition: featdefs.h:44
FEATURE_DESC_STRUCT::NumParams
uint16_t NumParams
Definition: ocrfeatures.h:52
GetNextFilename
const char * GetNextFilename(int argc, const char *const *argv)
Definition: commontraining.cpp:323
PROGRAM_FEATURE_TYPE
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:34
FEATURE_DESC_STRUCT::ParamDesc
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
FreeTrainingSamples
void FreeTrainingSamples(LIST CharList)
Definition: commontraining.cpp:450
GenericVector
Definition: baseapi.h:40
LABELEDLISTNODE
Definition: commontraining.h:78
cluster.h
WritePrototype
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:280
PROTOTYPE::Significant
bool Significant
Definition: cluster.h:63
SetUpForClustering
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
Definition: commontraining.cpp:494
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
featdefs.h
CLUSTERER
Definition: cluster.h:81
unichar.h
iterate
#define iterate(l)
Definition: oldlist.h:92
LABELEDLISTNODE::SampleCount
int SampleCount
Definition: commontraining.h:81
tessopt.h
GenericVector::size
int size() const
Definition: genericvector.h:71
CLUSTERCONFIG::MinSamples
float MinSamples
Definition: cluster.h:47
AddToNormProtosList
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
Definition: commontraining.cpp:821
LABELEDLISTNODE::Label
char * Label
Definition: commontraining.h:80
FreeNormProtoList
void FreeNormProtoList(LIST CharList)
Definition: commontraining.cpp:805
clusttool.h
LABELEDLISTNODE::List
LIST List
Definition: commontraining.h:83