tesseract  4.0.0-1-g2a2b
commontraining.h
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config_auto.h"
19 #include "baseapi.h"
20 #endif
21 
22 #ifdef DISABLED_LEGACY_ENGINE
23 
24 #include "tprintf.h"
25 #include "commandlineflags.h"
26 
27 
28 void ParseArguments(int* argc, char*** argv);
29 
30 
31 namespace tesseract {
32 
// Verifies that the tesseract shared library in use reports the same version
// string this code was compiled against.
//
// The function must stay inline (i.e. be compiled into the caller): if it
// were part of the shared library itself, both sides of the comparison would
// come from the library and a mismatch could never be detected.
//
// When HAVE_CONFIG_H is defined, the compile-time TESSERACT_VERSION_STR is
// compared with the string returned by TessBaseAPI::Version(). On a mismatch
// both versions are reported through tprintf and the process is terminated
// with exit(1). Without HAVE_CONFIG_H the function does nothing.
static inline void CheckSharedLibraryVersion()
{
#ifdef HAVE_CONFIG_H
  if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
    tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
            "Did you use a wrong shared tesseract library?\n",
            TessBaseAPI::Version(), TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}
47 
48 } // namespace tesseract
49 
50 
51 #else
52 
53 #include "cluster.h"
54 #include "commandlineflags.h"
55 #include "featdefs.h"
56 #include "intproto.h"
57 #include "oldlist.h"
58 
59 namespace tesseract {
60 class Classify;
61 class MasterTrainer;
62 class ShapeTable;
63 }
64 
66 // Globals ///////////////////////////////////////////////////////////////////
68 
70 
71 // Must be defined in the file that "implements" commonTraining facilities.
72 extern CLUSTERCONFIG Config;
73 
75 // Structs ///////////////////////////////////////////////////////////////////
77 typedef struct
78 {
79  char *Label;
83 }
85 
86 typedef struct
87 {
88  char* Label;
89  int NumMerged[MAX_NUM_PROTOS];
93 
94 
96 // Functions /////////////////////////////////////////////////////////////////
98 void ParseArguments(int* argc, char*** argv);
99 
100 namespace tesseract {
101 
// Guards against running with a tesseract shared library whose version
// differs from the one this code was built with.
//
// This has to be an inline function: were it compiled into the shared
// library, it would compare the library's version against itself and could
// not notice a mismatch.
//
// With HAVE_CONFIG_H defined, TESSERACT_VERSION_STR (compile time) is checked
// against TessBaseAPI::Version(). If the two strings differ, an error naming
// both versions is printed via tprintf and the program exits with status 1.
// If HAVE_CONFIG_H is not defined, the body is empty.
static inline void CheckSharedLibraryVersion()
{
#ifdef HAVE_CONFIG_H
  const bool versions_differ =
      strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0;
  if (versions_differ) {
    tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
            "Did you use a wrong shared tesseract library?\n",
            TessBaseAPI::Version(), TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}
116 
117 // Helper loads shape table from the given file.
118 ShapeTable* LoadShapeTable(const STRING& file_prefix);
119 // Helper to write the shape_table.
120 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
121 
122 // Creates a MasterTrainer and loads the training data into it:
123 // Initializes feature_defs and IntegerFX.
124 // Loads the shape_table if shape_table != nullptr.
125 // Loads initial unicharset from -U command-line option.
126 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
127 // Loads font info from -F option.
128 // Loads xheights from -X option.
129 // Loads samples from .tr files in remaining command-line args.
130 // Deletes outliers and computes canonical samples.
131 // If FLAGS_output_trainer is set, saves the trainer for future use.
132 // Computes canonical and cloud features.
133 // If shape_table is not nullptr, but failed to load, make a fake flat one,
134 // as shape clustering was not run.
135 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
136  bool replication,
137  ShapeTable** shape_table,
138  STRING* file_prefix);
139 } // namespace tesseract.
140 
141 const char *GetNextFilename(int argc, const char* const * argv);
142 
144  LIST List,
145  char *Label);
146 
148  const char *Label);
149 
151  const char *feature_name, int max_samples,
152  UNICHARSET* unicharset,
153  FILE* file, LIST* training_samples);
154 
156  const FEATURE_DEFS_STRUCT &FeatureDefs,
157  char *Directory,
158  LIST CharList,
159  const char *program_feature_type);
160 
162  LIST CharList);
163 
164 void FreeLabeledList(
165  LABELEDLIST LabeledList);
166 
168  LIST ClassListList);
169 
171  const FEATURE_DEFS_STRUCT &FeatureDefs,
172  LABELEDLIST CharSample,
173  const char *program_feature_type);
174 
176  LIST ProtoList,
177  bool KeepSigProtos,
178  bool KeepInsigProtos,
179  int N);
180 
181 void CleanUpUnusedData(
182  LIST ProtoList);
183 
185  LIST ProtoList,
186  const char *label,
187  CLUSTERER *Clusterer,
189 
191  LIST List,
192  const char *Label);
193 
195  const char *Label);
196 
198  LIST CharList);
199 
200 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
201  LIST LabeledClassList);
202 
203 void Normalize(
204  float *Values);
205 
206 void FreeNormProtoList(
207  LIST CharList);
208 
210  LIST* NormProtoList,
211  LIST ProtoList,
212  char *CharName);
213 
214 int NumberOfProtos(
215  LIST ProtoList,
216  bool CountSigProtos,
217  bool CountInsigProtos);
218 
219 
220 void allocNormProtos();
221 
222 #endif // def DISABLED_LEGACY_ENGINE
223 
224 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
void FreeTrainingSamples(LIST CharList)
void ParseArguments(int *argc, char ***argv)
MERGE_CLASS NewLabeledClass(const char *Label)
void FreeLabeledClassList(LIST ClassListList)
struct LABELEDLISTNODE * LABELEDLIST
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
void Normalize(float *Values)
#define MAX_NUM_PROTOS
Definition: intproto.h:48
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST CharSample, const char *program_feature_type)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
FEATURE_DEFS_STRUCT feature_defs
void FreeLabeledList(LABELEDLIST LabeledList)
LABELEDLIST FindList(LIST List, char *Label)
void CleanUpUnusedData(LIST ProtoList)
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void FreeNormProtoList(LIST CharList)
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void WriteTrainingSamples(const FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, LIST CharList, const char *program_feature_type)
MERGE_CLASS FindClass(LIST List, const char *Label)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
CLASS_TYPE Class
Definition: strngs.h:45
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
void allocNormProtos()
const char * GetNextFilename(int argc, const char *const *argv)
static const char * Version()
Definition: baseapi.cpp:223
CLUSTERCONFIG Config
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
ShapeTable * LoadShapeTable(const STRING &file_prefix)
LABELEDLIST NewLabeledList(const char *Label)