tesseract  5.0.0-alpha-619-ge9db
commontraining.h
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config_auto.h"
19 #endif
20 
21 #include <tesseract/baseapi.h>
22 
23 #ifdef DISABLED_LEGACY_ENGINE
24 
25 #include "tprintf.h"
26 #include "commandlineflags.h"
27 
28 
29 void ParseArguments(int* argc, char*** argv);
30 
31 
32 namespace tesseract {
33 
// Checks whether the shared tesseract library is the right one.
//
// Compares TESSERACT_VERSION_STR, as seen by the translation unit that
// includes this header, with the string returned by TessBaseAPI::Version().
// If they differ, an error is reported through tprintf and the process
// terminates via exit(1). When HAVE_CONFIG_H is not defined the check is
// compiled out and the function does nothing.
//
// This function must be inline because otherwise it would be part of
// the shared library, so it could not compare the versions.
static inline void CheckSharedLibraryVersion()
{
#ifdef HAVE_CONFIG_H
  if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
    // "was" is what the library reports; "expected" is the version string
    // this code was built with.
    tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
            "Did you use a wrong shared tesseract library?\n",
            TessBaseAPI::Version(), TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}
48 
49 } // namespace tesseract
50 
51 
52 #else
53 
54 #include "cluster.h"
55 #include "commandlineflags.h"
56 #include "featdefs.h"
57 #include "intproto.h"
58 #include "oldlist.h"
59 
60 namespace tesseract {
61 class Classify;
62 class MasterTrainer;
63 class ShapeTable;
64 }
65 
67 // Globals ///////////////////////////////////////////////////////////////////
69 
71 
72 // Must be defined in the file that "implements" commonTraining facilities.
73 extern CLUSTERCONFIG Config;
74 
76 // Structs ///////////////////////////////////////////////////////////////////
78 typedef struct
79 {
80  char *Label;
84 }
86 
87 typedef struct
88 {
89  char* Label;
90  int NumMerged[MAX_NUM_PROTOS];
94 
95 
97 // Functions /////////////////////////////////////////////////////////////////
99 void ParseArguments(int* argc, char*** argv);
100 
101 namespace tesseract {
102 
// Checks whether the shared tesseract library is the right one.
//
// Compares TESSERACT_VERSION_STR, as seen by the translation unit that
// includes this header, with the string returned by TessBaseAPI::Version().
// If they differ, an error is reported through tprintf and the process
// terminates via exit(1). When HAVE_CONFIG_H is not defined the check is
// compiled out and the function does nothing.
//
// This function must be inline because otherwise it would be part of
// the shared library, so it could not compare the versions.
static inline void CheckSharedLibraryVersion()
{
#ifdef HAVE_CONFIG_H
  if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
    // "was" is what the library reports; "expected" is the version string
    // this code was built with.
    tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
            "Did you use a wrong shared tesseract library?\n",
            TessBaseAPI::Version(), TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}
117 
118 // Helper loads shape table from the given file.
119 ShapeTable* LoadShapeTable(const STRING& file_prefix);
120 // Helper to write the shape_table.
121 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
122 
123 // Creates a MasterTrainer and loads the training data into it:
124 // Initializes feature_defs and IntegerFX.
125 // Loads the shape_table if shape_table != nullptr.
126 // Loads initial unicharset from -U command-line option.
127 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
128 // Loads font info from -F option.
129 // Loads xheights from -X option.
130 // Loads samples from .tr files in remaining command-line args.
131 // Deletes outliers and computes canonical samples.
132 // If FLAGS_output_trainer is set, saves the trainer for future use.
133 // Computes canonical and cloud features.
134 // If shape_table is not nullptr, but failed to load, make a fake flat one,
135 // as shape clustering was not run.
136 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
137  bool replication,
138  ShapeTable** shape_table,
139  STRING* file_prefix);
140 } // namespace tesseract.
141 
142 const char *GetNextFilename(int argc, const char* const * argv);
143 
145  LIST List,
146  char *Label);
147 
149  const char *Label);
150 
152  const char *feature_name, int max_samples,
153  UNICHARSET* unicharset,
154  FILE* file, LIST* training_samples);
155 
157  const FEATURE_DEFS_STRUCT &FeatureDefs,
158  char *Directory,
159  LIST CharList,
160  const char *program_feature_type);
161 
163  LIST CharList);
164 
165 void FreeLabeledList(
166  LABELEDLIST LabeledList);
167 
169  LIST ClassListList);
170 
172  const FEATURE_DEFS_STRUCT &FeatureDefs,
173  LABELEDLIST CharSample,
174  const char *program_feature_type);
175 
177  LIST ProtoList,
178  bool KeepSigProtos,
179  bool KeepInsigProtos,
180  int N);
181 
182 void CleanUpUnusedData(
183  LIST ProtoList);
184 
186  LIST ProtoList,
187  const char *label,
188  CLUSTERER *Clusterer,
190 
192  LIST List,
193  const char *Label);
194 
196  const char *Label);
197 
199  LIST CharList);
200 
201 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
202  LIST LabeledClassList);
203 
204 void Normalize(
205  float *Values);
206 
207 void FreeNormProtoList(
208  LIST CharList);
209 
211  LIST* NormProtoList,
212  LIST ProtoList,
213  char *CharName);
214 
215 int NumberOfProtos(
216  LIST ProtoList,
217  bool CountSigProtos,
218  bool CountInsigProtos);
219 
220 
221 void allocNormProtos();
222 
223 #endif // def DISABLED_LEGACY_ENGINE
224 
225 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
CLUSTERCONFIG
Definition: cluster.h:45
Normalize
void Normalize(float *Values)
Definition: commontraining.cpp:788
GetNextFilename
const char * GetNextFilename(int argc, const char *const *argv)
Definition: commontraining.cpp:323
FindClass
MERGE_CLASS FindClass(LIST List, const char *Label)
Definition: commontraining.cpp:678
tesseract::LoadShapeTable
ShapeTable * LoadShapeTable(const STRING &file_prefix)
Definition: commontraining.cpp:154
list_rec
Definition: oldlist.h:73
Config
CLUSTERCONFIG Config
Definition: commontraining.cpp:88
LABELEDLISTNODE::font_sample_count
int font_sample_count
Definition: commontraining.h:82
MERGE_CLASS_NODE
Definition: commontraining.h:87
SetUpForClustering
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST CharSample, const char *program_feature_type)
Definition: commontraining.cpp:494
ReadTrainingSamples
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
Definition: commontraining.cpp:389
STRING
Definition: strngs.h:45
NumberOfProtos
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
Definition: commontraining.cpp:839
oldlist.h
NewLabeledClass
MERGE_CLASS NewLabeledClass(const char *Label)
Definition: commontraining.cpp:692
FreeTrainingSamples
void FreeTrainingSamples(LIST CharList)
Definition: commontraining.cpp:450
CleanUpUnusedData
void CleanUpUnusedData(LIST ProtoList)
Definition: commontraining.cpp:595
FreeLabeledClassList
void FreeLabeledClassList(LIST ClassListList)
Definition: commontraining.cpp:709
baseapi.h
file
Definition: include_gunit.h:22
MAX_NUM_PROTOS
#define MAX_NUM_PROTOS
Definition: intproto.h:47
MergeInsignificantProtos
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: commontraining.cpp:528
MERGE_CLASS_NODE::Label
char * Label
Definition: commontraining.h:89
RemoveInsignificantProtos
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
Definition: commontraining.cpp:613
FEATURE_DEFS_STRUCT
Definition: featdefs.h:44
UNICHARSET
Definition: unicharset.h:145
tesseract::WriteShapeTable
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
Definition: commontraining.cpp:179
CLASS_STRUCT
Definition: protos.h:45
tesseract::TessBaseAPI::Version
static const char * Version()
Definition: baseapi.cpp:233
tesseract
Definition: baseapi.h:65
LABELEDLIST
struct LABELEDLISTNODE * LABELEDLIST
FreeNormProtoList
void FreeNormProtoList(LIST CharList)
Definition: commontraining.cpp:805
FreeLabeledList
void FreeLabeledList(LABELEDLIST LabeledList)
Definition: commontraining.cpp:476
WriteTrainingSamples
void WriteTrainingSamples(const FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, LIST CharList, const char *program_feature_type)
tprintf.h
SetUpForFloat2Int
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
Definition: commontraining.cpp:725
LABELEDLISTNODE
Definition: commontraining.h:78
MERGE_CLASS_NODE::Class
CLASS_TYPE Class
Definition: commontraining.h:91
allocNormProtos
void allocNormProtos()
cluster.h
tesseract::LoadTrainingData
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:211
NewLabeledList
LABELEDLIST NewLabeledList(const char *Label)
Definition: commontraining.cpp:361
AddToNormProtosList
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
Definition: commontraining.cpp:821
featdefs.h
CLUSTERER
Definition: cluster.h:81
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
FindList
LABELEDLIST FindList(LIST List, char *Label)
Definition: commontraining.cpp:340
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
LABELEDLISTNODE::SampleCount
int SampleCount
Definition: commontraining.h:81
feature_defs
FEATURE_DEFS_STRUCT feature_defs
Definition: commontraining.cpp:89
commandlineflags.h
intproto.h
LABELEDLISTNODE::Label
char * Label
Definition: commontraining.h:80
LABELEDLISTNODE::List
LIST List
Definition: commontraining.h:83