tesseract
5.0.0-alpha-619-ge9db
|
Go to the documentation of this file.
14 #define _USE_MATH_DEFINES // for M_PI
19 #ifdef DISABLED_LEGACY_ENGINE
51 usage +=
" -v | --version | ";
54 usage +=
" [.tr files ...]";
60 #include "allheaders.h"
93 static INT_PARAM_FLAG(load_images, 0,
"Load images with tr files");
103 "Min number of samples per proto as % of total");
105 "Max percentage of samples in a cluster which have more"
106 " than 1 feature in that cluster");
108 "Desired independence between dimensions");
110 "Desired confidence in prototypes created");
126 usage +=
" -v | --version | ";
129 usage +=
" [.tr files ...]";
136 std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_min_samples_fraction)));
138 std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_max_illegal)));
140 std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_independence)));
142 std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_confidence)));
144 if (!FLAGS_configfile.empty()) {
146 FLAGS_configfile.c_str(),
156 STRING shape_table_file = file_prefix;
157 shape_table_file += kShapeTableFileSuffix;
159 if (shape_fp.
Open(shape_table_file.
c_str(),
nullptr)) {
163 shape_table =
nullptr;
164 tprintf(
"Error: Failed to read shape table %s\n",
165 shape_table_file.
c_str());
167 int num_shapes = shape_table->
NumShapes();
168 tprintf(
"Read shape table %s of %d shapes\n",
169 shape_table_file.
c_str(), num_shapes);
172 tprintf(
"Warning: No shape table file present: %s\n",
173 shape_table_file.
c_str());
180 STRING shape_table_file = file_prefix;
181 shape_table_file += kShapeTableFileSuffix;
182 FILE* fp = fopen(shape_table_file.
c_str(),
"wb");
185 fprintf(stderr,
"Error writing shape table: %s\n",
186 shape_table_file.
c_str());
190 fprintf(stderr,
"Error creating shape table: %s\n",
191 shape_table_file.
c_str());
218 if (!FLAGS_D.empty()) {
219 *file_prefix += FLAGS_D.
c_str();
226 bool shape_analysis =
false;
227 if (shape_table !=
nullptr) {
229 if (*shape_table !=
nullptr) shape_analysis =
true;
231 shape_analysis =
true;
241 if (!FLAGS_F.empty()) {
247 if (!FLAGS_X.empty()) {
254 const char* page_name;
257 tprintf(
"Reading %s ...\n", page_name);
262 int pagename_len = strlen(page_name);
263 char* fontinfo_file_name =
new char[pagename_len + 7];
264 strncpy(fontinfo_file_name, page_name, pagename_len - 2);
265 strcpy(fontinfo_file_name + pagename_len - 2,
"fontinfo");
267 delete[] fontinfo_file_name;
270 if (FLAGS_load_images) {
271 STRING image_name = page_name;
280 if (!FLAGS_output_trainer.empty()) {
281 FILE* fp = fopen(FLAGS_output_trainer.c_str(),
"wb");
283 tprintf(
"Can't create saved trainer data!\n");
290 if (!FLAGS_O.empty() &&
292 fprintf(stderr,
"Failed to save unicharset to file %s\n", FLAGS_O.c_str());
296 if (shape_table !=
nullptr) {
299 if (*shape_table ==
nullptr) {
302 tprintf(
"Flat shape table summary: %s\n",
303 (*shape_table)->SummaryStr().c_str());
305 (*shape_table)->set_unicharset(trainer->
unicharset());
346 if (strcmp (LabeledList->
Label, Label) == 0)
347 return (LabeledList);
365 LabeledList->
Label = static_cast<char*>(
Emalloc (strlen (Label)+1));
366 strcpy (LabeledList->
Label, Label);
370 return (LabeledList);
390 const char *feature_name,
int max_samples,
392 FILE*
file,
LIST* training_samples) {
398 uint32_t feature_type =
402 LIST it = *training_samples;
404 char_sample = reinterpret_cast<LABELEDLIST>(
first_node(it));
408 while (fgets(buffer, 2048,
file) !=
nullptr) {
409 if (buffer[0] ==
'\n')
412 sscanf(buffer,
"%*s %s", unichar);
416 tprintf(
"Error: Size of unicharset in training is "
417 "greater than MAX_NUM_CLASSES\n");
421 char_sample =
FindList(*training_samples, unichar);
422 if (char_sample ==
nullptr) {
424 *training_samples =
push(*training_samples, char_sample);
427 feature_samples = char_desc->
FeatureSets[feature_type];
429 char_sample->
List =
push(char_sample->
List, feature_samples);
436 if (feature_type != i)
455 LIST nodes = CharList;
458 FeatureList = char_sample->
List;
478 free(LabeledList->
Label);
496 const char* program_feature_type) {
499 float* Sample =
nullptr;
502 LIST FeatureList =
nullptr;
510 FeatureList = char_sample->
List;
515 if (Sample ==
nullptr) Sample = static_cast<float*>(
Emalloc(N *
sizeof(
float)));
516 for (j = 0; j < N; j++)
532 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
534 LIST pProtoList = ProtoList;
539 float best_dist = 0.125;
542 LIST list_it = ProtoList;
545 if (test_p != Prototype && !test_p->
Merged) {
549 if (dist < best_dist) {
555 if (best_match !=
nullptr && !best_match->
Significant) {
557 tprintf(
"Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
559 best_match->
Mean[0], best_match->
Mean[1],
560 Prototype->
Mean[0], Prototype->
Mean[1]);
569 }
else if (best_match !=
nullptr) {
571 tprintf(
"Red proto at %g,%g matched a green one at %g,%g\n",
572 Prototype->
Mean[0], Prototype->
Mean[1],
573 best_match->
Mean[0], best_match->
Mean[1]);
580 pProtoList = ProtoList;
587 tprintf(
"Red proto at %g,%g becoming green\n",
588 Prototype->
Mean[0], Prototype->
Mean[1]);
616 bool KeepInsigProtos,
626 pProtoList = ProtoList;
635 NewProto->
Mean = static_cast<float *>(
Emalloc(N *
sizeof(
float)));
642 for (i=0; i < N; i++)
646 for (i=0; i < N; i++)
654 for (i=0; i < N; i++)
662 for (i=0; i < N; i++)
670 NewProtoList =
push_last(NewProtoList, NewProto);
674 return (NewProtoList);
684 if (strcmp (MergeClass->
Label, Label) == 0)
696 MergeClass->
Label = static_cast<char*>(
Emalloc (strlen (Label)+1));
697 strcpy (MergeClass->
Label, Label);
712 LIST nodes = ClassList;
716 free (MergeClass->
Label);
726 LIST LabeledClassList) {
753 for(i=0; i < NumProtos; i++)
757 Values[0] = OldProto->
X;
758 Values[1] = OldProto->
Y;
759 Values[2] = OldProto->
Angle;
761 NewProto->
X = OldProto->
X;
762 NewProto->
Y = OldProto->
Y;
765 NewProto->
A = Values[0];
766 NewProto->
B = Values[1];
767 NewProto->
C = Values[2];
774 NumWords = WordsInVectorOfSize(NumProtos);
775 for(i=0; i < NumConfigs; i++)
777 NewConfig = NewBitVector(NumProtos);
779 for(j=0; j < NumWords; j++)
780 NewConfig[j] = OldConfig[j];
784 return float_classes;
795 Slope = tan(Values [2] * 2 * M_PI);
796 Intercept = Values [1] - Slope * Values [0];
797 Normalizer = 1 / sqrt (Slope * Slope + 1.0);
799 Values [0] = Slope * Normalizer;
800 Values [1] = - Normalizer;
801 Values [2] = Intercept * Normalizer;
810 LIST nodes = CharList;
833 LabeledProtoList->
List =
push(LabeledProtoList->
List, Proto);
835 *NormProtoList =
push(*NormProtoList, LabeledProtoList);
840 bool CountInsigProtos) {
852 #endif // def DISABLED_LEGACY_ENGINE
LABELEDLIST FindList(LIST List, char *Label)
bool Serialize(FILE *fp) const
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void Normalize(float *Values)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
bool Serialize(FILE *fp) const
#define DOUBLE_PARAM_FLAG(name, val, comment)
STRING_PARAM_FLAG(D, "", "Directory to write output files to")
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
ShapeTable * LoadShapeTable(const STRING &file_prefix)
void LoadUnicharset(const char *filename)
void FreeLabeledClassList(LIST ClassList)
void FreeProtoList(LIST *ProtoList)
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
void truncate_at(int32_t index)
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
MERGE_CLASS FindClass(LIST List, const char *Label)
void SetFeatureSpace(const IntFeatureSpace &fs)
LABELEDLIST NewLabeledList(const char *Label)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
bool DeSerialize(TFile *fp)
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
#define ProtoIn(Class, Pid)
bool Open(const STRING &filename, FileReader reader)
bool save_to_file(const char *const filename) const
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
const int kBoostDirBuckets
const int kBoostXYBuckets
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
bool AddSpacingInfo(const char *filename)
const char * c_str() const
bool LoadFontInfo(const char *filename)
const UNICHARSET & unicharset() const
void CleanUpUnusedData(LIST ProtoList)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
UnicityTableEqEq< int > font_set
FEATURE_DEFS_STRUCT feature_defs
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
const char * GetNextFilename(int argc, const char *const *argv)
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
LIST push(LIST list, void *element)
const PARAM_DESC * ParamDesc
void FreeTrainingSamples(LIST CharList)
bool LoadXHeights(const char *filename)
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void ParseArguments(int *argc, char ***argv)
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
bool contains_unichar(const char *const unichar_repr) const
void FreeFeatureSet(FEATURE_SET FeatureSet)
void FreeClass(CLASS_TYPE Class)
MERGE_CLASS NewLabeledClass(const char *Label)
DLLSYM void tprintf(const char *format,...)
void FreeLabeledList(LABELEDLIST LabeledList)
void SetupFlatShapeTable(ShapeTable *shape_table)
void LoadPageImages(const char *filename)
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
LIST push_last(LIST list, void *item)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
void FreeNormProtoList(LIST CharList)
void move(UnicityTable< T > *from)