24 #include "config_auto.h"
30 #include "allheaders.h"
56 bool replicate_samples,
58 : norm_mode_(norm_mode), samples_(fontinfo_table_),
59 junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
61 enable_shape_anaylsis_(shape_analysis),
62 enable_replication_(replicate_samples),
63 fragments_(
NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
68 for (
int p = 0; p < page_images_.
size(); ++p)
69 pixDestroy(&page_images_[p]);
76 if (fwrite(&norm_mode_,
sizeof(norm_mode_), 1, fp) != 1)
return false;
78 if (!feature_space_.
Serialize(fp))
return false;
79 if (!samples_.
Serialize(fp))
return false;
80 if (!junk_samples_.
Serialize(fp))
return false;
81 if (!verify_samples_.
Serialize(fp))
return false;
82 if (!master_shapes_.
Serialize(fp))
return false;
83 if (!flat_shapes_.
Serialize(fp))
return false;
84 if (!fontinfo_table_.
Serialize(fp))
return false;
85 if (!xheights_.
Serialize(fp))
return false;
92 if (fread(&norm_mode_,
sizeof(norm_mode_), 1, fp) != 1)
return false;
94 ReverseN(&norm_mode_,
sizeof(norm_mode_));
97 charsetsize_ = unicharset_.
size();
98 if (!feature_space_.
DeSerialize(swap, fp))
return false;
99 feature_map_.
Init(feature_space_);
101 if (!junk_samples_.
DeSerialize(swap, fp))
return false;
102 if (!verify_samples_.
DeSerialize(swap, fp))
return false;
103 if (!master_shapes_.
DeSerialize(swap, fp))
return false;
104 if (!flat_shapes_.
DeSerialize(swap, fp))
return false;
105 if (!fontinfo_table_.
DeSerialize(swap, fp))
return false;
106 if (!xheights_.
DeSerialize(swap, fp))
return false;
113 tprintf(
"Failed to load unicharset from file %s\n"
114 "Building unicharset for training from scratch...\n",
122 charsetsize_ = unicharset_.
size();
123 delete [] fragments_;
124 fragments_ =
new int[charsetsize_];
125 memset(fragments_, 0,
sizeof(*fragments_) * charsetsize_);
145 FILE* fp =
Efopen(page_name,
"rb");
147 tprintf(
"Failed to open tr file: %s\n", page_name);
151 while (fgets(buffer,
sizeof(buffer), fp) !=
NULL) {
152 if (buffer[0] ==
'\n')
155 char* space = strchr(buffer,
' ');
157 tprintf(
"Bad format in tr file, reading fontname, unichar\n");
162 if (font_id < 0) font_id = 0;
167 tprintf(
"Bad format in tr file, reading box coords\n");
176 cn_feature_type, geo_feature_type, char_desc);
180 charsetsize_ = unicharset_.
size();
189 verify_samples_.
AddSample(unichar, sample);
190 prev_unichar_id_ = -1;
192 if (prev_unichar_id_ >= 0)
193 fragments_[prev_unichar_id_] = -1;
194 prev_unichar_id_ = samples_.
AddSample(unichar, sample);
198 int junk_id = junk_samples_.
AddSample(unichar, sample);
199 if (prev_unichar_id_ >= 0) {
202 if (fragments_[prev_unichar_id_] == 0)
203 fragments_[prev_unichar_id_] = junk_id;
204 else if (fragments_[prev_unichar_id_] != junk_id)
205 fragments_[prev_unichar_id_] = -1;
209 prev_unichar_id_ = -1;
219 for (page = 0; (pix = pixReadTiff(filename, page)) !=
NULL; ++page) {
222 tprintf(
"Loaded %d page images from %s\n", page, filename);
231 if (debug_level_ > 0)
232 tprintf(
"PostLoadCleanup...\n");
233 if (enable_shape_anaylsis_)
234 ReplaceFragmentedSamples();
245 if (debug_level_ > 0)
246 tprintf(
"ComputeCanonicalSamples...\n");
254 if (debug_level_ > 0)
255 tprintf(
"PreTrainingSetup...\n");
258 if (debug_level_ > 0)
259 tprintf(
"ComputeCloudFeatures...\n");
266 tprintf(
"Building master shape table\n");
267 int num_fonts = samples_.
NumFonts();
274 for (
int f = 0; f < num_fonts; ++f) {
278 ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
282 if (fragment ==
NULL)
283 char_shapes.AppendMasterShapes(shapes,
NULL);
285 char_shapes_begin_fragment.AppendMasterShapes(shapes,
NULL);
287 char_shapes_end_fragment.AppendMasterShapes(shapes,
NULL);
289 char_shapes.AppendMasterShapes(shapes,
NULL);
291 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
292 kFontMergeDistance, &char_shapes_begin_fragment);
293 char_shapes.AppendMasterShapes(char_shapes_begin_fragment,
NULL);
294 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
295 kFontMergeDistance, &char_shapes_end_fragment);
296 char_shapes.AppendMasterShapes(char_shapes_end_fragment,
NULL);
297 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
298 kFontMergeDistance, &char_shapes);
319 tprintf(
"Moving %d junk samples to master sample set.\n", num_junks);
320 for (
int s = 0; s < num_junks; ++s) {
325 if (sample_id == INVALID_UNICHAR_ID)
341 if (enable_replication_) {
342 if (debug_level_ > 0)
343 tprintf(
"ReplicateAndRandomize...\n");
353 FILE* fp = fopen(filename,
"rb");
355 fprintf(stderr,
"Failed to load font_properties from %s\n", filename);
358 int italic, bold, fixed, serif, fraktur;
361 char* font_name =
new char[1024];
362 fontinfo.
name = font_name;
365 if (
tfscanf(fp,
"%1024s %i %i %i %i %i\n", font_name,
366 &italic, &bold, &fixed, &serif, &fraktur) != 6)
374 if (!fontinfo_table_.
contains(fontinfo)) {
385 tprintf(
"fontinfo table is of size %d\n", fontinfo_table_.
size());
387 if (filename ==
NULL)
return true;
388 FILE *f = fopen(filename,
"rb");
390 fprintf(stderr,
"Failed to load font xheights from %s\n", filename);
393 tprintf(
"Reading x-heights from %s ...\n", filename);
399 int total_xheight = 0;
400 int xheight_count = 0;
402 if (
tfscanf(f,
"%1023s %d\n", buffer, &xht) != 2)
405 fontinfo.
name = buffer;
406 if (!fontinfo_table_.
contains(fontinfo))
continue;
407 int fontinfo_id = fontinfo_table_.
get_index(fontinfo);
408 xheights_[fontinfo_id] = xht;
409 total_xheight += xht;
412 if (xheight_count == 0) {
413 fprintf(stderr,
"No valid xheights in %s!\n", filename);
417 int mean_xheight =
DivRounded(total_xheight, xheight_count);
418 for (
int i = 0; i < fontinfo_table_.
size(); ++i) {
419 if (xheights_[i] < 0)
420 xheights_[i] = mean_xheight;
428 FILE* fontinfo_file = fopen(filename,
"rb");
429 if (fontinfo_file ==
NULL)
433 if (fontinfo_id < 0) {
434 tprintf(
"No font found matching fontinfo filename %s\n", filename);
435 fclose(fontinfo_file);
438 tprintf(
"Reading spacing from %s for font %d...\n", filename, fontinfo_id);
445 int x_gap, x_gap_before, x_gap_after, num_kerned;
450 for (
int l = 0; l < num_unichars; ++l) {
451 if (
tfscanf(fontinfo_file,
"%s %d %d %d",
452 uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
453 tprintf(
"Bad format of font spacing file %s\n", filename);
454 fclose(fontinfo_file);
463 for (
int k = 0; k < num_kerned; ++k) {
464 if (
tfscanf(fontinfo_file,
"%s %d", kerned_uch, &x_gap) != 2) {
465 tprintf(
"Bad format of font spacing file %s\n", filename);
466 fclose(fontinfo_file);
477 fclose(fontinfo_file);
486 fontinfo.
name =
const_cast<char*
>(font_name);
489 return fontinfo_table_.
get_index(fontinfo);
495 int fontinfo_id = -1;
497 for (
int f = 0; f < fontinfo_table_.
size(); ++f) {
498 if (strstr(filename, fontinfo_table_.
get(f).name) !=
NULL) {
499 int len = strlen(fontinfo_table_.
get(f).name);
501 if (len > best_len) {
517 int num_shapes = flat_shapes_.
NumShapes();
518 for (
int s = 0; s < num_shapes; ++s) {
519 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
521 for (f = 0; f < active_fonts.
size(); ++f) {
522 if (active_fonts[f] == font)
525 if (f == active_fonts.
size())
529 int num_fonts = active_fonts.
size();
530 for (
int f = 0; f < num_fonts; ++f) {
531 for (
int s = num_shapes - 1; s >= 0; --s) {
532 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
533 if (font == active_fonts[f]) {
557 shape_map.
SetMap(shape_id,
true);
562 it.
Init(&shape_map, &shape_table,
false, &samples_);
567 for (
int i = sample_ptrs.
size() - 1; i >= 0; --i) {
570 for (
int f = 0; f < num_features; ++f)
574 *num_samples = sample_id;
586 const char* inttemp_file,
587 const char* pffmtable_file) {
593 FILE* fp = fopen(inttemp_file,
"wb");
603 for (
int c = 0; c < unicharset.
size(); ++c)
606 for (
int i = 0; i < int_templates->
NumClasses; ++i) {
611 for (
int config_id = 0; config_id < Class->
NumConfigs; config_id++) {
615 if (length > max_length)
617 int shape_id = float_classes[i].
font_set.
get(config_id);
619 for (
int c = 0; c < shape.
size(); ++c) {
620 int unichar_id = shape[c].unichar_id;
621 if (length > unichar_cutoffs[unichar_id])
622 unichar_cutoffs[unichar_id] = length;
625 shapetable_cutoffs.
push_back(max_length);
627 fp = fopen(pffmtable_file,
"wb");
629 for (
int c = 0; c < unicharset.
size(); ++c) {
631 if (strcmp(unichar,
" ") == 0) {
634 fprintf(fp,
"%s %d\n", unichar, unichar_cutoffs[c]);
644 const char* unichar_str2) {
647 if (class_id2 == INVALID_UNICHAR_ID)
648 class_id2 = class_id1;
649 if (class_id1 == INVALID_UNICHAR_ID) {
650 tprintf(
"No unicharset entry found for %s\n", unichar_str1);
653 tprintf(
"Font ambiguities for unichar %d = %s and %d = %s\n",
654 class_id1, unichar_str1, class_id2, unichar_str2);
656 int num_fonts = samples_.
NumFonts();
661 for (
int f = 0; f < num_fonts; ++f) {
667 for (
int f1 = 0; f1 < num_fonts; ++f1) {
672 for (
int f2 = 0; f2 < num_fonts; ++f2) {
683 for (
int f = 0; f < num_fonts; ++f) {
686 if (class_id1 != class_id2 &&
692 #ifndef GRAPHICS_DISABLED
704 const char* unichar_str2,
705 int canonical_font) {
712 if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
720 if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
722 for (
int f = 0; f < cloud.
size(); ++f) {
740 if (feature_index >= 0) {
754 #endif // GRAPHICS_DISABLED
760 sample_it.
Init(
NULL,
NULL, replicate_samples, &samples_);
763 page_images_, &sample_it);
770 bool replicate_samples,
773 TestClassifier(error_mode, report_level, replicate_samples, &samples_,
774 test_classifier, report_string);
792 bool replicate_samples,
798 if (report_level > 0) {
802 tprintf(
"Iterator has charset size of %d/%d, %d shapes, %d samples\n",
805 tprintf(
"Testing %sREPLICATED:\n", replicate_samples ?
"" :
"NON-");
807 double unichar_error = 0.0;
809 error_mode, fontinfo_table_,
810 page_images_, &sample_it, &unichar_error,
811 NULL, report_string);
812 return unichar_error;
821 int num_chars1 = shape1.
size();
822 int num_chars2 = shape2.
size();
823 float dist_sum = 0.0f;
825 if (num_chars1 > 1 || num_chars2 > 1) {
828 for (
int c1 = 0; c1 < num_chars1; ++c1) {
829 for (
int c2 = 0; c2 < num_chars2; ++c2) {
842 return dist_sum / dist_count;
847 void MasterTrainer::ReplaceFragmentedSamples() {
848 if (fragments_ ==
NULL)
return;
852 for (
int s = 0; s < num_samples; ++s) {
854 if (fragments_[sample->
class_id()] > 0)
866 bool* good_junk =
new bool[frag_set.
size()];
867 memset(good_junk, 0,
sizeof(*good_junk) * frag_set.
size());
868 for (
int dead_ch = 1; dead_ch < unicharset_.
size(); ++dead_ch) {
869 int frag_ch = fragments_[dead_ch];
870 if (frag_ch <= 0)
continue;
874 for (
int part = 0; part < frag->
get_total(); ++part) {
877 if (good_ch != INVALID_UNICHAR_ID)
878 good_junk[good_ch] =
true;
885 for (
int s = 0; s < num_junks; ++s) {
902 delete [] fragments_;
912 void MasterTrainer::ClusterShapes(
int min_shapes,
int max_shape_unichars,
915 int max_merges = num_shapes - min_shapes;
921 tprintf(
"Computing shape distances...");
922 for (
int s1 = 0; s1 < num_shapes; ++s1) {
923 for (
int s2 = s1 + 1; s2 < num_shapes; ++s2) {
936 while (num_merged < max_merges && min_dist < max_dist) {
937 tprintf(
"Distance = %f: ", min_dist);
939 shape_dists[min_s1][min_s2 - min_s1 - 1].distance =
kInfiniteDist;
940 if (num_unichars > max_shape_unichars) {
941 tprintf(
"Merge of %d and %d with %d would exceed max of %d unichars\n",
942 min_s1, min_s2, num_unichars, max_shape_unichars);
945 shape_dists[min_s2].
clear();
948 for (
int s = 0; s < min_s1; ++s) {
949 if (!shape_dists[s].empty()) {
950 shape_dists[s][min_s1 - s - 1].distance =
955 for (
int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
956 if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
957 shape_dists[min_s1][s2 - min_s1 - 1].distance =
960 for (
int s = min_s1 + 1; s < min_s2; ++s) {
961 if (!shape_dists[s].empty()) {
967 for (
int s1 = 0; s1 < num_shapes; ++s1) {
968 for (
int i = 0; i < shape_dists[s1].
size(); ++i) {
969 if (shape_dists[s1][i].distance < min_dist) {
970 min_dist = shape_dists[s1][i].distance;
977 tprintf(
"Stopped with %d merged, min dist %f\n", num_merged, min_dist);
978 delete [] shape_dists;
979 if (debug_level_ > 1) {
980 for (
int s1 = 0; s1 < num_shapes; ++s1) {
void Init(int size, bool all_mapped)
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
bool Serialize(FILE *fp) const
bool LoadFontInfo(const char *filename)
void LoadPageImages(const char *filename)
void OrganizeByFontAndClass()
virtual const ShapeTable * GetShapeTable() const =0
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void set_bounding_box(const TBOX &box)
bool save_to_file(const char *const filename) const
FILE * Efopen(const char *Name, const char *Mode)
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
void ComputeCloudFeatures(int feature_space_size)
const INT_FEATURE_STRUCT * features() const
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
void FreeCharDescription(CHAR_DESC CharDesc)
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
int CompactCharsetSize() const
const float kInfiniteDist
int tfscanf(FILE *stream, const char *format,...)
const char * kIntFeatureType
void ReplicateAndRandomizeSamples()
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
void SetupFlatShapeTable(ShapeTable *shape_table)
bool Serialize(FILE *fp) const
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
bool DeSerialize(bool swap, FILE *fp)
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
STRING SummaryStr() const
bool load_from_file(const char *const filename, bool skip_fragments)
static CHAR_FRAGMENT * parse_from_string(const char *str)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
bool DeSerialize(bool swap, FILE *fp)
void SetMap(int sparse_index, bool mapped)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
UnicityTableEqEq< int > font_set
const int kMaxUnicharsPerCluster
void AddToShape(int unichar_id, int font_id)
bool Serialize(FILE *fp) const
const char * kCNFeatureType
uinT16 ConfigLengths[MAX_NUM_CONFIGS]
bool Serialize(FILE *fp) const
const IntFeatureSpace & feature_space() const
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
void KillSample(TrainingSample *sample)
TrainingSample * extract_sample(int index)
void IndexFeatures(const IntFeatureSpace &feature_space)
void AppendOtherUnicharset(const UNICHARSET &src)
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
int MasterDestinationIndex(int shape_id) const
void MergeShapes(int shape_id1, int shape_id2)
void ComputeCanonicalFeatures()
int GetFontInfoId(const char *font_name)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
int SparseCharsetSize() const
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
const int kMinClusteredShapes
GenericVector< UNICHAR_ID > kerned_unichar_ids
FEATURE_DEFS_STRUCT feature_defs
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
void Init(const IntFeatureSpace &feature_space)
const char *const id_to_unichar(UNICHAR_ID id) const
void init_to_size(int size, T t)
bool DeSerialize(bool swap, FILE *fp)
void LoadUnicharset(const char *filename)
SVEvent * AwaitEvent(SVEventType type)
void set_class_id(int id)
bool LoadXHeights(const char *filename)
int NumClassSamples(int font_id, int class_id, bool randomize) const
bool Serialize(FILE *fp) const
void ReplicateAndRandomizeSamplesIfRequired()
const BitVector & GetCloudFeatures(int font_id, int class_id) const
void set_page_num(int page)
const char * kGeoFeatureType
const char * kMicroFeatureType
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
int GetBestMatchingFontInfoId(const char *filename)
void LoadUnicharset(const char *filename)
UnicityTable< FontInfo > & get_fontinfo_table()
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
static STRING to_string(const char *unichar, int pos, int total, bool natural)
void MoveTo(UnicityTable< FontInfo > *target)
#define ClassForClassId(T, c)
bool contains(T object) const
bool Serialize(FILE *fp) const
void ExtractCharDesc(int feature_type, int micro_type, int cn_type, int geo_type, CHAR_DESC_STRUCT *char_desc)
STRING DebugStr(int shape_id) const
const T & get(int id) const
Return the object from an id.
const TrainingSample & GetSample() const
void ReverseN(void *ptr, int num_bytes)
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
UNICHAR_ID class_id() const
int DivRounded(int a, int b)
int AddSample(const char *unichar, TrainingSample *sample)
int MergedUnicharCount(int shape_id1, int shape_id2) const
double NormalizeSamples()
void free_int_templates(INT_TEMPLATES templates)
const UNICHARSET & unicharset() const
bool DeSerialize(bool swap, FILE *fp)
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)
int FindShape(int unichar_id, int font_id) const
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
bool DeSerialize(bool swap, FILE *fp)
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
bool contains_unichar(const char *const unichar_repr) const
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
void init_spacing(int unicharset_size)
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
const MicroFeature * micro_features() const
int AddShape(int unichar_id, int font_id)
const float kFontMergeDistance
TrainingSample * mutable_sample(int index)
const PARAM_DESC * ParamDesc
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
const char * string() const
const Shape & GetShape(int shape_id) const
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
int get_index(T object) const
int num_micro_features() const
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
int XYToFeatureIndex(int x, int y) const
bool is_beginning() const
bool AddSpacingInfo(const char *filename)
GenericVector< inT16 > kerned_x_gaps
bool DeSerialize(bool swap, FILE *fp)