21 #include "config_auto.h"
27 #include "allheaders.h"
52 bool replicate_samples,
54 : norm_mode_(norm_mode), samples_(fontinfo_table_),
55 junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
57 enable_shape_analysis_(shape_analysis),
58 enable_replication_(replicate_samples),
59 fragments_(nullptr), prev_unichar_id_(-1), debug_level_(debug_level) {
64 for (
int p = 0; p < page_images_.
size(); ++p)
65 pixDestroy(&page_images_[p]);
72 uint32_t value = norm_mode_;
75 if (!feature_space_.
Serialize(fp))
return false;
76 if (!samples_.
Serialize(fp))
return false;
77 if (!junk_samples_.
Serialize(fp))
return false;
78 if (!verify_samples_.
Serialize(fp))
return false;
79 if (!master_shapes_.
Serialize(fp))
return false;
80 if (!flat_shapes_.
Serialize(fp))
return false;
81 if (!fontinfo_table_.
Serialize(fp))
return false;
82 if (!xheights_.
Serialize(fp))
return false;
89 tprintf(
"Failed to load unicharset from file %s\n"
90 "Building unicharset for training from scratch...\n",
98 charsetsize_ = unicharset_.
size();
100 fragments_ =
new int[charsetsize_];
101 memset(fragments_, 0,
sizeof(*fragments_) * charsetsize_);
121 FILE* fp = fopen(page_name,
"rb");
123 tprintf(
"Failed to open tr file: %s\n", page_name);
127 while (fgets(buffer,
sizeof(buffer), fp) !=
nullptr) {
128 if (buffer[0] ==
'\n')
131 char* space = strchr(buffer,
' ');
132 if (space ==
nullptr) {
133 tprintf(
"Bad format in tr file, reading fontname, unichar\n");
138 if (font_id < 0) font_id = 0;
143 tprintf(
"Bad format in tr file, reading box coords\n");
148 sample->set_font_id(font_id);
149 sample->set_page_num(page_number + page_images_.
size());
150 sample->set_bounding_box(bounding_box);
151 sample->ExtractCharDesc(int_feature_type, micro_feature_type,
152 cn_feature_type, geo_feature_type, char_desc);
156 charsetsize_ = unicharset_.
size();
166 prev_unichar_id_ = -1;
168 if (prev_unichar_id_ >= 0)
169 fragments_[prev_unichar_id_] = -1;
175 if (prev_unichar_id_ >= 0) {
178 if (fragments_[prev_unichar_id_] == 0)
179 fragments_[prev_unichar_id_] = junk_id;
180 else if (fragments_[prev_unichar_id_] != junk_id)
181 fragments_[prev_unichar_id_] = -1;
185 prev_unichar_id_ = -1;
196 for (page = 0;; page++) {
197 pix = pixReadFromMultipageTiff(filename, &offset);
202 tprintf(
"Loaded %d page images from %s\n", page, filename);
211 if (debug_level_ > 0)
212 tprintf(
"PostLoadCleanup...\n");
213 if (enable_shape_analysis_)
214 ReplaceFragmentedSamples();
216 sample_it.
Init(
nullptr,
nullptr,
true, &verify_samples_);
225 if (debug_level_ > 0)
226 tprintf(
"ComputeCanonicalSamples...\n");
234 if (debug_level_ > 0)
235 tprintf(
"PreTrainingSetup...\n");
238 if (debug_level_ > 0)
239 tprintf(
"ComputeCloudFeatures...\n");
246 tprintf(
"Building master shape table\n");
247 const int num_fonts = samples_.
NumFonts();
254 for (
int f = 0; f < num_fonts; ++f) {
262 if (fragment ==
nullptr)
263 char_shapes.AppendMasterShapes(shapes,
nullptr);
265 char_shapes_begin_fragment.AppendMasterShapes(shapes,
nullptr);
267 char_shapes_end_fragment.AppendMasterShapes(shapes,
nullptr);
269 char_shapes.AppendMasterShapes(shapes,
nullptr);
273 char_shapes.AppendMasterShapes(char_shapes_begin_fragment,
nullptr);
276 char_shapes.AppendMasterShapes(char_shapes_end_fragment,
nullptr);
299 tprintf(
"Moving %d junk samples to master sample set.\n", num_junks);
300 for (
int s = 0; s < num_junks; ++s) {
302 int junk_id =
sample->class_id();
305 if (sample_id == INVALID_UNICHAR_ID)
307 sample->set_class_id(sample_id);
321 if (enable_replication_) {
322 if (debug_level_ > 0)
323 tprintf(
"ReplicateAndRandomize...\n");
333 FILE* fp = fopen(filename,
"rb");
335 fprintf(stderr,
"Failed to load font_properties from %s\n", filename);
338 int italic, bold, fixed, serif, fraktur;
341 char* font_name =
new char[1024];
342 fontinfo.
name = font_name;
345 if (
tfscanf(fp,
"%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
346 &fixed, &serif, &fraktur) != 6) {
356 if (!fontinfo_table_.
contains(fontinfo)) {
369 tprintf(
"fontinfo table is of size %d\n", fontinfo_table_.
size());
371 if (filename ==
nullptr)
return true;
372 FILE *f = fopen(filename,
"rb");
374 fprintf(stderr,
"Failed to load font xheights from %s\n", filename);
377 tprintf(
"Reading x-heights from %s ...\n", filename);
383 int total_xheight = 0;
384 int xheight_count = 0;
386 if (
tfscanf(f,
"%1023s %d\n", buffer, &xht) != 2)
389 fontinfo.
name = buffer;
390 if (!fontinfo_table_.
contains(fontinfo))
continue;
391 int fontinfo_id = fontinfo_table_.
get_index(fontinfo);
392 xheights_[fontinfo_id] = xht;
393 total_xheight += xht;
396 if (xheight_count == 0) {
397 fprintf(stderr,
"No valid xheights in %s!\n", filename);
401 int mean_xheight =
DivRounded(total_xheight, xheight_count);
402 for (
int i = 0; i < fontinfo_table_.
size(); ++i) {
403 if (xheights_[i] < 0)
404 xheights_[i] = mean_xheight;
412 FILE* fontinfo_file = fopen(filename,
"rb");
413 if (fontinfo_file ==
nullptr)
417 if (fontinfo_id < 0) {
418 tprintf(
"No font found matching fontinfo filename %s\n", filename);
419 fclose(fontinfo_file);
422 tprintf(
"Reading spacing from %s for font %d...\n", filename, fontinfo_id);
429 int x_gap, x_gap_before, x_gap_after, num_kerned;
434 for (
int l = 0; l < num_unichars; ++l) {
435 if (
tfscanf(fontinfo_file,
"%s %d %d %d",
436 uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
437 tprintf(
"Bad format of font spacing file %s\n", filename);
438 fclose(fontinfo_file);
444 spacing->
x_gap_before = static_cast<int16_t>(x_gap_before * scale);
445 spacing->
x_gap_after = static_cast<int16_t>(x_gap_after * scale);
447 for (
int k = 0; k < num_kerned; ++k) {
448 if (
tfscanf(fontinfo_file,
"%s %d", kerned_uch, &x_gap) != 2) {
449 tprintf(
"Bad format of font spacing file %s\n", filename);
450 fclose(fontinfo_file);
461 fclose(fontinfo_file);
470 fontinfo.
name = const_cast<char*>(font_name);
473 return fontinfo_table_.
get_index(fontinfo);
479 int fontinfo_id = -1;
481 for (
int f = 0; f < fontinfo_table_.
size(); ++f) {
482 if (strstr(filename, fontinfo_table_.
get(f).name) !=
nullptr) {
483 int len = strlen(fontinfo_table_.
get(f).name);
485 if (len > best_len) {
501 int num_shapes = flat_shapes_.
NumShapes();
502 for (
int s = 0; s < num_shapes; ++s) {
503 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
505 for (f = 0; f < active_fonts.
size(); ++f) {
506 if (active_fonts[f] == font)
509 if (f == active_fonts.
size())
513 int num_fonts = active_fonts.
size();
514 for (
int f = 0; f < num_fonts; ++f) {
515 for (
int s = num_shapes - 1; s >= 0; --s) {
516 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
517 if (font == active_fonts[f]) {
541 shape_map.
SetMap(shape_id,
true);
546 it.
Init(&shape_map, &shape_table,
false, &samples_);
551 for (
int i = sample_ptrs.
size() - 1; i >= 0; --i) {
553 uint32_t num_features =
sample->num_micro_features();
554 for (uint32_t f = 0; f < num_features; ++f)
558 *num_samples = sample_id;
570 const char* inttemp_file,
571 const char* pffmtable_file) {
574 fontinfo_table_.
MoveTo(&classify->get_fontinfo_table());
575 INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
577 FILE* fp = fopen(inttemp_file,
"wb");
579 tprintf(
"Error, failed to open file \"%s\"\n", inttemp_file);
581 classify->WriteIntTemplates(fp, int_templates, shape_set);
594 for (
int i = 0; i < int_templates->
NumClasses; ++i) {
598 uint16_t max_length = 0;
599 for (
int config_id = 0; config_id < Class->
NumConfigs; config_id++) {
603 if (length > max_length)
605 int shape_id = float_classes[i].
font_set.
get(config_id);
607 for (
int c = 0; c < shape.
size(); ++c) {
608 int unichar_id = shape[c].unichar_id;
609 if (length > unichar_cutoffs[unichar_id])
610 unichar_cutoffs[unichar_id] = length;
613 shapetable_cutoffs.
push_back(max_length);
615 fp = fopen(pffmtable_file,
"wb");
617 tprintf(
"Error, failed to open file \"%s\"\n", pffmtable_file);
622 if (strcmp(unichar,
" ") == 0) {
625 fprintf(fp,
"%s %d\n", unichar, unichar_cutoffs[c]);
636 const char* unichar_str2) {
639 if (class_id2 == INVALID_UNICHAR_ID)
640 class_id2 = class_id1;
641 if (class_id1 == INVALID_UNICHAR_ID) {
642 tprintf(
"No unicharset entry found for %s\n", unichar_str1);
645 tprintf(
"Font ambiguities for unichar %d = %s and %d = %s\n",
646 class_id1, unichar_str1, class_id2, unichar_str2);
648 int num_fonts = samples_.
NumFonts();
653 for (
int f = 0; f < num_fonts; ++f) {
659 for (
int f1 = 0; f1 < num_fonts; ++f1) {
664 for (
int f2 = 0; f2 < num_fonts; ++f2) {
675 for (
int f = 0; f < num_fonts; ++f) {
678 if (class_id1 != class_id2 &&
684 #ifndef GRAPHICS_DISABLED
696 const char* unichar_str2,
697 int canonical_font) {
704 if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
707 for (uint32_t f = 0; f <
sample->num_features(); ++f) {
712 if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
714 for (
int f = 0; f < cloud.
size(); ++f) {
732 if (feature_index >= 0) {
746 #endif // GRAPHICS_DISABLED
752 sample_it.
Init(
nullptr,
nullptr, replicate_samples, &samples_);
755 page_images_, &sample_it);
762 bool replicate_samples,
765 TestClassifier(error_mode, report_level, replicate_samples, &samples_,
766 test_classifier, report_string);
784 bool replicate_samples,
789 sample_it.
Init(
nullptr,
nullptr, replicate_samples, samples);
790 if (report_level > 0) {
794 tprintf(
"Iterator has charset size of %d/%d, %d shapes, %d samples\n",
797 tprintf(
"Testing %sREPLICATED:\n", replicate_samples ?
"" :
"NON-");
799 double unichar_error = 0.0;
801 error_mode, fontinfo_table_,
802 page_images_, &sample_it, &unichar_error,
803 nullptr, report_string);
804 return unichar_error;
813 int num_chars1 = shape1.
size();
814 int num_chars2 = shape2.
size();
815 float dist_sum = 0.0f;
817 if (num_chars1 > 1 || num_chars2 > 1) {
820 for (
int c1 = 0; c1 < num_chars1; ++c1) {
821 for (
int c2 = 0; c2 < num_chars2; ++c2) {
834 return dist_sum / dist_count;
839 void MasterTrainer::ReplaceFragmentedSamples() {
840 if (fragments_ ==
nullptr)
return;
844 for (
int s = 0; s < num_samples; ++s) {
846 if (fragments_[
sample->class_id()] > 0)
858 bool* good_junk =
new bool[frag_set.
size()];
859 memset(good_junk, 0,
sizeof(*good_junk) * frag_set.
size());
860 for (
int dead_ch = 1; dead_ch < unicharset_.
size(); ++dead_ch) {
861 int frag_ch = fragments_[dead_ch];
862 if (frag_ch <= 0)
continue;
866 for (
int part = 0; part < frag->
get_total(); ++part) {
869 if (good_ch != INVALID_UNICHAR_ID)
870 good_junk[good_ch] =
true;
878 for (
int s = 0; s < num_junks; ++s) {
880 int junk_id =
sample->class_id();
896 delete [] fragments_;
897 fragments_ =
nullptr;
906 void MasterTrainer::ClusterShapes(
int min_shapes,
int max_shape_unichars,
909 int max_merges = num_shapes - min_shapes;
915 tprintf(
"Computing shape distances...");
916 for (
int s1 = 0; s1 < num_shapes; ++s1) {
917 for (
int s2 = s1 + 1; s2 < num_shapes; ++s2) {
919 shape_dists[s1].push_back(dist);
930 while (num_merged < max_merges && min_dist < max_dist) {
931 tprintf(
"Distance = %f: ", min_dist);
933 shape_dists[min_s1][min_s2 - min_s1 - 1].distance =
kInfiniteDist;
934 if (num_unichars > max_shape_unichars) {
935 tprintf(
"Merge of %d and %d with %d would exceed max of %d unichars\n",
936 min_s1, min_s2, num_unichars, max_shape_unichars);
939 shape_dists[min_s2].clear();
942 for (
int s = 0; s < min_s1; ++s) {
943 if (!shape_dists[s].empty()) {
944 shape_dists[s][min_s1 - s - 1].distance =
949 for (
int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
951 shape_dists[min_s1][s2 - min_s1 - 1].distance =
954 for (
int s = min_s1 + 1; s < min_s2; ++s) {
955 if (!shape_dists[s].empty()) {
961 for (
int s1 = 0; s1 < num_shapes; ++s1) {
962 for (
int i = 0; i < shape_dists[s1].size(); ++i) {
963 if (shape_dists[s1][i].
distance < min_dist) {
964 min_dist = shape_dists[s1][i].distance;
971 tprintf(
"Stopped with %d merged, min dist %f\n", num_merged, min_dist);
972 delete [] shape_dists;
973 if (debug_level_ > 1) {
974 for (
int s1 = 0; s1 < num_shapes; ++s1) {