17 #include "allheaders.h"
39 TrainingSampleSet::FontClassInfo::FontClassInfo()
40 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
44 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp)
const {
45 if (fwrite(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1)
47 if (fwrite(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1)
49 if (fwrite(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1)
return false;
50 if (!samples.Serialize(fp))
return false;
55 bool TrainingSampleSet::FontClassInfo::DeSerialize(
bool swap, FILE* fp) {
56 if (fread(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1)
58 if (fread(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1)
60 if (fread(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1)
return false;
61 if (!samples.DeSerialize(swap, fp))
return false;
63 ReverseN(&num_raw_samples,
sizeof(num_raw_samples));
64 ReverseN(&canonical_sample,
sizeof(canonical_sample));
65 ReverseN(&canonical_dist,
sizeof(canonical_dist));
71 : num_raw_samples_(0), unicharset_size_(0),
72 font_class_array_(
NULL), fontinfo_table_(font_table) {
76 delete font_class_array_;
81 if (!samples_.Serialize(fp))
return false;
83 if (!font_id_map_.
Serialize(fp))
return false;
84 inT8 not_null = font_class_array_ !=
NULL;
85 if (fwrite(¬_null,
sizeof(not_null), 1, fp) != 1)
return false;
95 if (!samples_.DeSerialize(swap, fp))
return false;
96 num_raw_samples_ = samples_.size();
98 if (!font_id_map_.
DeSerialize(swap, fp))
return false;
99 if (font_class_array_ !=
NULL) {
100 delete font_class_array_;
101 font_class_array_ =
NULL;
104 if (fread(¬_null,
sizeof(not_null), 1, fp) != 1)
return false;
110 unicharset_size_ = unicharset_.
size();
117 tprintf(
"Failed to load unicharset from file %s\n"
118 "Building unicharset from scratch...\n",
125 unicharset_size_ = unicharset_.
size();
135 tprintf(
"Error: Size of unicharset in TrainingSampleSet::AddSample is "
136 "greater than MAX_NUM_CLASSES\n");
149 samples_.push_back(sample);
150 num_raw_samples_ = samples_.size();
151 unicharset_size_ = unicharset_.
size();
159 bool randomize)
const {
161 if (font_id < 0 || class_id < 0 ||
162 font_id >= font_id_map_.
SparseSize() || class_id >= unicharset_size_) {
170 return (*font_class_array_)(font_index, class_id).samples.size();
177 return samples_[index];
186 if (font_index < 0)
return NULL;
187 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
188 return samples_[sample_index];
197 if (font_index < 0)
return NULL;
198 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
199 return samples_[sample_index];
214 int font_id,
int class_id)
const {
217 return (*font_class_array_)(font_index, class_id).cloud_features;
222 int font_id,
int class_id)
const {
225 return (*font_class_array_)(font_index, class_id).canonical_features;
240 double dist_sum = 0.0;
245 for (
int i = 0; i < num_fonts1; ++i) {
247 for (
int j = 0; j < num_fonts2; ++j) {
255 }
else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
257 for (
int i = 0; i < num_fonts1; ++i) {
259 for (
int j = 0; j < num_fonts2; ++j) {
263 tprintf(
"Cluster dist %d %d %d %d = %g\n",
273 int increment = kPrime1 != num_fonts2 ? kPrime1 :
kPrime2;
276 for (
int i = 0; i <
num_samples; ++i, index += increment) {
277 int f1 = uf1.
font_ids[i % num_fonts1];
278 int f2 = uf2.
font_ids[index % num_fonts2];
280 tprintf(
"Cluster dist %d %d %d %d = %g\n",
287 if (dist_count == 0) {
292 return dist_sum / dist_count;
299 int font_id2,
int class_id2,
304 if (font_index1 < 0 || font_index2 < 0)
306 FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
307 if (font_id1 == font_id2) {
309 if (fc_info.unichar_distance_cache.size() == 0)
310 fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
311 if (fc_info.unichar_distance_cache[class_id2] < 0) {
316 fc_info.unichar_distance_cache[class_id2] = result;
318 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
319 if (fc_info2.unichar_distance_cache.size() == 0)
320 fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
321 fc_info2.unichar_distance_cache[class_id1] = result;
323 return fc_info.unichar_distance_cache[class_id2];
324 }
else if (class_id1 == class_id2) {
326 if (fc_info.font_distance_cache.size() == 0)
327 fc_info.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
329 if (fc_info.font_distance_cache[font_index2] < 0) {
334 fc_info.font_distance_cache[font_index2] = result;
336 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
337 if (fc_info2.font_distance_cache.size() == 0)
338 fc_info2.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
340 fc_info2.font_distance_cache[font_index1] = result;
342 return fc_info.font_distance_cache[font_index2];
347 while (cache_index < fc_info.distance_cache.size() &&
348 (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
349 fc_info.distance_cache[cache_index].font_id != font_id2))
351 if (cache_index == fc_info.distance_cache.size()) {
356 FontClassDistance fc_dist = { class_id2, font_id2, result };
357 fc_info.distance_cache.push_back(fc_dist);
360 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
361 fc_dist.unichar_id = class_id1;
362 fc_dist.font_id = font_id1;
363 fc_info2.distance_cache.push_back(fc_dist);
365 return fc_info.distance_cache[cache_index].distance;
370 int font_id1,
int class_id1,
int font_id2,
int class_id2,
378 return static_cast<float>(dist) / denominator;
384 static void AddNearFeatures(
const IntFeatureMap& feature_map,
int f,
int levels,
386 int prev_num_features = 0;
388 int num_features = 1;
389 for (
int level = 0; level < levels; ++level) {
390 for (
int i = prev_num_features; i < num_features; ++i) {
391 int feature = (*good_features)[i];
392 for (
int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
393 if (dir == 0)
continue;
400 prev_num_features = num_features;
401 num_features = good_features->
size();
416 int font_id2,
int class_id2,
418 bool thorough)
const {
426 if (cloud1.
size() == 0)
427 return canonical2.
size();
430 for (
int f = 0; f < canonical2.
size(); ++f) {
431 int feature = canonical2[f];
436 AddNearFeatures(feature_map, feature, 1, &good_features);
439 for (i = 0; i < good_features.
size(); ++i) {
440 int good_f = good_features[i];
441 if (cloud1[good_f]) {
445 if (i < good_features.
size())
458 if (font_index < 0)
return -1;
459 return (*font_class_array_)(font_index, class_id).samples[index];
465 int font_id,
int class_id)
const {
468 if (font_index < 0)
return NULL;
469 int sample_index = (*font_class_array_)(font_index,
470 class_id).canonical_sample;
471 return sample_index >= 0 ? samples_[sample_index] :
NULL;
479 if (font_index < 0)
return 0.0f;
480 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
481 return (*font_class_array_)(font_index, class_id).canonical_dist;
488 for (
int s = 0; s < samples_.size(); ++s)
496 if (font_class_array_ ==
NULL)
500 pixa = pixaCreate(0);
502 int fs_size = feature_space.
Size();
504 for (
int font_index = 0; font_index < font_size; ++font_index) {
505 for (
int c = 0; c < unicharset_size_; ++c) {
509 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
510 int sample_count = fcinfo.samples.size();
511 if (sample_count < kMinOutlierSamples)
513 for (
int i = 0; i < sample_count; ++i) {
514 int s = fcinfo.samples[i];
516 for (
int f = 0; f < features.
size(); ++f) {
517 ++feature_counts[features[f]];
520 for (
int i = 0; i < sample_count; ++i) {
521 int s = fcinfo.samples[i];
526 int good_features = 0;
527 int bad_features = 0;
528 for (
int f = 0; f < features.
size(); ++f) {
529 if (feature_counts[features[f]] > 1)
535 if (bad_features * 2 > good_features) {
536 tprintf(
"Deleting outlier sample of %s, %d good, %d bad\n",
538 good_features, bad_features);
540 pixaAddPix(pixa, sample.
RenderToPix(&unicharset_), L_INSERT);
545 t = fcinfo.samples[1];
547 t = fcinfo.samples[i - 1];
549 pixaAddPix(pixa, csample.
RenderToPix(&unicharset_), L_INSERT);
560 Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
562 pixWrite(
"outliers.png", pix, IFF_PNG);
577 num_raw_samples_ = samples_.size();
587 static Pix* DebugSample(
const UNICHARSET& unicharset,
589 tprintf(
"\nOriginal features:\n");
594 tprintf(
"\nMapped features:\n");
608 int compact_font_size = font_id_map_.
CompactSize();
610 if (font_class_array_ !=
NULL)
611 delete font_class_array_;
614 compact_font_size, unicharset_size_, empty);
615 for (
int s = 0; s < samples_.size(); ++s) {
616 int font_id = samples_[s]->font_id();
617 int class_id = samples_[s]->class_id();
618 if (font_id < 0 || font_id >= font_id_map_.
SparseSize()) {
619 tprintf(
"Font id = %d/%d, class id = %d/%d on sample %d\n",
620 font_id, font_id_map_.
SparseSize(), class_id, unicharset_size_,
624 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
626 (*font_class_array_)(font_index, class_id).samples.push_back(s);
630 for (
int f = 0; f < compact_font_size; ++f) {
631 for (
int c = 0; c < unicharset_size_; ++c)
633 (*font_class_array_)(f, c).samples.size();
637 num_raw_samples_ = samples_.size();
645 for (
int s = 0; s < samples_.size(); ++s) {
646 int font_id = samples_[s]->font_id();
647 while (font_id >= font_counts.
size())
649 ++font_counts[font_id];
651 font_id_map_.
Init(font_counts.
size(),
false);
652 for (
int f = 0; f < font_counts.
size(); ++f) {
653 font_id_map_.
SetMap(f, font_counts[f] > 0);
655 font_id_map_.
Setup();
670 double global_worst_dist = 0.0;
673 for (
int font_index = 0; font_index < font_size; ++font_index) {
675 for (
int c = 0; c < unicharset_size_; ++c) {
676 int samples_found = 0;
677 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
678 if (fcinfo.samples.size() == 0 ||
680 fcinfo.canonical_sample = -1;
681 fcinfo.canonical_dist = 0.0f;
682 if (debug)
tprintf(
"Skipping class %d\n", c);
687 double min_max_dist = 2.0;
690 double max_max_dist = 0.0;
693 fcinfo.canonical_sample = fcinfo.samples[0];
694 fcinfo.canonical_dist = 0.0f;
695 for (
int i = 0; i < fcinfo.samples.size(); ++i) {
696 int s1 = fcinfo.samples[i];
698 f_table.
Set(features1, features1.
size(),
true);
699 double max_dist = 0.0;
704 for (
int j = 0; j < fcinfo.samples.size(); ++j) {
705 int s2 = fcinfo.samples[j];
706 if (samples_[s2]->class_id() != c ||
707 samples_[s2]->font_id() != font_id ||
712 if (dist > max_dist) {
714 if (dist > max_max_dist) {
722 f_table.
Set(features1, features1.
size(),
false);
723 samples_[s1]->set_max_dist(max_dist);
725 if (max_dist < min_max_dist) {
726 fcinfo.canonical_sample = s1;
727 fcinfo.canonical_dist = max_dist;
729 UpdateRange(max_dist, &min_max_dist, &max_max_dist);
731 if (max_max_dist > global_worst_dist) {
733 global_worst_dist = max_max_dist;
738 tprintf(
"Found %d samples of class %d=%s, font %d, "
739 "dist range [%g, %g], worst pair= %s, %s\n",
741 font_index, min_max_dist, max_max_dist,
748 tprintf(
"Global worst dist = %g, between sample %d and %d\n",
749 global_worst_dist, worst_s1, worst_s2);
750 Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
751 Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
752 pixOr(pix1, pix1, pix2);
753 pixWrite(
"worstpair.png", pix1, IFF_PNG);
767 for (
int font_index = 0; font_index < font_size; ++font_index) {
768 for (
int c = 0; c < unicharset_size_; ++c) {
769 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
770 int sample_count = fcinfo.samples.size();
771 int min_samples = 2 *
MAX(kSampleRandomSize, sample_count);
772 if (sample_count > 0 && sample_count < min_samples) {
773 int base_count = sample_count;
774 for (
int base_index = 0; sample_count < min_samples; ++sample_count) {
775 int src_index = fcinfo.samples[base_index++];
776 if (base_index >= base_count) base_index = 0;
778 sample_count % kSampleRandomSize);
779 int sample_index = samples_.size();
781 samples_.push_back(sample);
782 fcinfo.samples.push_back(sample_index);
796 for (
int font_index = 0; font_index < font_size; ++font_index) {
798 for (
int c = 0; c < unicharset_size_; ++c) {
800 if (num_samples == 0)
803 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
814 for (
int font_index = 0; font_index < font_size; ++font_index) {
816 for (
int c = 0; c < unicharset_size_; ++c) {
818 if (num_samples == 0)
820 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
821 fcinfo.cloud_features.Init(feature_space_size);
825 for (
int i = 0; i < sample_features.
size(); ++i)
826 fcinfo.cloud_features.SetBit(sample_features[i]);
834 for (
int f = 0; f < font_id_map_.
CompactSize(); ++f) {
853 for (
int f = 0; f < indexed_features.
size(); ++f) {
854 if (indexed_features[f] == f_index) {
void Init(int size, bool all_mapped)
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
STRING debug_str(UNICHAR_ID id) const
bool SerializeClasses(FILE *fp) const
bool Serialize(FILE *fp) const
Pix * RenderToPix(const UNICHARSET *unicharset) const
void OrganizeByFontAndClass()
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool save_to_file(const char *const filename) const
virtual int SparseSize() const
void ComputeCloudFeatures(int feature_space_size)
const INT_FEATURE_STRUCT * features() const
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
const TrainingSample * GetSample(int index) const
const GenericVector< int > & mapped_features() const
void ReplicateAndRandomizeSamples()
bool load_from_file(const char *const filename, bool skip_fragments)
void SetMap(int sparse_index, bool mapped)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
void AddToShape(int unichar_id, int font_id)
bool ContainsUnichar(int unichar_id) const
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, GenericVector< int > *sorted_features) const
int OffsetFeature(int index_feature, int dir) const
void KillSample(TrainingSample *sample)
void IndexFeatures(const IntFeatureSpace &feature_space)
void AppendOtherUnicharset(const UNICHARSET &src)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
TrainingSample * MutableSample(int font_id, int class_id, int index)
void ComputeCanonicalFeatures()
const GenericVector< int > & indexed_features() const
void set_sample_index(int value)
void Init(const IntFeatureMap *feature_map)
bool Serialize(FILE *fp) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
const char *const id_to_unichar(UNICHAR_ID id) const
int GlobalSampleIndex(int font_id, int class_id, int index) const
void init_to_size(int size, T t)
virtual int SparseToCompact(int sparse_index) const
void LoadUnicharset(const char *filename)
bool DeleteableSample(const TrainingSample *sample)
void set_class_id(int id)
int NumClassSamples(int font_id, int class_id, bool randomize) const
const int kMinOutlierSamples
float GetCanonicalDist(int font_id, int class_id) const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
double FeatureDistance(const GenericVector< int > &features) const
GenericVector< inT32 > font_ids
void unichar_insert(const char *const unichar_repr)
bool DeSerializeClasses(bool swap, FILE *fp)
bool features_are_mapped() const
void DeleteOutliers(const IntFeatureSpace &feature_space, bool debug)
void ReverseN(void *ptr, int num_bytes)
int num_raw_samples() const
UNICHAR_ID class_id() const
void AddAllFontsForClass(int class_id, Shape *shape) const
int AddSample(const char *unichar, TrainingSample *sample)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
int CompactToSparse(int compact_index) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
bool contains_unichar(const char *const unichar_repr) const
STRING SampleToString(const TrainingSample &sample) const
void DisplayFeatures(ScrollView::Color color, ScrollView *window) const
const TBOX & bounding_box() const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
const char * string() const
bool DeSerialize(bool swap, FILE *fp)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
bool DeSerialize(bool swap, FILE *fp)