17 #include "allheaders.h" 
   39 TrainingSampleSet::FontClassInfo::FontClassInfo()
 
   40   : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
 
   44 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp)
 const {
 
   45   if (fwrite(&num_raw_samples, 
sizeof(num_raw_samples), 1, fp) != 1)
 
   47   if (fwrite(&canonical_sample, 
sizeof(canonical_sample), 1, fp) != 1)
 
   49   if (fwrite(&canonical_dist, 
sizeof(canonical_dist), 1, fp) != 1) 
return false;
 
   50   if (!samples.Serialize(fp)) 
return false;
 
   55 bool TrainingSampleSet::FontClassInfo::DeSerialize(
bool swap, FILE* fp) {
 
   56   if (fread(&num_raw_samples, 
sizeof(num_raw_samples), 1, fp) != 1)
 
   58   if (fread(&canonical_sample, 
sizeof(canonical_sample), 1, fp) != 1)
 
   60   if (fread(&canonical_dist, 
sizeof(canonical_dist), 1, fp) != 1) 
return false;
 
   61   if (!samples.DeSerialize(swap, fp)) 
return false;
 
   63     ReverseN(&num_raw_samples, 
sizeof(num_raw_samples));
 
   64     ReverseN(&canonical_sample, 
sizeof(canonical_sample));
 
   65     ReverseN(&canonical_dist, 
sizeof(canonical_dist));
 
   71   : num_raw_samples_(0), unicharset_size_(0),
 
   72     font_class_array_(
NULL), fontinfo_table_(font_table) {
 
   76   delete font_class_array_;
 
   81   if (!samples_.Serialize(fp)) 
return false;
 
   83   if (!font_id_map_.
Serialize(fp)) 
return false;
 
   84   inT8 not_null = font_class_array_ != 
NULL;
 
   85   if (fwrite(&not_null, 
sizeof(not_null), 1, fp) != 1) 
return false;
 
   95   if (!samples_.DeSerialize(swap, fp)) 
return false;
 
   96   num_raw_samples_ = samples_.size();
 
   98   if (!font_id_map_.
DeSerialize(swap, fp)) 
return false;
 
   99   if (font_class_array_ != 
NULL) {
 
  100     delete font_class_array_;
 
  101     font_class_array_ = 
NULL;
 
  104   if (fread(&not_null, 
sizeof(not_null), 1, fp) != 1) 
return false;
 
  110   unicharset_size_ = unicharset_.
size();
 
  117     tprintf(
"Failed to load unicharset from file %s\n" 
  118             "Building unicharset from scratch...\n",
 
  125   unicharset_size_ = unicharset_.
size();
 
  135       tprintf(
"Error: Size of unicharset in TrainingSampleSet::AddSample is " 
  136               "greater than MAX_NUM_CLASSES\n");
 
  149   samples_.push_back(sample);
 
  150   num_raw_samples_ = samples_.size();
 
  151   unicharset_size_ = unicharset_.
size();
 
  159                                        bool randomize)
 const {
 
  161   if (font_id < 0 || class_id < 0 ||
 
  162       font_id >= font_id_map_.
SparseSize() || class_id >= unicharset_size_) {
 
  170     return (*font_class_array_)(font_index, class_id).samples.size();
 
  177   return samples_[index];
 
  186   if (font_index < 0) 
return NULL;
 
  187   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
 
  188   return samples_[sample_index];
 
  197   if (font_index < 0) 
return NULL;
 
  198   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
 
  199   return samples_[sample_index];
 
  214     int font_id, 
int class_id)
 const {
 
  217   return (*font_class_array_)(font_index, class_id).cloud_features;
 
  222     int font_id, 
int class_id)
 const {
 
  225   return (*font_class_array_)(font_index, class_id).canonical_features;
 
  240   double dist_sum = 0.0;
 
  245     for (
int i = 0; i < num_fonts1; ++i) {
 
  247       for (
int j = 0; j < num_fonts2; ++j) {
 
  255   } 
else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
 
  257     for (
int i = 0; i < num_fonts1; ++i) {
 
  259       for (
int j = 0; j < num_fonts2; ++j) {
 
  263             tprintf(
"Cluster dist %d %d %d %d = %g\n",
 
  273     int increment = kPrime1 != num_fonts2 ? kPrime1 : 
kPrime2;
 
  276     for (
int i = 0; i < 
num_samples; ++i, index += increment) {
 
  277       int f1 = uf1.
font_ids[i % num_fonts1];
 
  278       int f2 = uf2.
font_ids[index % num_fonts2];
 
  280           tprintf(
"Cluster dist %d %d %d %d = %g\n",
 
  287   if (dist_count == 0) {
 
  292   return dist_sum / dist_count;
 
  299                                          int font_id2, 
int class_id2,
 
  304   if (font_index1 < 0 || font_index2 < 0)
 
  306   FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
 
  307   if (font_id1 == font_id2) {
 
  309     if (fc_info.unichar_distance_cache.size() == 0)
 
  310       fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
 
  311     if (fc_info.unichar_distance_cache[class_id2] < 0) {
 
  316       fc_info.unichar_distance_cache[class_id2] = result;
 
  318       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
 
  319       if (fc_info2.unichar_distance_cache.size() == 0)
 
  320         fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
 
  321       fc_info2.unichar_distance_cache[class_id1] = result;
 
  323     return fc_info.unichar_distance_cache[class_id2];
 
  324   } 
else if (class_id1 == class_id2) {
 
  326     if (fc_info.font_distance_cache.size() == 0)
 
  327       fc_info.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
 
  329     if (fc_info.font_distance_cache[font_index2] < 0) {
 
  334       fc_info.font_distance_cache[font_index2] = result;
 
  336       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
 
  337       if (fc_info2.font_distance_cache.size() == 0)
 
  338         fc_info2.font_distance_cache.init_to_size(font_id_map_.
CompactSize(),
 
  340       fc_info2.font_distance_cache[font_index1] = result;
 
  342     return fc_info.font_distance_cache[font_index2];
 
  347   while (cache_index < fc_info.distance_cache.size() &&
 
  348          (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
 
  349           fc_info.distance_cache[cache_index].font_id != font_id2))
 
  351   if (cache_index == fc_info.distance_cache.size()) {
 
  356     FontClassDistance fc_dist = { class_id2, font_id2, result };
 
  357     fc_info.distance_cache.push_back(fc_dist);
 
  360     FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
 
  361     fc_dist.unichar_id = class_id1;
 
  362     fc_dist.font_id = font_id1;
 
  363     fc_info2.distance_cache.push_back(fc_dist);
 
  365   return fc_info.distance_cache[cache_index].distance;
 
  370     int font_id1, 
int class_id1, 
int font_id2, 
int class_id2,
 
  378   return static_cast<float>(dist) / denominator;
 
  384 static void AddNearFeatures(
const IntFeatureMap& feature_map, 
int f, 
int levels,
 
  386   int prev_num_features = 0;
 
  388   int num_features = 1;
 
  389   for (
int level = 0; level < levels; ++level) {
 
  390     for (
int i = prev_num_features; i < num_features; ++i) {
 
  391       int feature = (*good_features)[i];
 
  392       for (
int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
 
  393         if (dir == 0) 
continue;
 
  400     prev_num_features = num_features;
 
  401     num_features = good_features->
size();
 
  416                                          int font_id2, 
int class_id2,
 
  418                                          bool thorough)
 const {
 
  426   if (cloud1.
size() == 0)
 
  427     return canonical2.
size();  
 
  430   for (
int f = 0; f < canonical2.
size(); ++f) {
 
  431     int feature = canonical2[f];
 
  436     AddNearFeatures(feature_map, feature, 1, &good_features);
 
  439     for (i = 0; i < good_features.
size(); ++i) {
 
  440       int good_f = good_features[i];
 
  441       if (cloud1[good_f]) {
 
  445     if (i < good_features.
size())
 
  458   if (font_index < 0) 
return -1;
 
  459   return (*font_class_array_)(font_index, class_id).samples[index];
 
  465     int font_id, 
int class_id)
 const {
 
  468   if (font_index < 0) 
return NULL;
 
  469   int sample_index = (*font_class_array_)(font_index,
 
  470                                           class_id).canonical_sample;
 
  471   return sample_index >= 0 ? samples_[sample_index] : 
NULL;
 
  479   if (font_index < 0) 
return 0.0f;
 
  480   if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
 
  481     return (*font_class_array_)(font_index, class_id).canonical_dist;
 
  488   for (
int s = 0; s < samples_.size(); ++s)
 
  496   if (font_class_array_ == 
NULL)
 
  500     pixa = pixaCreate(0);
 
  502   int fs_size = feature_space.
Size();
 
  504   for (
int font_index = 0; font_index < font_size; ++font_index) {
 
  505     for (
int c = 0; c < unicharset_size_; ++c) {
 
  509       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
 
  510       int sample_count = fcinfo.samples.size();
 
  511       if (sample_count < kMinOutlierSamples)
 
  513       for (
int i = 0; i < sample_count; ++i) {
 
  514         int s = fcinfo.samples[i];
 
  516         for (
int f = 0; f < features.
size(); ++f) {
 
  517           ++feature_counts[features[f]];
 
  520       for (
int i = 0; i < sample_count; ++i) {
 
  521         int s = fcinfo.samples[i];
 
  526         int good_features = 0;
 
  527         int bad_features = 0;
 
  528         for (
int f = 0; f < features.
size(); ++f) {
 
  529           if (feature_counts[features[f]] > 1)
 
  535         if (bad_features * 2 > good_features) {
 
  536           tprintf(
"Deleting outlier sample of %s, %d good, %d bad\n",
 
  538                   good_features, bad_features);
 
  540             pixaAddPix(pixa, sample.
RenderToPix(&unicharset_), L_INSERT);
 
  545               t = fcinfo.samples[1];
 
  547               t = fcinfo.samples[i - 1];
 
  549             pixaAddPix(pixa, csample.
RenderToPix(&unicharset_), L_INSERT);
 
  560     Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
 
  562     pixWrite(
"outliers.png", pix, IFF_PNG);
 
  577   num_raw_samples_ = samples_.size();
 
  587 static Pix* DebugSample(
const UNICHARSET& unicharset,
 
  589   tprintf(
"\nOriginal features:\n");
 
  594     tprintf(
"\nMapped features:\n");
 
  608   int compact_font_size = font_id_map_.
CompactSize();
 
  610   if (font_class_array_ != 
NULL)
 
  611     delete font_class_array_;
 
  614       compact_font_size, unicharset_size_, empty);
 
  615   for (
int s = 0; s < samples_.size(); ++s) {
 
  616     int font_id = samples_[s]->font_id();
 
  617     int class_id = samples_[s]->class_id();
 
  618     if (font_id < 0 || font_id >= font_id_map_.
SparseSize()) {
 
  619       tprintf(
"Font id = %d/%d, class id = %d/%d on sample %d\n",
 
  620               font_id, font_id_map_.
SparseSize(), class_id, unicharset_size_,
 
  624     ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
 
  626     (*font_class_array_)(font_index, class_id).samples.push_back(s);
 
  630   for (
int f = 0; f < compact_font_size; ++f) {
 
  631     for (
int c = 0; c < unicharset_size_; ++c)
 
  633           (*font_class_array_)(f, c).samples.size();
 
  637   num_raw_samples_ = samples_.size();
 
  645   for (
int s = 0; s < samples_.size(); ++s) {
 
  646     int font_id = samples_[s]->font_id();
 
  647     while (font_id >= font_counts.
size())
 
  649     ++font_counts[font_id];
 
  651   font_id_map_.
Init(font_counts.
size(), 
false);
 
  652   for (
int f = 0; f < font_counts.
size(); ++f) {
 
  653     font_id_map_.
SetMap(f, font_counts[f] > 0);
 
  655   font_id_map_.
Setup();
 
  670   double global_worst_dist = 0.0;
 
  673   for (
int font_index = 0; font_index < font_size; ++font_index) {
 
  675     for (
int c = 0; c < unicharset_size_; ++c) {
 
  676       int samples_found = 0;
 
  677       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
 
  678       if (fcinfo.samples.size() == 0 ||
 
  680         fcinfo.canonical_sample = -1;
 
  681         fcinfo.canonical_dist = 0.0f;
 
  682         if (debug) 
tprintf(
"Skipping class %d\n", c);
 
  687       double min_max_dist = 2.0;
 
  690       double max_max_dist = 0.0;
 
  693       fcinfo.canonical_sample = fcinfo.samples[0];
 
  694       fcinfo.canonical_dist = 0.0f;
 
  695       for (
int i = 0; i < fcinfo.samples.size(); ++i) {
 
  696         int s1 = fcinfo.samples[i];
 
  698         f_table.
Set(features1, features1.
size(), 
true);
 
  699         double max_dist = 0.0;
 
  704         for (
int j = 0; j < fcinfo.samples.size(); ++j) {
 
  705           int s2 = fcinfo.samples[j];
 
  706           if (samples_[s2]->class_id() != c  ||
 
  707               samples_[s2]->font_id() != font_id ||
 
  712           if (dist > max_dist) {
 
  714             if (dist > max_max_dist) {
 
  722         f_table.
Set(features1, features1.
size(), 
false);
 
  723         samples_[s1]->set_max_dist(max_dist);
 
  725         if (max_dist < min_max_dist) {
 
  726           fcinfo.canonical_sample = s1;
 
  727           fcinfo.canonical_dist = max_dist;
 
  729         UpdateRange(max_dist, &min_max_dist, &max_max_dist);
 
  731       if (max_max_dist > global_worst_dist) {
 
  733         global_worst_dist = max_max_dist;
 
  738         tprintf(
"Found %d samples of class %d=%s, font %d, " 
  739                 "dist range [%g, %g], worst pair= %s, %s\n",
 
  741                 font_index, min_max_dist, max_max_dist,
 
  748     tprintf(
"Global worst dist = %g, between sample %d and %d\n",
 
  749             global_worst_dist, worst_s1, worst_s2);
 
  750     Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
 
  751     Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
 
  752     pixOr(pix1, pix1, pix2);
 
  753     pixWrite(
"worstpair.png", pix1, IFF_PNG);
 
  767   for (
int font_index = 0; font_index < font_size; ++font_index) {
 
  768     for (
int c = 0; c < unicharset_size_; ++c) {
 
  769       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
 
  770       int sample_count = fcinfo.samples.size();
 
  771       int min_samples = 2 * 
MAX(kSampleRandomSize, sample_count);
 
  772       if (sample_count > 0 && sample_count < min_samples) {
 
  773         int base_count = sample_count;
 
  774         for (
int base_index = 0; sample_count < min_samples; ++sample_count) {
 
  775           int src_index = fcinfo.samples[base_index++];
 
  776           if (base_index >= base_count) base_index = 0;
 
  778               sample_count % kSampleRandomSize);
 
  779           int sample_index = samples_.size();
 
  781           samples_.push_back(sample);
 
  782           fcinfo.samples.push_back(sample_index);
 
  796   for (
int font_index = 0; font_index < font_size; ++font_index) {
 
  798     for (
int c = 0; c < unicharset_size_; ++c) {
 
  800       if (num_samples == 0)
 
  803       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
 
  814   for (
int font_index = 0; font_index < font_size; ++font_index) {
 
  816     for (
int c = 0; c < unicharset_size_; ++c) {
 
  818       if (num_samples == 0)
 
  820       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
 
  821       fcinfo.cloud_features.Init(feature_space_size);
 
  825         for (
int i = 0; i < sample_features.
size(); ++i)
 
  826           fcinfo.cloud_features.SetBit(sample_features[i]);
 
  834   for (
int f = 0; f < font_id_map_.
CompactSize(); ++f) {
 
  853       for (
int f = 0; f < indexed_features.
size(); ++f) {
 
  854         if (indexed_features[f] == f_index) {
 
void Init(int size, bool all_mapped)
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const 
STRING debug_str(UNICHAR_ID id) const 
bool SerializeClasses(FILE *fp) const 
bool Serialize(FILE *fp) const 
Pix * RenderToPix(const UNICHARSET *unicharset) const 
void OrganizeByFontAndClass()
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const 
bool save_to_file(const char *const filename) const 
virtual int SparseSize() const 
void ComputeCloudFeatures(int feature_space_size)
const INT_FEATURE_STRUCT * features() const 
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
const TrainingSample * GetSample(int index) const 
const GenericVector< int > & mapped_features() const 
void ReplicateAndRandomizeSamples()
bool load_from_file(const char *const filename, bool skip_fragments)
void SetMap(int sparse_index, bool mapped)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
void AddToShape(int unichar_id, int font_id)
bool ContainsUnichar(int unichar_id) const 
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, GenericVector< int > *sorted_features) const 
int OffsetFeature(int index_feature, int dir) const 
void KillSample(TrainingSample *sample)
void IndexFeatures(const IntFeatureSpace &feature_space)
void AppendOtherUnicharset(const UNICHARSET &src)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
TrainingSample * MutableSample(int font_id, int class_id, int index)
void ComputeCanonicalFeatures()
const GenericVector< int > & indexed_features() const 
void set_sample_index(int value)
void Init(const IntFeatureMap *feature_map)
bool Serialize(FILE *fp) const 
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const 
const char *const id_to_unichar(UNICHAR_ID id) const 
int GlobalSampleIndex(int font_id, int class_id, int index) const 
void init_to_size(int size, T t)
virtual int SparseToCompact(int sparse_index) const 
void LoadUnicharset(const char *filename)
bool DeleteableSample(const TrainingSample *sample)
void set_class_id(int id)
int NumClassSamples(int font_id, int class_id, bool randomize) const 
const int kMinOutlierSamples
float GetCanonicalDist(int font_id, int class_id) const 
const BitVector & GetCloudFeatures(int font_id, int class_id) const 
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
double FeatureDistance(const GenericVector< int > &features) const 
GenericVector< inT32 > font_ids
void unichar_insert(const char *const unichar_repr)
bool DeSerializeClasses(bool swap, FILE *fp)
bool features_are_mapped() const 
void DeleteOutliers(const IntFeatureSpace &feature_space, bool debug)
void ReverseN(void *ptr, int num_bytes)
int num_raw_samples() const 
UNICHAR_ID class_id() const 
void AddAllFontsForClass(int class_id, Shape *shape) const 
int AddSample(const char *unichar, TrainingSample *sample)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const 
int CompactToSparse(int compact_index) const 
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const 
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
bool contains_unichar(const char *const unichar_repr) const 
STRING SampleToString(const TrainingSample &sample) const 
void DisplayFeatures(ScrollView::Color color, ScrollView *window) const 
const TBOX & bounding_box() const 
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const 
const char * string() const 
bool DeSerialize(bool swap, FILE *fp)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
bool DeSerialize(bool swap, FILE *fp)