All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
commontraining.cpp File Reference
#include "commontraining.h"
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "freelist.h"
#include "globals.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "mf.h"
#include "ndminx.h"
#include "oldlist.h"
#include "params.h"
#include "shapetable.h"
#include "tessdatamanager.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"
#include <math.h>

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 INT_PARAM_FLAG (debug_level, 0,"Level of Trainer debugging")
 
 INT_PARAM_FLAG (load_images, 0,"Load images with tr files")
 
 STRING_PARAM_FLAG (configfile,"","File to load more configs from")
 
 STRING_PARAM_FLAG (D,"","Directory to write output files to")
 
 STRING_PARAM_FLAG (F,"font_properties","File listing font properties")
 
 STRING_PARAM_FLAG (X,"","File listing font xheights")
 
 STRING_PARAM_FLAG (U,"unicharset","File to load unicharset from")
 
 STRING_PARAM_FLAG (O,"","File to write unicharset to")
 
 STRING_PARAM_FLAG (T,"","File to load trainer from")
 
 STRING_PARAM_FLAG (output_trainer,"","File to write trainer to")
 
 STRING_PARAM_FLAG (test_ch,"","UTF8 test character string")
 
 DOUBLE_PARAM_FLAG (clusterconfig_min_samples_fraction, Config.MinSamples,"Min number of samples per proto as % of total")
 
 DOUBLE_PARAM_FLAG (clusterconfig_max_illegal, Config.MaxIllegal,"Max percentage of samples in a cluster which have more"" than 1 feature in that cluster")
 
 DOUBLE_PARAM_FLAG (clusterconfig_independence, Config.Independence,"Desired independence between dimensions")
 
 DOUBLE_PARAM_FLAG (clusterconfig_confidence, Config.Confidence,"Desired confidence in prototypes created")
 
void ParseArguments (int *argc, char ***argv)
 
ShapeTable * tesseract::LoadShapeTable (const STRING &file_prefix)
 
void tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
const char * GetNextFilename (int argc, const char *const *argv)
 
LABELEDLIST FindList (LIST List, char *Label)
 
LABELEDLIST NewLabeledList (const char *Label)
 
void ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
 
void FreeTrainingSamples (LIST CharList)
 
void FreeLabeledList (LABELEDLIST LabeledList)
 
CLUSTERER * SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
 
void MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void CleanUpUnusedData (LIST ProtoList)
 
LIST RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
 
MERGE_CLASS FindClass (LIST List, const char *Label)
 
MERGE_CLASS NewLabeledClass (const char *Label)
 
void FreeLabeledClassList (LIST ClassList)
 
CLASS_STRUCT * SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList)
 
void Normalize (float *Values)
 
void FreeNormProtoList (LIST CharList)
 
void AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName)
 
int NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
 

Variables

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }
 
FEATURE_DEFS_STRUCT feature_defs
 
CCUtil ccutil
 

Function Documentation

void AddToNormProtosList ( LIST *  NormProtoList,
LIST  ProtoList,
char *  CharName 
)

Definition at line 854 of file commontraining.cpp.

858 {
859  PROTOTYPE* Proto;
860  LABELEDLIST LabeledProtoList;
861 
862  LabeledProtoList = NewLabeledList(CharName);
863  iterate(ProtoList)
864  {
865  Proto = (PROTOTYPE *) first_node (ProtoList);
866  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
867  }
868  *NormProtoList = push(*NormProtoList, LabeledProtoList);
869 }
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
LABELEDLIST NewLabeledList(const char *Label)
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
void CleanUpUnusedData ( LIST  ProtoList)

Definition at line 606 of file commontraining.cpp.

608 {
609  PROTOTYPE* Prototype;
610 
611  iterate(ProtoList)
612  {
613  Prototype = (PROTOTYPE *) first_node (ProtoList);
614  if(Prototype->Variance.Elliptical != NULL)
615  {
616  memfree(Prototype->Variance.Elliptical);
617  Prototype->Variance.Elliptical = NULL;
618  }
619  if(Prototype->Magnitude.Elliptical != NULL)
620  {
621  memfree(Prototype->Magnitude.Elliptical);
622  Prototype->Magnitude.Elliptical = NULL;
623  }
624  if(Prototype->Weight.Elliptical != NULL)
625  {
626  memfree(Prototype->Weight.Elliptical);
627  Prototype->Weight.Elliptical = NULL;
628  }
629  }
630 }
void memfree(void *element)
Definition: freelist.cpp:30
FLOATUNION Variance
Definition: cluster.h:81
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
#define NULL
Definition: host.h:144
DOUBLE_PARAM_FLAG ( clusterconfig_min_samples_fraction  ,
Config.MinSamples  ,
"Min number of samples per proto as % of total"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_max_illegal  ,
Config.MaxIllegal  ,
"Max percentage of samples in a cluster which have more"" than 1 feature in that cluster"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_independence  ,
Config.Independence  ,
"Desired independence between dimensions"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_confidence  ,
Config.Confidence  ,
"Desired confidence in prototypes created"   
)
MERGE_CLASS FindClass ( LIST  List,
const char *  Label 
)

Definition at line 701 of file commontraining.cpp.

704 {
705  MERGE_CLASS MergeClass;
706 
707  iterate (List)
708  {
709  MergeClass = (MERGE_CLASS) first_node (List);
710  if (strcmp (MergeClass->Label, Label) == 0)
711  return (MergeClass);
712  }
713  return (NULL);
714 
715 } /* FindClass */
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
#define NULL
Definition: host.h:144
MERGE_CLASS_NODE * MERGE_CLASS
LABELEDLIST FindList ( LIST  List,
char *  Label 
)

This routine searches thru a list of labeled lists to find a list with the specified label. If a matching labeled list cannot be found, NULL is returned.

Parameters
List: list to search
Label: label to search for
Returns
Labeled list with the specified Label or NULL.
Note
Globals: none
Exceptions: none
History: Fri Aug 18 15:57:41 1989, DSJ, Created.

Definition at line 331 of file commontraining.cpp.

334 {
335  LABELEDLIST LabeledList;
336 
337  iterate (List)
338  {
339  LabeledList = (LABELEDLIST) first_node (List);
340  if (strcmp (LabeledList->Label, Label) == 0)
341  return (LabeledList);
342  }
343  return (NULL);
344 
345 } /* FindList */
struct LABELEDLISTNODE * LABELEDLIST
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
#define NULL
Definition: host.h:144
void FreeLabeledClassList ( LIST  ClassList)

This routine deallocates all of the space allocated to the specified list of labeled classes: for each entry it frees the label, the class, and the node itself, and then destroys the list.

Parameters
ClassList: list of all fonts in document
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 17:44:27 1989, DSJ, Created.

Definition at line 741 of file commontraining.cpp.

743 {
744  MERGE_CLASS MergeClass;
745 
746  iterate (ClassList) /* iterate thru all of the fonts */
747  {
748  MergeClass = (MERGE_CLASS) first_node (ClassList);
749  free (MergeClass->Label);
750  FreeClass(MergeClass->Class);
751  delete MergeClass;
752  }
753  destroy (ClassList);
754 
755 } /* FreeLabeledClassList */
#define first_node(l)
Definition: oldlist.h:139
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:215
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define iterate(l)
Definition: oldlist.h:159
MERGE_CLASS_NODE * MERGE_CLASS
CLASS_TYPE Class
void FreeLabeledList ( LABELEDLIST  LabeledList)

This routine deallocates all of the memory consumed by a labeled list. It does not free any memory which may be consumed by the items in the list.

Parameters
LabeledList: labeled list to be freed
Note
Globals: none
Returns
none
Note
Exceptions: none
History: Fri Aug 18 17:52:45 1989, DSJ, Created.

Definition at line 487 of file commontraining.cpp.

487  {
488  destroy(LabeledList->List);
489  free(LabeledList->Label);
490  free(LabeledList);
491 } /* FreeLabeledList */
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeNormProtoList ( LIST  CharList)

Definition at line 838 of file commontraining.cpp.

841 {
842  LABELEDLIST char_sample;
843 
844  iterate (CharList) /* iterate thru all of the fonts */
845  {
846  char_sample = (LABELEDLIST) first_node (CharList);
847  FreeLabeledList (char_sample);
848  }
849  destroy (CharList);
850 
851 } // FreeNormProtoList
void FreeLabeledList(LABELEDLIST LabeledList)
struct LABELEDLISTNODE * LABELEDLIST
#define first_node(l)
Definition: oldlist.h:139
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define iterate(l)
Definition: oldlist.h:159
void FreeTrainingSamples ( LIST  CharList)

This routine deallocates all of the space allocated to the specified list of training samples.

Parameters
CharList: list of all fonts in document
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 17:44:27 1989, DSJ, Created.

Definition at line 458 of file commontraining.cpp.

458  {
459  LABELEDLIST char_sample;
460  FEATURE_SET FeatureSet;
461  LIST FeatureList;
462 
463 
464  iterate(CharList) { /* iterate thru all of the fonts */
465  char_sample = (LABELEDLIST) first_node(CharList);
466  FeatureList = char_sample->List;
467  iterate(FeatureList) { /* iterate thru all of the classes */
468  FeatureSet = (FEATURE_SET) first_node(FeatureList);
469  FreeFeatureSet(FeatureSet);
470  }
471  FreeLabeledList(char_sample);
472  }
473  destroy(CharList);
474 } /* FreeTrainingSamples */
void FreeLabeledList(LABELEDLIST LabeledList)
struct LABELEDLISTNODE * LABELEDLIST
#define first_node(l)
Definition: oldlist.h:139
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define iterate(l)
Definition: oldlist.h:159
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:78
const char* GetNextFilename ( int  argc,
const char *const *  argv 
)

This routine returns the next command line argument. If there are no remaining command line arguments, it returns NULL. This routine should only be called after all option arguments have been parsed and removed with ParseArguments.

Globals:

  • tessoptind defined by tessopt sys call
    Returns
    Next command line argument or NULL.
    Note
    Exceptions: none
    History: Fri Aug 18 09:34:12 1989, DSJ, Created.

Definition at line 310 of file commontraining.cpp.

310  {
311  if (tessoptind < argc)
312  return argv[tessoptind++];
313  else
314  return NULL;
315 } /* GetNextFilename */
#define NULL
Definition: host.h:144
int tessoptind
Definition: tessopt.cpp:24
INT_PARAM_FLAG ( debug_level  ,
0  ,
"Level of Trainer debugging"   
)
INT_PARAM_FLAG ( load_images  ,
0  ,
"Load images with tr files"   
)
void MergeInsignificantProtos ( LIST  ProtoList,
const char *  label,
CLUSTERER *  Clusterer,
CLUSTERCONFIG *  Config 
)

Definition at line 541 of file commontraining.cpp.

542  {
543  PROTOTYPE *Prototype;
544  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
545 
546  LIST pProtoList = ProtoList;
547  iterate(pProtoList) {
548  Prototype = (PROTOTYPE *) first_node (pProtoList);
549  if (Prototype->Significant || Prototype->Merged)
550  continue;
551  FLOAT32 best_dist = 0.125;
552  PROTOTYPE* best_match = NULL;
553  // Find the nearest alive prototype.
554  LIST list_it = ProtoList;
555  iterate(list_it) {
556  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
557  if (test_p != Prototype && !test_p->Merged) {
558  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
559  Clusterer->ParamDesc,
560  Prototype->Mean, test_p->Mean);
561  if (dist < best_dist) {
562  best_match = test_p;
563  best_dist = dist;
564  }
565  }
566  }
567  if (best_match != NULL && !best_match->Significant) {
568  if (debug)
569  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
570  best_match->NumSamples, Prototype->NumSamples,
571  best_match->Mean[0], best_match->Mean[1],
572  Prototype->Mean[0], Prototype->Mean[1]);
573  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
574  Clusterer->ParamDesc,
575  best_match->NumSamples,
576  Prototype->NumSamples,
577  best_match->Mean,
578  best_match->Mean, Prototype->Mean);
579  Prototype->NumSamples = 0;
580  Prototype->Merged = 1;
581  } else if (best_match != NULL) {
582  if (debug)
583  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
584  Prototype->Mean[0], Prototype->Mean[1],
585  best_match->Mean[0], best_match->Mean[1]);
586  Prototype->Merged = 1;
587  }
588  }
589  // Mark significant those that now have enough samples.
590  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
591  pProtoList = ProtoList;
592  iterate(pProtoList) {
593  Prototype = (PROTOTYPE *) first_node (pProtoList);
594  // Process insignificant protos that do not match a green one
595  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
596  !Prototype->Merged) {
597  if (debug)
598  tprintf("Red proto at %g,%g becoming green\n",
599  Prototype->Mean[0], Prototype->Mean[1]);
600  Prototype->Significant = true;
601  }
602  }
603 } /* MergeInsignificantProtos */
float FLOAT32
Definition: host.h:111
#define tprintf(...)
Definition: tprintf.h:31
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Significant
Definition: cluster.h:68
unsigned NumSamples
Definition: cluster.h:75
FLOAT32 MinSamples
Definition: cluster.h:50
inT32 NumChar
Definition: cluster.h:93
#define first_node(l)
Definition: oldlist.h:139
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:473
#define iterate(l)
Definition: oldlist.h:159
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
#define NULL
Definition: host.h:144
unsigned Merged
Definition: cluster.h:69
inT16 SampleSize
Definition: cluster.h:87
int inT32
Definition: host.h:102
MERGE_CLASS NewLabeledClass ( const char *  Label)

Definition at line 718 of file commontraining.cpp.

720 {
721  MERGE_CLASS MergeClass;
722 
723  MergeClass = new MERGE_CLASS_NODE;
724  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
725  strcpy (MergeClass->Label, Label);
726  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
727  return (MergeClass);
728 
729 } /* NewLabeledClass */
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:248
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define MAX_NUM_PROTOS
Definition: intproto.h:47
CLASS_TYPE Class
LABELEDLIST NewLabeledList ( const char *  Label)

This routine allocates a new, empty labeled list and gives it the specified label.

Parameters
Label: label for new list
Returns
New, empty labeled list.
Note
Globals: none
Exceptions: none
History: Fri Aug 18 16:08:46 1989, DSJ, Created.

Definition at line 357 of file commontraining.cpp.

359 {
360  LABELEDLIST LabeledList;
361 
362  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
363  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
364  strcpy (LabeledList->Label, Label);
365  LabeledList->List = NIL_LIST;
366  LabeledList->SampleCount = 0;
367  LabeledList->font_sample_count = 0;
368  return (LabeledList);
369 
370 } /* NewLabeledList */
#define NIL_LIST
Definition: oldlist.h:126
struct LABELEDLISTNODE * LABELEDLIST
void * Emalloc(int Size)
Definition: emalloc.cpp:47
void Normalize ( float *  Values)

Definition at line 821 of file commontraining.cpp.

823 {
824  register float Slope;
825  register float Intercept;
826  register float Normalizer;
827 
828  Slope = tan (Values [2] * 2 * PI);
829  Intercept = Values [1] - Slope * Values [0];
830  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
831 
832  Values [0] = Slope * Normalizer;
833  Values [1] = - Normalizer;
834  Values [2] = Intercept * Normalizer;
835 } // Normalize
#define PI
Definition: const.h:19
int NumberOfProtos ( LIST  ProtoList,
BOOL8  CountSigProtos,
BOOL8  CountInsigProtos 
)

Definition at line 872 of file commontraining.cpp.

876 {
877  int N = 0;
878  PROTOTYPE *Proto;
879 
880  iterate(ProtoList)
881  {
882  Proto = (PROTOTYPE *) first_node ( ProtoList );
883  if (( Proto->Significant && CountSigProtos ) ||
884  ( ! Proto->Significant && CountInsigProtos ) )
885  N++;
886  }
887  return(N);
888 }
unsigned Significant
Definition: cluster.h:68
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
void ParseArguments ( int *  argc,
char ***  argv 
)

This routine parses the command line arguments that were passed to the program and uses them to set relevant training-related global parameters.

Globals:

  • Config current clustering parameters
    Parameters
    argc: number of command line arguments to parse
    argv: command line arguments
    Returns
    none
    Note
    Exceptions: Illegal options terminate the program.

Definition at line 88 of file commontraining.cpp.

88  {
89  STRING usage;
90  if (*argc) {
91  usage += (*argv)[0];
92  }
93  usage += " [.tr files ...]";
94  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
95  // Record the index of the first non-flag argument to 1, since we set
96  // remove_flags to true when parsing the flags.
97  tessoptind = 1;
98  // Set some global values based on the flags.
99  Config.MinSamples =
100  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
101  Config.MaxIllegal =
102  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
103  Config.Independence =
104  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
105  Config.Confidence =
106  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
107  // Set additional parameters from config file if specified.
108  if (!FLAGS_configfile.empty()) {
109  tesseract::ParamUtils::ReadParamsFile(
110  FLAGS_configfile.c_str(),
111  tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
112  ccutil.params());
113  }
114 }
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
FLOAT32 Independence
Definition: cluster.h:53
FLOAT32 MaxIllegal
Definition: cluster.h:51
FLOAT64 Confidence
Definition: cluster.h:54
FLOAT32 MinSamples
Definition: cluster.h:50
ParamsVectors * params()
Definition: ccutil.h:65
CLUSTERCONFIG Config
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
STRING
Definition: strngs.h:44
CCUtil ccutil
int tessoptind
Definition: tessopt.cpp:24
const char * c_str() const
Definition: strngs.cpp:204
void ReadTrainingSamples ( const FEATURE_DEFS_STRUCT &  feature_defs,
const char *  feature_name,
int  max_samples,
UNICHARSET *  unicharset,
FILE *  file,
LIST *  training_samples 
)

This routine reads training samples from a file and places them into a data structure which organizes the samples by FontName and CharName. It then returns this data structure.

Parameters
file: open text file to read samples from
feature_defs
feature_name
max_samples
unicharset
training_samples
Returns
none
Note
Globals: none
Exceptions: none
History:
  • Fri Aug 18 13:11:39 1989, DSJ, Created.
  • Tue May 17 1998 simplifications to structure, eliminated font, and feature specification levels of structure.

Definition at line 394 of file commontraining.cpp.

397  {
398  char buffer[2048];
399  char unichar[UNICHAR_LEN + 1];
400  LABELEDLIST char_sample;
401  FEATURE_SET feature_samples;
402  CHAR_DESC char_desc;
403  int i;
404  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
405  // Zero out the font_sample_count for all the classes.
406  LIST it = *training_samples;
407  iterate(it) {
408  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
409  char_sample->font_sample_count = 0;
410  }
411 
412  while (fgets(buffer, 2048, file) != NULL) {
413  if (buffer[0] == '\n')
414  continue;
415 
416  sscanf(buffer, "%*s %s", unichar);
417  if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
418  unicharset->unichar_insert(unichar);
419  if (unicharset->size() > MAX_NUM_CLASSES) {
420  tprintf("Error: Size of unicharset in training is "
421  "greater than MAX_NUM_CLASSES\n");
422  exit(1);
423  }
424  }
425  char_sample = FindList(*training_samples, unichar);
426  if (char_sample == NULL) {
427  char_sample = NewLabeledList(unichar);
428  *training_samples = push(*training_samples, char_sample);
429  }
430  char_desc = ReadCharDescription(feature_defs, file);
431  feature_samples = char_desc->FeatureSets[feature_type];
432  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
433  char_sample->List = push(char_sample->List, feature_samples);
434  char_sample->SampleCount++;
435  char_sample->font_sample_count++;
436  } else {
437  FreeFeatureSet(feature_samples);
438  }
439  for (i = 0; i < char_desc->NumFeatureSets; i++) {
440  if (feature_type != i)
441  FreeFeatureSet(char_desc->FeatureSets[i]);
442  }
443  free(char_desc);
444  }
445 } // ReadTrainingSamples
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define tprintf(...)
Definition: tprintf.h:31
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
uinT32 NumFeatureSets
Definition: featdefs.h:43
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
#define NULL
Definition: host.h:144
LABELEDLIST FindList(LIST List, char *Label)
#define UNICHAR_LEN
Definition: unichar.h:30
int size() const
Definition: unicharset.h:297
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:78
LABELEDLIST NewLabeledList(const char *Label)
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
LIST RemoveInsignificantProtos ( LIST  ProtoList,
BOOL8  KeepSigProtos,
BOOL8  KeepInsigProtos,
int  N 
)

Definition at line 633 of file commontraining.cpp.

639 {
640  LIST NewProtoList = NIL_LIST;
641  LIST pProtoList;
642  PROTOTYPE* Proto;
643  PROTOTYPE* NewProto;
644  int i;
645 
646  pProtoList = ProtoList;
647  iterate(pProtoList)
648  {
649  Proto = (PROTOTYPE *) first_node (pProtoList);
650  if ((Proto->Significant && KeepSigProtos) ||
651  (!Proto->Significant && KeepInsigProtos))
652  {
653  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
654 
655  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
656  NewProto->Significant = Proto->Significant;
657  NewProto->Style = Proto->Style;
658  NewProto->NumSamples = Proto->NumSamples;
659  NewProto->Cluster = NULL;
660  NewProto->Distrib = NULL;
661 
662  for (i=0; i < N; i++)
663  NewProto->Mean[i] = Proto->Mean[i];
664  if (Proto->Variance.Elliptical != NULL)
665  {
666  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
667  for (i=0; i < N; i++)
668  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
669  }
670  else
671  NewProto->Variance.Elliptical = NULL;
672  //---------------------------------------------
673  if (Proto->Magnitude.Elliptical != NULL)
674  {
675  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
676  for (i=0; i < N; i++)
677  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
678  }
679  else
680  NewProto->Magnitude.Elliptical = NULL;
681  //------------------------------------------------
682  if (Proto->Weight.Elliptical != NULL)
683  {
684  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
685  for (i=0; i < N; i++)
686  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
687  }
688  else
689  NewProto->Weight.Elliptical = NULL;
690 
691  NewProto->TotalMagnitude = Proto->TotalMagnitude;
692  NewProto->LogMagnitude = Proto->LogMagnitude;
693  NewProtoList = push_last(NewProtoList, NewProto);
694  }
695  }
696  FreeProtoList(&ProtoList);
697  return (NewProtoList);
698 } /* RemoveInsignificantProtos */
float FLOAT32
Definition: host.h:111
#define NIL_LIST
Definition: oldlist.h:126
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Significant
Definition: cluster.h:68
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 TotalMagnitude
Definition: cluster.h:79
unsigned NumSamples
Definition: cluster.h:75
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
unsigned Style
Definition: cluster.h:74
#define NULL
Definition: host.h:144
CLUSTERER* SetUpForClustering ( const FEATURE_DEFS_STRUCT &  FeatureDefs,
LABELEDLIST  char_sample,
const char *  program_feature_type 
)

This routine reads samples from a LABELEDLIST and enters those samples into a clusterer data structure. This data structure is then returned to the caller.

Parameters
char_sample: LABELEDLIST that holds all the feature information for a given character.
FeatureDefs
program_feature_type
Returns
Pointer to new clusterer data structure.
Note
Globals: None
Exceptions: None
History: 8/16/89, DSJ, Created.

Definition at line 507 of file commontraining.cpp.

509  {
510  uinT16 N;
511  int i, j;
512  FLOAT32 *Sample = NULL;
513  CLUSTERER *Clusterer;
514  inT32 CharID;
515  LIST FeatureList = NULL;
516  FEATURE_SET FeatureSet = NULL;
517 
518  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
519  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
520  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
521 
522  FeatureList = char_sample->List;
523  CharID = 0;
524  iterate(FeatureList) {
525  FeatureSet = (FEATURE_SET) first_node(FeatureList);
526  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
527  if (Sample == NULL)
528  Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
529  for (j = 0; j < N; j++)
530  Sample[j] = FeatureSet->Features[i]->Params[j];
531  MakeSample (Clusterer, Sample, CharID);
532  }
533  CharID++;
534  }
535  if ( Sample != NULL ) free( Sample );
536  return( Clusterer );
537 
538 } /* SetUpForClustering */
float FLOAT32
Definition: host.h:111
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:400
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:457
FEATURE Features[1]
Definition: ocrfeatures.h:72
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
#define NULL
Definition: host.h:144
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
unsigned short uinT16
Definition: host.h:101
int inT32
Definition: host.h:102
CLASS_STRUCT* SetUpForFloat2Int ( const UNICHARSET &  unicharset,
LIST  LabeledClassList 
)

Definition at line 758 of file commontraining.cpp.

759  {
760  MERGE_CLASS MergeClass;
761  CLASS_TYPE Class;
762  int NumProtos;
763  int NumConfigs;
764  int NumWords;
765  int i, j;
766  float Values[3];
767  PROTO NewProto;
768  PROTO OldProto;
769  BIT_VECTOR NewConfig;
770  BIT_VECTOR OldConfig;
771 
772  // printf("Float2Int ...\n");
773 
774  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
775  iterate(LabeledClassList)
776  {
777  UnicityTableEqEq<int> font_set;
778  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
779  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
780  NumProtos = MergeClass->Class->NumProtos;
781  NumConfigs = MergeClass->Class->NumConfigs;
782  font_set.move(&MergeClass->Class->font_set);
783  Class->NumProtos = NumProtos;
784  Class->MaxNumProtos = NumProtos;
785  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
786  for(i=0; i < NumProtos; i++)
787  {
788  NewProto = ProtoIn(Class, i);
789  OldProto = ProtoIn(MergeClass->Class, i);
790  Values[0] = OldProto->X;
791  Values[1] = OldProto->Y;
792  Values[2] = OldProto->Angle;
793  Normalize(Values);
794  NewProto->X = OldProto->X;
795  NewProto->Y = OldProto->Y;
796  NewProto->Length = OldProto->Length;
797  NewProto->Angle = OldProto->Angle;
798  NewProto->A = Values[0];
799  NewProto->B = Values[1];
800  NewProto->C = Values[2];
801  }
802 
803  Class->NumConfigs = NumConfigs;
804  Class->MaxNumConfigs = NumConfigs;
805  Class->font_set.move(&font_set);
806  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
807  NumWords = WordsInVectorOfSize(NumProtos);
808  for(i=0; i < NumConfigs; i++)
809  {
810  NewConfig = NewBitVector(NumProtos);
811  OldConfig = MergeClass->Class->Configurations[i];
812  for(j=0; j < NumWords; j++)
813  NewConfig[j] = OldConfig[j];
814  Class->Configurations[i] = NewConfig;
815  }
816  }
817  return float_classes;
818 } // SetUpForFloat2Int
PROTO_STRUCT * PROTO
Definition: protos.h:52
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
#define ProtoIn(Class, Pid)
Definition: protos.h:123
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
inT16 NumConfigs
Definition: protos.h:62
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:90
UnicityTableEqEq< int > font_set
Definition: protos.h:65
inT16 NumProtos
Definition: protos.h:59
FLOAT32 X
Definition: protos.h:47
void Normalize(float *Values)
FLOAT32 Angle
Definition: protos.h:49
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT16 MaxNumConfigs
Definition: protos.h:63
FLOAT32 B
Definition: protos.h:45
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 Length
Definition: protos.h:50
void move(UnicityTable< T > *from)
FLOAT32 C
Definition: protos.h:46
int size() const
Definition: unicharset.h:297
FLOAT32 A
Definition: protos.h:44
inT16 MaxNumProtos
Definition: protos.h:60
MERGE_CLASS_NODE * MERGE_CLASS
PROTO Prototypes
Definition: protos.h:61
CLASS_TYPE Class
CONFIGS Configurations
Definition: protos.h:64
FLOAT32 Y
Definition: protos.h:48
STRING_PARAM_FLAG ( configfile  ,
""  ,
"File to load more configs from"   
)
STRING_PARAM_FLAG ( D  ,
""  ,
"Directory to write output files to"   
)
STRING_PARAM_FLAG ( F  ,
"font_properties"  ,
"File listing font properties"   
)
STRING_PARAM_FLAG ( X  ,
""  ,
"File listing font xheights"   
)
STRING_PARAM_FLAG ( U  ,
"unicharset"  ,
"File to load unicharset from"   
)
STRING_PARAM_FLAG ( O  ,
""  ,
"File to write unicharset to"   
)
STRING_PARAM_FLAG ( T  ,
""  ,
"File to load trainer from"   
)
STRING_PARAM_FLAG ( output_trainer  ,
""  ,
"File to write trainer to"   
)
STRING_PARAM_FLAG ( test_ch  ,
""  ,
"UTF8 test character string"   
)

Variable Documentation

CCUtil ccutil

Definition at line 53 of file commontraining.cpp.

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }

Definition at line 51 of file commontraining.cpp.

FEATURE_DEFS_STRUCT feature_defs

Definition at line 52 of file commontraining.cpp.