tesseract  4.0.0-1-g2a2b
cluster.h File Reference
#include "kdtree.h"
#include "oldlist.h"

Go to the source code of this file.

Classes

struct  sample
 
struct  CLUSTERCONFIG
 
union  FLOATUNION
 
struct  PROTOTYPE
 
struct  CLUSTERER
 
struct  SAMPLELIST
 

Macros

#define MINBUCKETS   5
 
#define MAXBUCKETS   39
 
#define InitSampleSearch(S, C)   (((C)==nullptr)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
 

Typedefs

typedef struct sample CLUSTER
 
typedef CLUSTER SAMPLE
 

Enumerations

enum  PROTOSTYLE { spherical, elliptical, mixed, automatic }
 
enum  DISTRIBUTION { normal, uniform, D_random, DISTRIBUTION_COUNT }
 

Functions

CLUSTERERMakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
float Mean (PROTOTYPE *Proto, uint16_t Dimension)
 
float StandardDeviation (PROTOTYPE *Proto, uint16_t Dimension)
 
int32_t MergeClusters (int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
 

Macro Definition Documentation

◆ InitSampleSearch

#define InitSampleSearch (   S,
 
)    (((C)==nullptr)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))

Definition at line 105 of file cluster.h.

◆ MAXBUCKETS

#define MAXBUCKETS   39

Definition at line 27 of file cluster.h.

◆ MINBUCKETS

#define MINBUCKETS   5

Definition at line 26 of file cluster.h.

Typedef Documentation

◆ CLUSTER

typedef struct sample CLUSTER

◆ SAMPLE

typedef CLUSTER SAMPLE

Definition at line 42 of file cluster.h.

Enumeration Type Documentation

◆ DISTRIBUTION

Enumerator
normal 
uniform 
D_random 
DISTRIBUTION_COUNT 

Definition at line 58 of file cluster.h.

◆ PROTOSTYLE

enum PROTOSTYLE
Enumerator
spherical 
elliptical 
mixed 
automatic 

Definition at line 44 of file cluster.h.

44  {
46 } PROTOSTYLE;
PROTOSTYLE
Definition: cluster.h:44
Definition: cluster.h:45

Function Documentation

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes

Definition at line 506 of file cluster.cpp.

506  {
507  //only create cluster tree if samples have never been clustered before
508  if (Clusterer->Root == nullptr)
509  CreateClusterTree(Clusterer);
510 
511  //deallocate the old prototype list if one exists
512  FreeProtoList (&Clusterer->ProtoList);
513  Clusterer->ProtoList = NIL_LIST;
514 
515  //compute prototypes starting at the root node in the tree
516  ComputePrototypes(Clusterer, Config);
517  // We don't need the cluster pointers in the protos any more, so null them
518  // out, which makes it safe to delete the clusterer.
519  LIST proto_list = Clusterer->ProtoList;
520  iterate(proto_list) {
521  PROTOTYPE *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
522  proto->Cluster = nullptr;
523  }
524  return Clusterer->ProtoList;
525 } // ClusterSamples
CLUSTERCONFIG Config
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:894
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:678
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
LIST ProtoList
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:91
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
CLUSTER * Cluster
Definition: cluster.h:76
#define iterate(l)
Definition: oldlist.h:161

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to nullptr to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None

Definition at line 538 of file cluster.cpp.

538  {
539  if (Clusterer != nullptr) {
540  free(Clusterer->ParamDesc);
541  if (Clusterer->KDTree != nullptr)
542  FreeKDTree (Clusterer->KDTree);
543  if (Clusterer->Root != nullptr)
544  FreeCluster (Clusterer->Root);
545  // Free up all used buckets structures.
546  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
547  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
548  if (Clusterer->bucket_cache[d][c] != nullptr)
549  FreeBuckets(Clusterer->bucket_cache[d][c]);
550  }
551 
552  free(Clusterer);
553  }
554 } // FreeClusterer
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:333
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2078
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
#define MAXBUCKETS
Definition: cluster.h:27
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2093
CLUSTER * Root
Definition: cluster.h:91
#define MINBUCKETS
Definition: cluster.h:26

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None

Definition at line 563 of file cluster.cpp.

563  {
564  destroy_nodes(*ProtoList, FreePrototype);
565 } // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:575
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:186

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None

Definition at line 575 of file cluster.cpp.

575  { //PROTOTYPE *Prototype)
576  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
577 
578  // unmark the corresponding cluster (if there is one
579  if (Prototype->Cluster != nullptr)
580  Prototype->Cluster->Prototype = FALSE;
581 
582  // deallocate the prototype statistics and then the prototype itself
583  free(Prototype->Distrib);
584  free(Prototype->Mean);
585  if (Prototype->Style != spherical) {
586  free(Prototype->Variance.Elliptical);
587  free(Prototype->Magnitude.Elliptical);
588  free(Prototype->Weight.Elliptical);
589  }
590  free(Prototype);
591 } // FreePrototype
float * Mean
Definition: cluster.h:78
unsigned Prototype
Definition: cluster.h:34
float * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
#define FALSE
Definition: capi.h:52
FLOATUNION Magnitude
Definition: cluster.h:82
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Variance
Definition: cluster.h:81

◆ MakeClusterer()

CLUSTERER* MakeClusterer ( int16_t  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure

Definition at line 399 of file cluster.cpp.

399  {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = nullptr;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = nullptr;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
float MidRange
Definition: ocrfeatures.h:50
int8_t Circular
Definition: ocrfeatures.h:44
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:181
float HalfRange
Definition: ocrfeatures.h:49
float Min
Definition: ocrfeatures.h:46
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void * Emalloc(int Size)
Definition: emalloc.cpp:31
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
#define MAXBUCKETS
Definition: cluster.h:27
float Range
Definition: ocrfeatures.h:48
LIST ProtoList
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:91
int8_t NonEssential
Definition: ocrfeatures.h:45
int32_t NumChar
Definition: cluster.h:93
#define NIL_LIST
Definition: oldlist.h:127
#define MINBUCKETS
Definition: cluster.h:26
int32_t NumberOfSamples
Definition: cluster.h:89
float Max
Definition: ocrfeatures.h:47
int16_t SampleSize
Definition: cluster.h:87

◆ MakeSample()

SAMPLE* MakeSample ( CLUSTERER Clusterer,
const float *  Feature,
int32_t  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure

Definition at line 452 of file cluster.cpp.

453  {
454  SAMPLE *Sample;
455  int i;
456 
457  // see if the samples have already been clustered - if so trap an error
458  // Can't add samples after they have been clustered.
459  ASSERT_HOST(Clusterer->Root == nullptr);
460 
461  // allocate the new sample and initialize it
462  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
463  (Clusterer->SampleSize -
464  1) * sizeof (float));
465  Sample->Clustered = FALSE;
466  Sample->Prototype = FALSE;
467  Sample->SampleCount = 1;
468  Sample->Left = nullptr;
469  Sample->Right = nullptr;
470  Sample->CharID = CharID;
471 
472  for (i = 0; i < Clusterer->SampleSize; i++)
473  Sample->Mean[i] = Feature[i];
474 
475  // add the sample to the KD tree - keep track of the total # of samples
476  Clusterer->NumberOfSamples++;
477  KDStore(Clusterer->KDTree, Sample->Mean, Sample);
478  if (CharID >= Clusterer->NumChar)
479  Clusterer->NumChar = CharID + 1;
480 
481  // execute hook for monitoring clustering operation
482  // (*SampleCreationHook)(Sample);
483 
484  return (Sample);
485 } // MakeSample
Definition: cluster.h:32
struct sample * Left
Definition: cluster.h:36
void * Emalloc(int Size)
Definition: emalloc.cpp:31
int32_t CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, float *Key, void *Data)
Definition: kdtree.cpp:213
KDTREE * KDTree
Definition: cluster.h:90
unsigned SampleCount
Definition: cluster.h:35
float Mean[1]
Definition: cluster.h:39
#define FALSE
Definition: capi.h:52
CLUSTER * Root
Definition: cluster.h:91
int32_t NumChar
Definition: cluster.h:93
struct sample * Right
Definition: cluster.h:37
unsigned Clustered
Definition: cluster.h:33
int32_t NumberOfSamples
Definition: cluster.h:89
int16_t SampleSize
Definition: cluster.h:87
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ Mean()

float Mean ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension

Definition at line 628 of file cluster.cpp.

628  {
629  return (Proto->Mean[Dimension]);
630 } // Mean
float * Mean
Definition: cluster.h:78

◆ MergeClusters()

int32_t MergeClusters ( int16_t  N,
PARAM_DESC  ParamDesc[],
int32_t  n1,
int32_t  n2,
float  m[],
float  m1[],
float  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.

Definition at line 852 of file cluster.cpp.

857  {
858  int32_t i, n;
859 
860  n = n1 + n2;
861  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
862  if (ParamDesc->Circular) {
863  // if distance between means is greater than allowed
864  // reduce upper point by one "rotation" to compute mean
865  // then normalize the mean back into the accepted range
866  if ((*m2 - *m1) > ParamDesc->HalfRange) {
867  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
868  if (*m < ParamDesc->Min)
869  *m += ParamDesc->Range;
870  }
871  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
872  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
873  if (*m < ParamDesc->Min)
874  *m += ParamDesc->Range;
875  }
876  else
877  *m = (n1 * *m1 + n2 * *m2) / n;
878  }
879  else
880  *m = (n1 * *m1 + n2 * *m2) / n;
881  }
882  return n;
883 } // MergeClusters

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, nullptr is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or nullptr.

Definition at line 606 of file cluster.cpp.

606  {
607  CLUSTER *Cluster;
608 
609  if (*SearchState == NIL_LIST)
610  return (nullptr);
611  Cluster = (CLUSTER *) first_node (*SearchState);
612  *SearchState = pop (*SearchState);
613  while (TRUE) {
614  if (Cluster->Left == nullptr)
615  return (Cluster);
616  *SearchState = push (*SearchState, Cluster->Right);
617  Cluster = Cluster->Left;
618  }
619 } // NextSample
#define TRUE
Definition: capi.h:51
Definition: cluster.h:32
struct sample * Left
Definition: cluster.h:36
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
LIST pop(LIST list)
Definition: oldlist.cpp:266
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
struct sample * Right
Definition: cluster.h:37

◆ StandardDeviation()

float StandardDeviation ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension

Definition at line 639 of file cluster.cpp.

639  {
640  switch (Proto->Style) {
641  case spherical:
642  return ((float) sqrt ((double) Proto->Variance.Spherical));
643  case elliptical:
644  return ((float)
645  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
646  case mixed:
647  switch (Proto->Distrib[Dimension]) {
648  case normal:
649  return ((float)
650  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
651  case uniform:
652  case D_random:
653  return (Proto->Variance.Elliptical[Dimension]);
654  case DISTRIBUTION_COUNT:
655  ASSERT_HOST(!"Distribution count not allowed!");
656  }
657  }
658  return 0.0f;
659 } // StandardDeviation
float Spherical
Definition: cluster.h:63
float * Elliptical
Definition: cluster.h:64
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
Definition: cluster.h:59
FLOATUNION Variance
Definition: cluster.h:81
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:45