#include <cfloat>
#include <cmath>
#include <vector>
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include <tesseract/helpers.h>
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"

Classes
struct	TEMPCLUSTER

struct	STATISTICS

struct	BUCKETS

struct	CHISTRUCT

struct	ClusteringContext

Macros
#define	_USE_MATH_DEFINES

#define	HOTELLING 1

#define	FTABLE_X 10

#define	FTABLE_Y 100

#define	MINVARIANCE 0.0004

#define	MINSAMPLESPERBUCKET 5

#define	MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)

#define	MINSAMPLESNEEDED 1

#define	BUCKETTABLESIZE 1024

#define	NORMALEXTENT 3.0

#define	Odd(N) ((N)%2)

#define	Mirror(N, R) ((R) - (N) - 1)

#define	Abs(N) (((N) < 0) ? (-(N)) : (N))

#define	SqrtOf2Pi 2.506628275

#define	LOOKUPTABLESIZE 8

#define	MAXDEGREESOFFREEDOM MAXBUCKETS

#define	MAXNEIGHBORS 2

#define	MAXDISTANCE FLT_MAX

#define	CHIACCURACY 0.01

#define	MINALPHA (1e-200)

#define	INITIALDELTA 0.1

#define	DELTARATIO 0.1

#define	ILLEGAL_CHAR 2

Typedefs
using	ClusterPair = tesseract::KDPairInc< float, TEMPCLUSTER * >

using	ClusterHeap = tesseract::GenericHeap< ClusterPair >

using	DENSITYFUNC = double(*)(int32_t)

using	SOLVEFUNC = double(*)(CHISTRUCT *, double)

Functions
CLUSTERER *	MakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[])

SAMPLE *	MakeSample (CLUSTERER *Clusterer, const float *Feature, int32_t CharID)

LIST	ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)

void	FreeClusterer (CLUSTERER *Clusterer)

void	FreeProtoList (LIST *ProtoList)

void	FreePrototype (void *arg)

CLUSTER *	NextSample (LIST *SearchState)

float	Mean (PROTOTYPE *Proto, uint16_t Dimension)

float	StandardDeviation (PROTOTYPE *Proto, uint16_t Dimension)

int32_t	MergeClusters (int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])

Variables
const double	FTable [FTABLE_Y][FTABLE_X]

Macro Definition Documentation

◆ _USE_MATH_DEFINES

#define _USE_MATH_DEFINES

Definition at line 17 of file cluster.cpp.

◆ Abs

#define Abs ( N ) (((N) < 0) ? (-(N)) : (N))

Definition at line 209 of file cluster.cpp.

◆ BUCKETTABLESIZE

#define BUCKETTABLESIZE 1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 161 of file cluster.cpp.

◆ CHIACCURACY

#define CHIACCURACY 0.01

◆ DELTARATIO

#define DELTARATIO 0.1

◆ FTABLE_X

#define FTABLE_X 10

Definition at line 32 of file cluster.cpp.

◆ FTABLE_Y

#define FTABLE_Y 100

Definition at line 33 of file cluster.cpp.

◆ HOTELLING

#define HOTELLING 1

Definition at line 31 of file cluster.cpp.

◆ ILLEGAL_CHAR

#define ILLEGAL_CHAR 2

◆ INITIALDELTA

#define INITIALDELTA 0.1

◆ LOOKUPTABLESIZE

#define LOOKUPTABLESIZE 8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 229 of file cluster.cpp.

◆ MAXDEGREESOFFREEDOM

#define MAXDEGREESOFFREEDOM MAXBUCKETS

Definition at line 230 of file cluster.cpp.

◆ MAXDISTANCE

#define MAXDISTANCE FLT_MAX

◆ MAXNEIGHBORS

#define MAXNEIGHBORS 2

◆ MINALPHA

#define MINALPHA (1e-200)

◆ MINSAMPLES

#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 152 of file cluster.cpp.

◆ MINSAMPLESNEEDED

#define MINSAMPLESNEEDED 1

Definition at line 153 of file cluster.cpp.

◆ MINSAMPLESPERBUCKET

#define MINSAMPLESPERBUCKET 5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 151 of file cluster.cpp.

◆ MINVARIANCE

#define MINVARIANCE 0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 143 of file cluster.cpp.

◆ Mirror

#define Mirror	(	N,
		R
	)	((R) - (N) - 1)

Definition at line 208 of file cluster.cpp.

◆ NORMALEXTENT

#define NORMALEXTENT 3.0

Definition at line 162 of file cluster.cpp.

◆ Odd

#define Odd ( N ) ((N)%2)

Definition at line 207 of file cluster.cpp.

◆ SqrtOf2Pi

#define SqrtOf2Pi 2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 219 of file cluster.cpp.

Typedef Documentation

◆ ClusterHeap

using ClusterHeap = tesseract::GenericHeap<ClusterPair>

Definition at line 170 of file cluster.cpp.

◆ ClusterPair

using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER*>

Definition at line 169 of file cluster.cpp.

◆ DENSITYFUNC

using DENSITYFUNC = double (*)(int32_t)

Definition at line 204 of file cluster.cpp.

◆ SOLVEFUNC

using SOLVEFUNC = double (*)(CHISTRUCT*, double)

Definition at line 205 of file cluster.cpp.

Function Documentation

◆ ClusterSamples()

LIST ClusterSamples	(	CLUSTERER *	Clusterer,
		CLUSTERCONFIG *	Config
	)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters

Clusterer	data struct containing samples to be clustered
Config	parameters which control clustering process

Returns: Pointer to a list of prototypes

Definition at line 483 of file cluster.cpp.

                                                                  {
   //only create cluster tree if samples have never been clustered before
   if (Clusterer->Root == nullptr)
     CreateClusterTree(Clusterer);
  
   //deallocate the old prototype list if one exists
   FreeProtoList (&Clusterer->ProtoList);
   Clusterer->ProtoList = NIL_LIST;
  
   //compute prototypes starting at the root node in the tree
   ComputePrototypes(Clusterer, Config);
   // We don't need the cluster pointers in the protos any more, so null them
   // out, which makes it safe to delete the clusterer.
   LIST proto_list = Clusterer->ProtoList;
   iterate(proto_list) {
     auto *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
     proto->Cluster = nullptr;
   }
   return Clusterer->ProtoList;
 }                                // ClusterSamples

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER * Clusterer )

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. Note that this routine does not itself modify the prototypes: the pointers to the clusters for each prototype in the list are already set to nullptr by ClusterSamples(), which is what makes it safe to free the clusterer while the prototype list is still in use. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters

Clusterer pointer to data structure to be freed

Definition at line 514 of file cluster.cpp.

                                          {
   if (Clusterer != nullptr) {
     free(Clusterer->ParamDesc);
     if (Clusterer->KDTree != nullptr)
       FreeKDTree (Clusterer->KDTree);
     if (Clusterer->Root != nullptr)
       FreeCluster (Clusterer->Root);
     // Free up all used buckets structures.
     for (auto & d : Clusterer->bucket_cache) {
       for (auto & c : d)
         if (c != nullptr)
           FreeBuckets(c);
     }
  
     free(Clusterer);
   }
 }                                // FreeClusterer

◆ FreeProtoList()

void FreeProtoList ( LIST * ProtoList )

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters

ProtoList pointer to list of prototypes to be freed

Definition at line 538 of file cluster.cpp.

                                     {
   // Hand the whole list to destroy_nodes with FreePrototype as the
   // destructor callback (presumably invoked once per stored prototype --
   // confirm against destroy_nodes).
   // NOTE(review): *ProtoList is not reset here; ClusterSamples assigns
   // NIL_LIST to its list right after calling this routine.
   destroy_nodes(*ProtoList, FreePrototype);
 }                                // FreeProtoList

◆ FreePrototype()

void FreePrototype ( void * arg )

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters

arg	prototype data structure to be deallocated

Definition at line 549 of file cluster.cpp.

                               {  //PROTOTYPE     *Prototype)
   auto *Prototype = static_cast<PROTOTYPE *>(arg);
  
   // unmark the corresponding cluster (if there is one
   if (Prototype->Cluster != nullptr)
     Prototype->Cluster->Prototype = false;
  
   // deallocate the prototype statistics and then the prototype itself
   free(Prototype->Distrib);
   free(Prototype->Mean);
   if (Prototype->Style != spherical) {
     free(Prototype->Variance.Elliptical);
     free(Prototype->Magnitude.Elliptical);
     free(Prototype->Weight.Elliptical);
   }
   free(Prototype);
 }                                // FreePrototype

◆ MakeClusterer()

CLUSTERER* MakeClusterer	(	int16_t	SampleSize,
		const PARAM_DESC	ParamDesc[]
	)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters

SampleSize	number of dimensions in feature space
ParamDesc	description of each dimension

Returns: pointer to the new clusterer data structure

Definition at line 376 of file cluster.cpp.

                                                                  {
   CLUSTERER *Clusterer;
   int i;
  
   // allocate main clusterer data structure and init simple fields
   Clusterer = static_cast<CLUSTERER *>(Emalloc (sizeof (CLUSTERER)));
   Clusterer->SampleSize = SampleSize;
   Clusterer->NumberOfSamples = 0;
   Clusterer->NumChar = 0;
  
   // init fields which will not be used initially
   Clusterer->Root = nullptr;
   Clusterer->ProtoList = NIL_LIST;
  
   // maintain a copy of param descriptors in the clusterer data structure
   Clusterer->ParamDesc =
     static_cast<PARAM_DESC *>(Emalloc (SampleSize * sizeof (PARAM_DESC)));
   for (i = 0; i < SampleSize; i++) {
     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
     Clusterer->ParamDesc[i].MidRange =
       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
   }
  
   // allocate a kd tree to hold the samples
   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
  
   // Initialize cache of histogram buckets to minimize recomputing them.
   for (auto & d : Clusterer->bucket_cache) {
     for (auto & c : d)
       c = nullptr;
   }
  
   return Clusterer;
 }                                // MakeClusterer

◆ MakeSample()

SAMPLE* MakeSample	(	CLUSTERER *	Clusterer,
		const float *	Feature,
		int32_t	CharID
	)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters

Clusterer	clusterer data structure to add sample to
Feature	feature to be added to clusterer
CharID	unique ident. of char that sample came from

Returns: Pointer to the new sample data structure

Definition at line 429 of file cluster.cpp.

                                    {
   SAMPLE *Sample;
   int i;
  
   // see if the samples have already been clustered - if so trap an error
   // Can't add samples after they have been clustered.
   ASSERT_HOST(Clusterer->Root == nullptr);
  
   // allocate the new sample and initialize it
   Sample = static_cast<SAMPLE *>(Emalloc (sizeof (SAMPLE) +
     (Clusterer->SampleSize -
     1) * sizeof (float)));
   Sample->Clustered = false;
   Sample->Prototype = false;
   Sample->SampleCount = 1;
   Sample->Left = nullptr;
   Sample->Right = nullptr;
   Sample->CharID = CharID;
  
   for (i = 0; i < Clusterer->SampleSize; i++)
     Sample->Mean[i] = Feature[i];
  
   // add the sample to the KD tree - keep track of the total # of samples
   Clusterer->NumberOfSamples++;
   KDStore(Clusterer->KDTree, Sample->Mean, Sample);
   if (CharID >= Clusterer->NumChar)
     Clusterer->NumChar = CharID + 1;
  
   // execute hook for monitoring clustering operation
   // (*SampleCreationHook)(Sample);
  
   return (Sample);
 }                                // MakeSample

◆ Mean()

float Mean	(	PROTOTYPE *	Proto,
		uint16_t	Dimension
	)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters

Proto	prototype to return mean of
Dimension	dimension whose mean is to be returned

Returns: Mean of Prototype in Dimension

Definition at line 602 of file cluster.cpp.

                                                  {
   return (Proto->Mean[Dimension]);
 }                                // Mean

◆ MergeClusters()

int32_t MergeClusters	(	int16_t	N,
		PARAM_DESC	ParamDesc[],
		int32_t	n1,
		int32_t	n2,
		float	m[],
		float	m1[],
		float	m2[]
	)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters

N	# of dimensions (size of arrays)
ParamDesc	array of dimension descriptions
n1,n2	number of samples in each old cluster
m	array to hold mean of new cluster
m1,m2	arrays containing means of old clusters

Returns: The number of samples in the new cluster.

Definition at line 824 of file cluster.cpp.

                                             {
   int32_t i, n;
  
   n = n1 + n2;
   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
     if (ParamDesc->Circular) {
       // if distance between means is greater than allowed
       // reduce upper point by one "rotation" to compute mean
       // then normalize the mean back into the accepted range
       if ((*m2 - *m1) > ParamDesc->HalfRange) {
         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
         if (*m < ParamDesc->Min)
           *m += ParamDesc->Range;
       }
       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
         if (*m < ParamDesc->Min)
           *m += ParamDesc->Range;
       }
       else
         *m = (n1 * *m1 + n2 * *m2) / n;
     }
     else
       *m = (n1 * *m1 + n2 * *m2) / n;
   }
   return n;
 }                                // MergeClusters

◆ NextSample()

CLUSTER* NextSample ( LIST * SearchState )

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, nullptr is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters

SearchState ptr to list containing clusters to be searched

Returns: Pointer to the next leaf cluster (sample) or nullptr.

Definition at line 580 of file cluster.cpp.

                                        {
   CLUSTER *Cluster;
  
   if (*SearchState == NIL_LIST)
     return (nullptr);
   Cluster = reinterpret_cast<CLUSTER *>first_node (*SearchState);
   *SearchState = pop (*SearchState);
   for (;;) {
     if (Cluster->Left == nullptr)
       return (Cluster);
     *SearchState = push (*SearchState, Cluster->Right);
     Cluster = Cluster->Left;
   }
 }                                // NextSample

◆ StandardDeviation()

float StandardDeviation	(	PROTOTYPE *	Proto,
		uint16_t	Dimension
	)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters

Proto	prototype to return standard deviation of
Dimension	dimension whose stddev is to be returned

Returns: Standard deviation of Prototype in Dimension

Definition at line 613 of file cluster.cpp.

                                                               {
   switch (Proto->Style) {
     case spherical:
       return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Spherical))));
     case elliptical:
       return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
     case mixed:
       switch (Proto->Distrib[Dimension]) {
         case normal:
           return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
         case uniform:
         case D_random:
           return (Proto->Variance.Elliptical[Dimension]);
         case DISTRIBUTION_COUNT:
           ASSERT_HOST(!"Distribution count not allowed!");
       }
   }
   return 0.0f;
 }                                // StandardDeviation

Variable Documentation

◆ FTable

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 36 of file cluster.cpp.

Classes

Macros

Typedefs

Functions

Variables

Macro Definition Documentation

◆ _USE_MATH_DEFINES

◆ Abs

◆ BUCKETTABLESIZE

◆ CHIACCURACY

◆ DELTARATIO

◆ FTABLE_X

◆ FTABLE_Y

◆ HOTELLING

◆ ILLEGAL_CHAR

◆ INITIALDELTA

◆ LOOKUPTABLESIZE

◆ MAXDEGREESOFFREEDOM

◆ MAXDISTANCE

◆ MAXNEIGHBORS

◆ MINALPHA

◆ MINSAMPLES

◆ MINSAMPLESNEEDED

◆ MINSAMPLESPERBUCKET

◆ MINVARIANCE

◆ Mirror

◆ NORMALEXTENT

◆ Odd

◆ SqrtOf2Pi

Typedef Documentation

◆ ClusterHeap

◆ ClusterPair

◆ DENSITYFUNC

◆ SOLVEFUNC

Function Documentation

◆ ClusterSamples()

◆ FreeClusterer()

◆ FreeProtoList()

◆ FreePrototype()

◆ MakeClusterer()

◆ MakeSample()

◆ Mean()

◆ MergeClusters()

◆ NextSample()

◆ StandardDeviation()

Variable Documentation

◆ FTable