tesseract  4.0.0-1-g2a2b
cluster.cpp File Reference
#include <cfloat>
#include <cmath>
#include <vector>
#include "cluster.h"
#include "cutil.h"
#include "emalloc.h"
#include "genericheap.h"
#include "helpers.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   (((N) < 0) ? (-(N)) : (N))
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   FLT_MAX
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

using ClusterPair = tesseract::KDPairInc< float, TEMPCLUSTER * >
 
using ClusterHeap = tesseract::GenericHeap< ClusterPair >
 
typedef double(* DENSITYFUNC) (int32_t)
 
typedef double(* SOLVEFUNC) (CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, int32_t Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, float *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
int32_t MergeClusters (int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, double Confidence)
 
void MakeDimRandom (uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (int16_t N, CLUSTER *Cluster)
 
bool Independent (PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
 
uint16_t OptimumNumberOfBuckets (uint32_t SampleCount)
 
double ComputeChiSquared (uint16_t DegreesOfFreedom, double Alpha)
 
double NormalDensity (int32_t x)
 
double UniformDensity (int32_t x)
 
double Integral (double f1, double f2, double Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, float Mean, float StdDev)
 
uint16_t NormalBucket (PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
 
uint16_t UniformBucket (PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
 
bool DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uint16_t DegreesOfFreedom (DISTRIBUTION Distribution, uint16_t HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uint32_t NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uint16_t DegreesOfFreedom, double Alpha)
 
double Solve (SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
 
double ChiArea (CHISTRUCT *ChiParams, double x)
 
bool MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
float Mean (PROTOTYPE *Proto, uint16_t Dimension)
 
float StandardDeviation (PROTOTYPE *Proto, uint16_t Dimension)
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

◆ Abs

#define Abs (   N)    (((N) < 0) ? (-(N)) : (N))

Definition at line 209 of file cluster.cpp.

◆ BUCKETTABLESIZE

#define BUCKETTABLESIZE   1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 161 of file cluster.cpp.

◆ CHIACCURACY

#define CHIACCURACY   0.01

◆ DELTARATIO

#define DELTARATIO   0.1

◆ FTABLE_X

#define FTABLE_X   10

Definition at line 32 of file cluster.cpp.

◆ FTABLE_Y

#define FTABLE_Y   100

Definition at line 33 of file cluster.cpp.

◆ HOTELLING

#define HOTELLING   1

Definition at line 31 of file cluster.cpp.

◆ ILLEGAL_CHAR

#define ILLEGAL_CHAR   2

◆ INITIALDELTA

#define INITIALDELTA   0.1

◆ LOOKUPTABLESIZE

#define LOOKUPTABLESIZE   8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 229 of file cluster.cpp.

◆ MAXDEGREESOFFREEDOM

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 230 of file cluster.cpp.

◆ MAXDISTANCE

#define MAXDISTANCE   FLT_MAX

◆ MAXNEIGHBORS

#define MAXNEIGHBORS   2

◆ MINALPHA

#define MINALPHA   (1e-200)

◆ MINSAMPLES

#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 152 of file cluster.cpp.

◆ MINSAMPLESNEEDED

#define MINSAMPLESNEEDED   1

Definition at line 153 of file cluster.cpp.

◆ MINSAMPLESPERBUCKET

#define MINSAMPLESPERBUCKET   5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 151 of file cluster.cpp.

◆ MINVARIANCE

#define MINVARIANCE   0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 143 of file cluster.cpp.

◆ Mirror

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 208 of file cluster.cpp.

◆ NORMALEXTENT

#define NORMALEXTENT   3.0

Definition at line 162 of file cluster.cpp.

◆ Odd

#define Odd (   N)    ((N)%2)

Definition at line 207 of file cluster.cpp.

◆ SqrtOf2Pi

#define SqrtOf2Pi   2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 219 of file cluster.cpp.

Typedef Documentation

◆ ClusterHeap

Definition at line 170 of file cluster.cpp.

◆ ClusterPair

Definition at line 169 of file cluster.cpp.

◆ DENSITYFUNC

typedef double(* DENSITYFUNC) (int32_t)

Definition at line 204 of file cluster.cpp.

◆ SOLVEFUNC

typedef double(* SOLVEFUNC) (CHISTRUCT *, double)

Definition at line 205 of file cluster.cpp.

Function Documentation

◆ AdjustBuckets()

void AdjustBuckets ( BUCKETS Buckets,
uint32_t  NewSampleCount 
)

This routine multiplies each ExpectedCount histogram entry by NewSampleCount/OldSampleCount so that the histogram is now adjusted to the new sample count.

Parameters
Bucketshistogram data structure to adjust
NewSampleCountnew sample count to adjust to
Returns
none

Definition at line 2162 of file cluster.cpp.

2162  {
2163  int i;
2164  double AdjustFactor;
2165 
2166  AdjustFactor = (((double) NewSampleCount) /
2167  ((double) Buckets->SampleCount));
2168 
2169  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2170  Buckets->ExpectedCount[i] *= AdjustFactor;
2171  }
2172 
2173  Buckets->SampleCount = NewSampleCount;
2174 
2175 } // AdjustBuckets
uint32_t SampleCount
Definition: cluster.cpp:181
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
float * ExpectedCount
Definition: cluster.cpp:187

◆ AlphaMatch()

int AlphaMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of structures which hold pre-computed chi-squared values for a chi-squared value whose corresponding alpha field matches the alpha field of SearchKey.

It is called by the list search routines.

Parameters
arg1chi-squared struct being tested for a match
arg2chi-squared struct that is the search key
Returns
TRUE if ChiStruct's Alpha matches SearchKey's Alpha

Definition at line 2204 of file cluster.cpp.

2205  { //CHISTRUCT *SearchKey)
2206  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2207  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2208 
2209  return (ChiStruct->Alpha == SearchKey->Alpha);
2210 
2211 } // AlphaMatch
double Alpha
Definition: cluster.cpp:192

◆ ChiArea()

double ChiArea ( CHISTRUCT ChiParams,
double  x 
)

This routine computes the area under a chi density curve from 0 to x, minus the desired area under the curve. The number of degrees of freedom of the chi curve is specified in the ChiParams structure. The desired area is also specified in the ChiParams structure as Alpha (or 1 minus the desired area). This routine is intended to be passed to the Solve() function to find the value of chi-squared which will yield a desired area under the right tail of the chi density curve. The function will only work for even degrees of freedom. The equations are based on integrating the chi density curve in parts to obtain a series that can be used to compute the area under the curve.

Parameters
ChiParamscontains degrees of freedom and alpha
xvalue of chi-squared to evaluate
Returns
Error between actual and desired area under the chi curve.

Definition at line 2310 of file cluster.cpp.

2310  {
2311  int i, N;
2312  double SeriesTotal;
2313  double Denominator;
2314  double PowerOfx;
2315 
2316  N = ChiParams->DegreesOfFreedom / 2 - 1;
2317  SeriesTotal = 1;
2318  Denominator = 1;
2319  PowerOfx = 1;
2320  for (i = 1; i <= N; i++) {
2321  Denominator *= 2 * i;
2322  PowerOfx *= x;
2323  SeriesTotal += PowerOfx / Denominator;
2324  }
2325  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2326 
2327 } // ChiArea
double Alpha
Definition: cluster.cpp:192
uint16_t DegreesOfFreedom
Definition: cluster.cpp:191

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes

Definition at line 506 of file cluster.cpp.

506  {
507  //only create cluster tree if samples have never been clustered before
508  if (Clusterer->Root == nullptr)
509  CreateClusterTree(Clusterer);
510 
511  //deallocate the old prototype list if one exists
512  FreeProtoList (&Clusterer->ProtoList);
513  Clusterer->ProtoList = NIL_LIST;
514 
515  //compute prototypes starting at the root node in the tree
516  ComputePrototypes(Clusterer, Config);
517  // We don't need the cluster pointers in the protos any more, so null them
518  // out, which makes it safe to delete the clusterer.
519  LIST proto_list = Clusterer->ProtoList;
520  iterate(proto_list) {
521  PROTOTYPE *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
522  proto->Cluster = nullptr;
523  }
524  return Clusterer->ProtoList;
525 } // ClusterSamples
CLUSTERCONFIG Config
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:894
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:678
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
LIST ProtoList
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:91
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
CLUSTER * Cluster
Definition: cluster.h:76
#define iterate(l)
Definition: oldlist.h:161

◆ ComputeChiSquared()

double ComputeChiSquared ( uint16_t  DegreesOfFreedom,
double  Alpha 
)

This routine computes the chi-squared value which will leave a cumulative probability of Alpha in the right tail of a chi-squared distribution with the specified number of degrees of freedom. Alpha must be between 0 and 1. DegreesOfFreedom must be even. The routine maintains an array of lists. Each list corresponds to a different number of degrees of freedom. Each entry in the list corresponds to a different alpha value and its corresponding chi-squared value. Therefore, once a particular chi-squared value is computed, it is stored in the list and never needs to be computed again.

Parameters
DegreesOfFreedomdetermines shape of distribution
Alphaprobability of right tail
Returns
Desired chi-squared value

Definition at line 1799 of file cluster.cpp.

1802 {
1803  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1804 
1805  CHISTRUCT *OldChiSquared;
1806  CHISTRUCT SearchKey;
1807 
1808  // limit the minimum alpha that can be used - if alpha is too small
1809  // it may not be possible to compute chi-squared.
1810  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1811  if (Odd (DegreesOfFreedom))
1812  DegreesOfFreedom++;
1813 
1814  /* find the list of chi-squared values which have already been computed
1815  for the specified number of degrees of freedom. Search the list for
1816  the desired chi-squared. */
1817  SearchKey.Alpha = Alpha;
1818  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1819  &SearchKey, AlphaMatch));
1820 
1821  if (OldChiSquared == nullptr) {
1822  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1823  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1824  (double) DegreesOfFreedom,
1825  (double) CHIACCURACY);
1826  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1827  OldChiSquared);
1828  }
1829  else {
1830  // further optimization might move OldChiSquared to front of list
1831  }
1832 
1833  return (OldChiSquared->ChiSquared);
1834 
1835 } // ComputeChiSquared
#define CHIACCURACY
double Alpha
Definition: cluster.cpp:192
double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
Definition: cluster.cpp:2246
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
#define Odd(N)
Definition: cluster.cpp:207
CHISTRUCT * NewChiStruct(uint16_t DegreesOfFreedom, double Alpha)
Definition: cluster.cpp:2222
double ChiArea(CHISTRUCT *ChiParams, double x)
Definition: cluster.cpp:2310
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:230
double ChiSquared
Definition: cluster.cpp:193
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
#define first_node(l)
Definition: oldlist.h:141
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2204
uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets)
Definition: cluster.cpp:2113
#define MINALPHA
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:111

◆ ComputePrototypes()

void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Returns
None

Definition at line 894 of file cluster.cpp.

894  {
895  LIST ClusterStack = NIL_LIST;
896  CLUSTER *Cluster;
897  PROTOTYPE *Prototype;
898 
899  // use a stack to keep track of clusters waiting to be processed
900  // initially the only cluster on the stack is the root cluster
901  if (Clusterer->Root != nullptr)
902  ClusterStack = push (NIL_LIST, Clusterer->Root);
903 
904  // loop until we have analyzed all clusters which are potential prototypes
905  while (ClusterStack != NIL_LIST) {
906  // remove the next cluster to be analyzed from the stack
907  // try to make a prototype from the cluster
908  // if successful, put it on the proto list, else split the cluster
909  Cluster = (CLUSTER *) first_node (ClusterStack);
910  ClusterStack = pop (ClusterStack);
911  Prototype = MakePrototype(Clusterer, Config, Cluster);
912  if (Prototype != nullptr) {
913  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
914  }
915  else {
916  ClusterStack = push (ClusterStack, Cluster->Right);
917  ClusterStack = push (ClusterStack, Cluster->Left);
918  }
919  }
920 } // ComputePrototypes
CLUSTERCONFIG Config
Definition: cluster.h:32
struct sample * Left
Definition: cluster.h:36
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
LIST pop(LIST list)
Definition: oldlist.cpp:266
LIST ProtoList
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:91
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:937
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
struct sample * Right
Definition: cluster.h:37

◆ ComputeStatistics()

STATISTICS * ComputeStatistics ( int16_t  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions.

Parameters
Nnumber of dimensions
ParamDescarray of dimension descriptions
Clustercluster whose stats are to be computed
Returns
Pointer to new data structure containing statistics

Definition at line 1360 of file cluster.cpp.

1360  {
1361  STATISTICS *Statistics;
1362  int i, j;
1363  float *CoVariance;
1364  float *Distance;
1365  LIST SearchState;
1366  SAMPLE *Sample;
1367  uint32_t SampleCountAdjustedForBias;
1368 
1369  // allocate memory to hold the statistics results
1370  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1371  Statistics->CoVariance = (float *)Emalloc(sizeof(float) * N * N);
1372  Statistics->Min = (float *) Emalloc (N * sizeof (float));
1373  Statistics->Max = (float *) Emalloc (N * sizeof (float));
1374 
1375  // allocate temporary memory to hold the sample to mean distances
1376  Distance = (float *) Emalloc (N * sizeof (float));
1377 
1378  // initialize the statistics
1379  Statistics->AvgVariance = 1.0;
1380  CoVariance = Statistics->CoVariance;
1381  for (i = 0; i < N; i++) {
1382  Statistics->Min[i] = 0.0;
1383  Statistics->Max[i] = 0.0;
1384  for (j = 0; j < N; j++, CoVariance++)
1385  *CoVariance = 0;
1386  }
1387  // find each sample in the cluster and merge it into the statistics
1388  InitSampleSearch(SearchState, Cluster);
1389  while ((Sample = NextSample (&SearchState)) != nullptr) {
1390  for (i = 0; i < N; i++) {
1391  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1392  if (ParamDesc[i].Circular) {
1393  if (Distance[i] > ParamDesc[i].HalfRange)
1394  Distance[i] -= ParamDesc[i].Range;
1395  if (Distance[i] < -ParamDesc[i].HalfRange)
1396  Distance[i] += ParamDesc[i].Range;
1397  }
1398  if (Distance[i] < Statistics->Min[i])
1399  Statistics->Min[i] = Distance[i];
1400  if (Distance[i] > Statistics->Max[i])
1401  Statistics->Max[i] = Distance[i];
1402  }
1403  CoVariance = Statistics->CoVariance;
1404  for (i = 0; i < N; i++)
1405  for (j = 0; j < N; j++, CoVariance++)
1406  *CoVariance += Distance[i] * Distance[j];
1407  }
1408  // normalize the variances by the total number of samples
1409  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1410  // also compute the geometic mean of the diagonal variances
1411  // ensure that clusters with only 1 sample are handled correctly
1412  if (Cluster->SampleCount > 1)
1413  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1414  else
1415  SampleCountAdjustedForBias = 1;
1416  CoVariance = Statistics->CoVariance;
1417  for (i = 0; i < N; i++)
1418  for (j = 0; j < N; j++, CoVariance++) {
1419  *CoVariance /= SampleCountAdjustedForBias;
1420  if (j == i) {
1421  if (*CoVariance < MINVARIANCE)
1422  *CoVariance = MINVARIANCE;
1423  Statistics->AvgVariance *= *CoVariance;
1424  }
1425  }
1426  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1427  1.0 / N);
1428 
1429  // release temporary memory and return
1430  free(Distance);
1431  return (Statistics);
1432 } // ComputeStatistics
float * Min
Definition: cluster.cpp:175
Definition: cluster.h:32
void * Emalloc(int Size)
Definition: emalloc.cpp:31
float * Max
Definition: cluster.cpp:176
unsigned SampleCount
Definition: cluster.h:35
#define MINVARIANCE
Definition: cluster.cpp:143
float Mean[1]
Definition: cluster.h:39
float Range
Definition: ocrfeatures.h:48
float * CoVariance
Definition: cluster.cpp:174
#define InitSampleSearch(S, C)
Definition: cluster.h:105
float AvgVariance
Definition: cluster.cpp:173
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:606

◆ CreateClusterTree()

void CreateClusterTree ( CLUSTERER Clusterer)

This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples.

Parameters
Clustererdata structure holdings samples to be clustered
Returns
None (the Clusterer data structure is changed)

Definition at line 678 of file cluster.cpp.

678  {
679  ClusteringContext context;
680  ClusterPair HeapEntry;
681  TEMPCLUSTER *PotentialCluster;
682 
683  // each sample and its nearest neighbor form a "potential" cluster
684  // save these in a heap with the "best" potential clusters on top
685  context.tree = Clusterer->KDTree;
686  context.candidates = (TEMPCLUSTER *)
687  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
688  context.next = 0;
689  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
690  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
691 
692  // form potential clusters into actual clusters - always do "best" first
693  while (context.heap->Pop(&HeapEntry)) {
694  PotentialCluster = HeapEntry.data;
695 
696  // if main cluster of potential cluster is already in another cluster
697  // then we don't need to worry about it
698  if (PotentialCluster->Cluster->Clustered) {
699  continue;
700  }
701 
702  // if main cluster is not yet clustered, but its nearest neighbor is
703  // then we must find a new nearest neighbor
704  else if (PotentialCluster->Neighbor->Clustered) {
705  PotentialCluster->Neighbor =
706  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
707  &HeapEntry.key);
708  if (PotentialCluster->Neighbor != nullptr) {
709  context.heap->Push(&HeapEntry);
710  }
711  }
712 
713  // if neither cluster is already clustered, form permanent cluster
714  else {
715  PotentialCluster->Cluster =
716  MakeNewCluster(Clusterer, PotentialCluster);
717  PotentialCluster->Neighbor =
718  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
719  &HeapEntry.key);
720  if (PotentialCluster->Neighbor != nullptr) {
721  context.heap->Push(&HeapEntry);
722  }
723  }
724  }
725 
726  // the root node in the cluster tree is now the only node in the kd-tree
727  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
728 
729  // free up the memory used by the K-D tree, heap, and temp clusters
730  FreeKDTree(context.tree);
731  Clusterer->KDTree = nullptr;
732  delete context.heap;
733  free(context.candidates);
734 } // CreateClusterTree
ClusterHeap * heap
Definition: cluster.cpp:198
Definition: cluster.h:32
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:333
void * Emalloc(int Size)
Definition: emalloc.cpp:31
KDTREE * KDTree
Definition: cluster.h:90
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level)
Definition: cluster.cpp:745
#define RootOf(T)
Definition: kdtree.h:56
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:810
CLUSTER * Root
Definition: cluster.h:91
void Push(Pair *entry)
Definition: genericheap.h:95
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:170
TEMPCLUSTER * candidates
Definition: cluster.cpp:199
unsigned Clustered
Definition: cluster.h:33
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
Definition: cluster.cpp:775
void(* void_proc)(...)
Definition: cutil.h:30
int32_t NumberOfSamples
Definition: cluster.h:89
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:316
bool Pop(Pair *entry)
Definition: genericheap.h:118
CLUSTER * Cluster
Definition: cluster.cpp:165
CLUSTER * Neighbor
Definition: cluster.cpp:166

◆ DegreesOfFreedom()

uint16_t DegreesOfFreedom ( DISTRIBUTION  Distribution,
uint16_t  HistogramBuckets 
)

This routine computes the degrees of freedom that should be used in a chi-squared test with the specified number of histogram buckets. The result is always rounded up to the next even number so that the value of chi-squared can be computed more easily. This will cause the value of chi-squared to be higher than the optimum value, resulting in the chi-square test being more lenient than optimum.

Parameters
Distributiondistribution being tested for
HistogramBucketsnumber of buckets in chi-square test
Returns
The number of degrees of freedom for a chi-square test

Definition at line 2113 of file cluster.cpp.

2113  {
2114  static uint8_t DegreeOffsets[] = { 3, 3, 1 };
2115 
2116  uint16_t AdjustedNumBuckets;
2117 
2118  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2119  if (Odd (AdjustedNumBuckets))
2120  AdjustedNumBuckets++;
2121  return (AdjustedNumBuckets);
2122 
2123 } // DegreesOfFreedom
#define Odd(N)
Definition: cluster.cpp:207

◆ DistributionOK()

bool DistributionOK ( BUCKETS Buckets)

This routine performs a chi-square goodness of fit test on the histogram data in the Buckets data structure. TRUE is returned if the histogram matches the probability distribution which was specified when the Buckets structure was originally created. Otherwise FALSE is returned.

Parameters
Bucketshistogram data to perform chi-square test on
Returns
TRUE if samples match distribution, FALSE otherwise

Definition at line 2040 of file cluster.cpp.

2040  {
2041  float FrequencyDifference;
2042  float TotalDifference;
2043  int i;
2044 
2045  // compute how well the histogram matches the expected histogram
2046  TotalDifference = 0.0;
2047  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2048  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2049  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2050  Buckets->ExpectedCount[i];
2051  }
2052 
2053  // test to see if the difference is more than expected
2054  if (TotalDifference > Buckets->ChiSquared)
2055  return false;
2056  else
2057  return true;
2058 } // DistributionOK
double ChiSquared
Definition: cluster.cpp:183
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
uint32_t * Count
Definition: cluster.cpp:186
float * ExpectedCount
Definition: cluster.cpp:187

◆ FillBuckets()

void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uint16_t  Dim,
PARAM_DESC ParamDesc,
float  Mean,
float  StdDev 
)

This routine counts the number of cluster samples which fall within the various histogram buckets in Buckets. Only one dimension of each sample is examined. The exact meaning of the Mean and StdDev parameters depends on the distribution which is being analyzed (this info is in the Buckets data structure). For normal distributions, Mean and StdDev have the expected meanings. For uniform and random distributions the Mean is the center point of the range and the StdDev is 1/2 the range. A dimension with zero standard deviation cannot be statistically analyzed. In this case, a pseudo-analysis is used.

Parameters
Bucketshistogram buckets to count samples
Clustercluster whose samples are being analyzed
Dimdimension of samples which is being analyzed
ParamDescdescription of the dimension
Mean"mean" of the distribution
StdDev"standard deviation" of the distribution
Returns
None (the Buckets data structure is filled in)

Definition at line 1905 of file cluster.cpp.

1910  {
1911  uint16_t BucketID;
1912  int i;
1913  LIST SearchState;
1914  SAMPLE *Sample;
1915 
1916  // initialize the histogram bucket counts to 0
1917  for (i = 0; i < Buckets->NumberOfBuckets; i++)
1918  Buckets->Count[i] = 0;
1919 
1920  if (StdDev == 0.0) {
1921  /* if the standard deviation is zero, then we can't statistically
1922  analyze the cluster. Use a pseudo-analysis: samples exactly on
1923  the mean are distributed evenly across all buckets. Samples greater
1924  than the mean are placed in the last bucket; samples less than the
1925  mean are placed in the first bucket. */
1926 
1927  InitSampleSearch(SearchState, Cluster);
1928  i = 0;
1929  while ((Sample = NextSample (&SearchState)) != nullptr) {
1930  if (Sample->Mean[Dim] > Mean)
1931  BucketID = Buckets->NumberOfBuckets - 1;
1932  else if (Sample->Mean[Dim] < Mean)
1933  BucketID = 0;
1934  else
1935  BucketID = i;
1936  Buckets->Count[BucketID] += 1;
1937  i++;
1938  if (i >= Buckets->NumberOfBuckets)
1939  i = 0;
1940  }
1941  }
1942  else {
1943  // search for all samples in the cluster and add to histogram buckets
1944  InitSampleSearch(SearchState, Cluster);
1945  while ((Sample = NextSample (&SearchState)) != nullptr) {
1946  switch (Buckets->Distribution) {
1947  case normal:
1948  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
1949  Mean, StdDev);
1950  break;
1951  case D_random:
1952  case uniform:
1953  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
1954  Mean, StdDev);
1955  break;
1956  default:
1957  BucketID = 0;
1958  }
1959  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
1960  }
1961  }
1962 } // FillBuckets
Definition: cluster.h:32
uint16_t Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:185
DISTRIBUTION Distribution
Definition: cluster.cpp:180
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
uint32_t * Count
Definition: cluster.cpp:186
float Mean[1]
Definition: cluster.h:39
Definition: cluster.h:59
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:628
#define InitSampleSearch(S, C)
Definition: cluster.h:105
uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
Definition: cluster.cpp:2008
uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
Definition: cluster.cpp:1975
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:606

◆ FindNearestNeighbor()

CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
float *  Distance 
)

This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise nullptr is returned. The distance between the 2 nodes is placed in the specified variable.

Parameters
Treekd-tree to search in for nearest neighbor
Clustercluster whose nearest neighbor is to be found
Distanceptr to variable to report distance found
Returns
Pointer to the nearest neighbor of Cluster, or nullptr

Definition at line 775 of file cluster.cpp.

778 {
779  CLUSTER *Neighbor[MAXNEIGHBORS];
780  float Dist[MAXNEIGHBORS];
781  int NumberOfNeighbors;
782  int32_t i;
783  CLUSTER *BestNeighbor;
784 
785  // find the 2 nearest neighbors of the cluster
787  &NumberOfNeighbors, (void **)Neighbor, Dist);
788 
789  // search for the nearest neighbor that is not the cluster itself
790  *Distance = MAXDISTANCE;
791  BestNeighbor = nullptr;
792  for (i = 0; i < NumberOfNeighbors; i++) {
793  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
794  *Distance = Dist[i];
795  BestNeighbor = Neighbor[i];
796  }
797  }
798  return BestNeighbor;
799 } // FindNearestNeighbor
Definition: cluster.h:32
#define MAXDISTANCE
float Mean[1]
Definition: cluster.h:39
void KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance, int *NumberOfResults, void **NBuffer, float DBuffer[])
Definition: kdtree.cpp:306
#define MAXNEIGHBORS

◆ FreeBuckets()

void FreeBuckets ( BUCKETS buckets)

This routine properly frees the memory used by a BUCKETS.

Parameters
bucketspointer to data structure to be freed

Definition at line 2078 of file cluster.cpp.

2078  {
2079  Efree(buckets->Count);
2080  Efree(buckets->ExpectedCount);
2081  Efree(buckets);
2082 } // FreeBuckets
void Efree(void *ptr)
Definition: emalloc.cpp:45
uint32_t * Count
Definition: cluster.cpp:186
float * ExpectedCount
Definition: cluster.cpp:187

◆ FreeCluster()

void FreeCluster ( CLUSTER Cluster)

This routine frees the memory consumed by the specified cluster and all of its subclusters. This is done by recursive calls to FreeCluster().

Parameters
Clusterpointer to cluster to be freed
Returns
None

Definition at line 2093 of file cluster.cpp.

2093  {
2094  if (Cluster != nullptr) {
2095  FreeCluster (Cluster->Left);
2096  FreeCluster (Cluster->Right);
2097  free(Cluster);
2098  }
2099 } // FreeCluster
struct sample * Left
Definition: cluster.h:36
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2093
struct sample * Right
Definition: cluster.h:37

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to nullptr to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None

Definition at line 538 of file cluster.cpp.

538  {
539  if (Clusterer != nullptr) {
540  free(Clusterer->ParamDesc);
541  if (Clusterer->KDTree != nullptr)
542  FreeKDTree (Clusterer->KDTree);
543  if (Clusterer->Root != nullptr)
544  FreeCluster (Clusterer->Root);
545  // Free up all used buckets structures.
546  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
547  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
548  if (Clusterer->bucket_cache[d][c] != nullptr)
549  FreeBuckets(Clusterer->bucket_cache[d][c]);
550  }
551 
552  free(Clusterer);
553  }
554 } // FreeClusterer
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:333
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2078
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
#define MAXBUCKETS
Definition: cluster.h:27
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2093
CLUSTER * Root
Definition: cluster.h:91
#define MINBUCKETS
Definition: cluster.h:26

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None

Definition at line 563 of file cluster.cpp.

563  {
564  destroy_nodes(*ProtoList, FreePrototype);
565 } // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:575
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:186

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None

Definition at line 575 of file cluster.cpp.

575  { //PROTOTYPE *Prototype)
576  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
577 
578  // unmark the corresponding cluster (if there is one
579  if (Prototype->Cluster != nullptr)
580  Prototype->Cluster->Prototype = FALSE;
581 
582  // deallocate the prototype statistics and then the prototype itself
583  free(Prototype->Distrib);
584  free(Prototype->Mean);
585  if (Prototype->Style != spherical) {
586  free(Prototype->Variance.Elliptical);
587  free(Prototype->Magnitude.Elliptical);
588  free(Prototype->Weight.Elliptical);
589  }
590  free(Prototype);
591 } // FreePrototype
float * Mean
Definition: cluster.h:78
unsigned Prototype
Definition: cluster.h:34
float * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
#define FALSE
Definition: capi.h:52
FLOATUNION Magnitude
Definition: cluster.h:82
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Variance
Definition: cluster.h:81

◆ FreeStatistics()

void FreeStatistics ( STATISTICS Statistics)

This routine frees the memory used by the statistics data structure.

Parameters
Statisticspointer to data structure to be freed
Returns
None

Definition at line 2066 of file cluster.cpp.

2066  {
2067  free(Statistics->CoVariance);
2068  free(Statistics->Min);
2069  free(Statistics->Max);
2070  free(Statistics);
2071 } // FreeStatistics
float * Min
Definition: cluster.cpp:175
float * Max
Definition: cluster.cpp:176
float * CoVariance
Definition: cluster.cpp:174

◆ GetBuckets()

BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uint32_t  SampleCount,
double  Confidence 
)

This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket.

Parameters
clustererwhich keeps a bucket_cache for us.
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Bucket data structure

Definition at line 1624 of file cluster.cpp.

1627  {
1628  // Get an old bucket structure with the same number of buckets.
1629  uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1630  BUCKETS *Buckets =
1631  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1632 
1633  // If a matching bucket structure is not found, make one and save it.
1634  if (Buckets == nullptr) {
1635  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1636  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1637  Buckets;
1638  } else {
1639  // Just adjust the existing buckets.
1640  if (SampleCount != Buckets->SampleCount)
1641  AdjustBuckets(Buckets, SampleCount);
1642  if (Confidence != Buckets->Confidence) {
1643  Buckets->Confidence = Confidence;
1644  Buckets->ChiSquared = ComputeChiSquared(
1645  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1646  Confidence);
1647  }
1648  InitBuckets(Buckets);
1649  }
1650  return Buckets;
1651 } // GetBuckets
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
Definition: cluster.cpp:1669
double ChiSquared
Definition: cluster.cpp:183
uint32_t SampleCount
Definition: cluster.cpp:181
uint16_t OptimumNumberOfBuckets(uint32_t SampleCount)
Definition: cluster.cpp:1764
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2183
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount)
Definition: cluster.cpp:2162
double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
Definition: cluster.cpp:1799
#define MINBUCKETS
Definition: cluster.h:26
uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets)
Definition: cluster.cpp:2113
double Confidence
Definition: cluster.cpp:182

◆ Independent()

bool Independent ( PARAM_DESC ParamDesc,
int16_t  N,
float *  CoVariance,
float  Independence 
)

This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true).

Parameters
ParamDescdescriptions of each feature space dimension
Nnumber of dimensions
CoVarianceptr to a covariance matrix
Independencemax off-diagonal correlation coefficient
Returns
TRUE if dimensions are independent, FALSE otherwise

Definition at line 1579 of file cluster.cpp.

1580  {
1581  int i, j;
1582  float *VARii; // points to ith on-diagonal element
1583  float *VARjj; // points to jth on-diagonal element
1584  float CorrelationCoeff;
1585 
1586  VARii = CoVariance;
1587  for (i = 0; i < N; i++, VARii += N + 1) {
1588  if (ParamDesc[i].NonEssential)
1589  continue;
1590 
1591  VARjj = VARii + N + 1;
1592  CoVariance = VARii + 1;
1593  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1594  if (ParamDesc[j].NonEssential)
1595  continue;
1596 
1597  if ((*VARii == 0.0) || (*VARjj == 0.0))
1598  CorrelationCoeff = 0.0;
1599  else
1600  CorrelationCoeff =
1601  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1602  if (CorrelationCoeff > Independence)
1603  return false;
1604  }
1605  }
1606  return true;
1607 } // Independent

◆ InitBuckets()

void InitBuckets ( BUCKETS Buckets)

This routine sets the bucket counts in the specified histogram to zero.

Parameters
Bucketshistogram data structure to init
Returns
none

Definition at line 2183 of file cluster.cpp.

2183  {
2184  int i;
2185 
2186  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2187  Buckets->Count[i] = 0;
2188  }
2189 
2190 } // InitBuckets
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
uint32_t * Count
Definition: cluster.cpp:186

◆ Integral()

double Integral ( double  f1,
double  f2,
double  Dx 
)

This routine computes a trapezoidal approximation to the integral of a function over a small delta in x.

Parameters
f1value of function at x1
f2value of function at x2
Dxx2 - x1 (should always be positive)
Returns
Approximation of the integral of the function from x1 to x2.

Definition at line 1881 of file cluster.cpp.

1881  {
1882  return (f1 + f2) * Dx / 2.0;
1883 } // Integral

◆ InvertMatrix()

double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Compute the inverse of a matrix using LU decomposition with partial pivoting. The return value is the sum of norms of the off-diagonal terms of the product of a and inv. (A measure of the error.)

Definition at line 2409 of file cluster.cpp.

2409  {
2410  // Allocate memory for the 2D arrays.
2411  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2412  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2413  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2414 
2415  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2416  int row;
2417  int col;
2418  for (row = 0; row < size; row++) {
2419  for (col = 0; col < size; col++) {
2420  U[row][col] = input[row*size + col];
2421  L[row][col] = row == col ? 1.0 : 0.0;
2422  U_inv[row][col] = 0.0;
2423  }
2424  }
2425 
2426  // Compute forward matrix by inversion by LU decomposition of input.
2427  for (col = 0; col < size; ++col) {
2428  // Find best pivot
2429  int best_row = 0;
2430  double best_pivot = -1.0;
2431  for (row = col; row < size; ++row) {
2432  if (Abs(U[row][col]) > best_pivot) {
2433  best_pivot = Abs(U[row][col]);
2434  best_row = row;
2435  }
2436  }
2437  // Exchange pivot rows.
2438  if (best_row != col) {
2439  for (int k = 0; k < size; ++k) {
2440  double tmp = U[best_row][k];
2441  U[best_row][k] = U[col][k];
2442  U[col][k] = tmp;
2443  tmp = L[best_row][k];
2444  L[best_row][k] = L[col][k];
2445  L[col][k] = tmp;
2446  }
2447  }
2448  // Now do the pivot itself.
2449  for (row = col + 1; row < size; ++row) {
2450  double ratio = -U[row][col] / U[col][col];
2451  for (int j = col; j < size; ++j) {
2452  U[row][j] += U[col][j] * ratio;
2453  }
2454  for (int k = 0; k < size; ++k) {
2455  L[row][k] += L[col][k] * ratio;
2456  }
2457  }
2458  }
2459  // Next invert U.
2460  for (col = 0; col < size; ++col) {
2461  U_inv[col][col] = 1.0 / U[col][col];
2462  for (row = col - 1; row >= 0; --row) {
2463  double total = 0.0;
2464  for (int k = col; k > row; --k) {
2465  total += U[row][k] * U_inv[k][col];
2466  }
2467  U_inv[row][col] = -total / U[row][row];
2468  }
2469  }
2470  // Now the answer is U_inv.L.
2471  for (row = 0; row < size; row++) {
2472  for (col = 0; col < size; col++) {
2473  double sum = 0.0;
2474  for (int k = row; k < size; ++k) {
2475  sum += U_inv[row][k] * L[k][col];
2476  }
2477  inv[row*size + col] = sum;
2478  }
2479  }
2480  // Check matrix product.
2481  double error_sum = 0.0;
2482  for (row = 0; row < size; row++) {
2483  for (col = 0; col < size; col++) {
2484  double sum = 0.0;
2485  for (int k = 0; k < size; ++k) {
2486  sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
2487  }
2488  if (row != col) {
2489  error_sum += Abs(sum);
2490  }
2491  }
2492  }
2493  return error_sum;
2494 }
#define Abs(N)
Definition: cluster.cpp:209

◆ ListEntryMatch()

int ListEntryMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list for a list node whose contents match Key. It is called by the list delete_d routine.

Returns
TRUE if ListNode matches Key

Definition at line 2148 of file cluster.cpp.

2149  { //Key
2150  return (arg1 == arg2);
2151 
2152 } // ListEntryMatch

◆ MakeBuckets()

BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uint32_t  SampleCount,
double  Confidence 
)

This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket.

Parameters
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Pointer to new histogram data structure

Definition at line 1669 of file cluster.cpp.

1671  {
1672  const DENSITYFUNC DensityFunction[] =
1674  int i, j;
1675  BUCKETS *Buckets;
1676  double BucketProbability;
1677  double NextBucketBoundary;
1678  double Probability;
1679  double ProbabilityDelta;
1680  double LastProbDensity;
1681  double ProbDensity;
1682  uint16_t CurrentBucket;
1683  bool Symmetrical;
1684 
1685  // allocate memory needed for data structure
1686  Buckets = static_cast<BUCKETS *>(Emalloc(sizeof(BUCKETS)));
1687  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1688  Buckets->SampleCount = SampleCount;
1689  Buckets->Confidence = Confidence;
1690  Buckets->Count =
1691  static_cast<uint32_t *>(Emalloc(Buckets->NumberOfBuckets * sizeof(uint32_t)));
1692  Buckets->ExpectedCount = static_cast<float *>(
1693  Emalloc(Buckets->NumberOfBuckets * sizeof(float)));
1694 
1695  // initialize simple fields
1696  Buckets->Distribution = Distribution;
1697  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1698  Buckets->Count[i] = 0;
1699  Buckets->ExpectedCount[i] = 0.0;
1700  }
1701 
1702  // all currently defined distributions are symmetrical
1703  Symmetrical = true;
1704  Buckets->ChiSquared = ComputeChiSquared(
1705  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1706 
1707  if (Symmetrical) {
1708  // allocate buckets so that all have approx. equal probability
1709  BucketProbability = 1.0 / (double) (Buckets->NumberOfBuckets);
1710 
1711  // distribution is symmetric so fill in upper half then copy
1712  CurrentBucket = Buckets->NumberOfBuckets / 2;
1713  if (Odd (Buckets->NumberOfBuckets))
1714  NextBucketBoundary = BucketProbability / 2;
1715  else
1716  NextBucketBoundary = BucketProbability;
1717 
1718  Probability = 0.0;
1719  LastProbDensity =
1720  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1721  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1722  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1723  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1724  Probability += ProbabilityDelta;
1725  if (Probability > NextBucketBoundary) {
1726  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1727  CurrentBucket++;
1728  NextBucketBoundary += BucketProbability;
1729  }
1730  Buckets->Bucket[i] = CurrentBucket;
1731  Buckets->ExpectedCount[CurrentBucket] +=
1732  (float) (ProbabilityDelta * SampleCount);
1733  LastProbDensity = ProbDensity;
1734  }
1735  // place any leftover probability into the last bucket
1736  Buckets->ExpectedCount[CurrentBucket] +=
1737  (float) ((0.5 - Probability) * SampleCount);
1738 
1739  // copy upper half of distribution to lower half
1740  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1741  Buckets->Bucket[i] =
1742  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1743 
1744  // copy upper half of expected counts to lower half
1745  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1746  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1747  }
1748  return Buckets;
1749 } // MakeBuckets
double Integral(double f1, double f2, double Dx)
Definition: cluster.cpp:1881
double ChiSquared
Definition: cluster.cpp:183
uint16_t Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:185
uint32_t SampleCount
Definition: cluster.cpp:181
void * Emalloc(int Size)
Definition: emalloc.cpp:31
uint16_t OptimumNumberOfBuckets(uint32_t SampleCount)
Definition: cluster.cpp:1764
DISTRIBUTION Distribution
Definition: cluster.cpp:180
#define BUCKETTABLESIZE
Definition: cluster.cpp:161
#define Odd(N)
Definition: cluster.cpp:207
uint16_t NumberOfBuckets
Definition: cluster.cpp:184
uint32_t * Count
Definition: cluster.cpp:186
double(* DENSITYFUNC)(int32_t)
Definition: cluster.cpp:204
double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
Definition: cluster.cpp:1799
double UniformDensity(int32_t x)
Definition: cluster.cpp:1864
#define Mirror(N, R)
Definition: cluster.cpp:208
float * ExpectedCount
Definition: cluster.cpp:187
uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets)
Definition: cluster.cpp:2113
double NormalDensity(int32_t x)
Definition: cluster.cpp:1850
double Confidence
Definition: cluster.cpp:182

◆ MakeClusterer()

CLUSTERER* MakeClusterer ( int16_t  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure

Definition at line 399 of file cluster.cpp.

399  {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = nullptr;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = nullptr;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
float MidRange
Definition: ocrfeatures.h:50
int8_t Circular
Definition: ocrfeatures.h:44
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:181
float HalfRange
Definition: ocrfeatures.h:49
float Min
Definition: ocrfeatures.h:46
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void * Emalloc(int Size)
Definition: emalloc.cpp:31
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
#define MAXBUCKETS
Definition: cluster.h:27
float Range
Definition: ocrfeatures.h:48
LIST ProtoList
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:91
int8_t NonEssential
Definition: ocrfeatures.h:45
int32_t NumChar
Definition: cluster.h:93
#define NIL_LIST
Definition: oldlist.h:127
#define MINBUCKETS
Definition: cluster.h:26
int32_t NumberOfSamples
Definition: cluster.h:89
float Max
Definition: ocrfeatures.h:47
int16_t SampleSize
Definition: cluster.h:87

◆ MakeDegenerateProto()

PROTOTYPE * MakeDegenerateProto ( uint16_t  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
int32_t  MinSamples 
)

This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it.

If the cluster is not degenerate, nullptr is returned.

Parameters
Nnumber of dimensions
Clustercluster being analyzed
Statisticsstatistical info about cluster
Styletype of prototype to be generated
MinSamplesminimum number of samples in a cluster
Returns
Pointer to degenerate prototype or nullptr.

Definition at line 1027 of file cluster.cpp.

1032  {
1033  PROTOTYPE *Proto = nullptr;
1034 
1035  if (MinSamples < MINSAMPLESNEEDED)
1036  MinSamples = MINSAMPLESNEEDED;
1037 
1038  if (Cluster->SampleCount < MinSamples) {
1039  switch (Style) {
1040  case spherical:
1041  Proto = NewSphericalProto (N, Cluster, Statistics);
1042  break;
1043  case elliptical:
1044  case automatic:
1045  Proto = NewEllipticalProto (N, Cluster, Statistics);
1046  break;
1047  case mixed:
1048  Proto = NewMixedProto (N, Cluster, Statistics);
1049  break;
1050  }
1051  Proto->Significant = FALSE;
1052  }
1053  return (Proto);
1054 } // MakeDegenerateProto
PROTOTYPE * NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1445
#define MINSAMPLESNEEDED
Definition: cluster.cpp:153
unsigned SampleCount
Definition: cluster.h:35
#define FALSE
Definition: capi.h:52
unsigned Significant
Definition: cluster.h:68
PROTOTYPE * NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1476
PROTOTYPE * NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1518
Definition: cluster.h:45

◆ MakeDimRandom()

void MakeDimRandom ( uint16_t  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

This routine alters the ith dimension of the specified mixed prototype to be D_random.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
ParamDescdescription of specified dimension
Returns
None

Definition at line 1304 of file cluster.cpp.

1304  {
1305  Proto->Distrib[i] = D_random;
1306  Proto->Mean[i] = ParamDesc->MidRange;
1307  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1308 
1309  // subtract out the previous magnitude of this dimension from the total
1310  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1311  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1312  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1313  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1314 
1315  // note that the proto Weight is irrelevant for D_random protos
1316 } // MakeDimRandom
float MidRange
Definition: ocrfeatures.h:50
float * Mean
Definition: cluster.h:78
float HalfRange
Definition: ocrfeatures.h:49
float TotalMagnitude
Definition: cluster.h:79
float * Elliptical
Definition: cluster.h:64
DISTRIBUTION * Distrib
Definition: cluster.h:77
float Range
Definition: ocrfeatures.h:48
FLOATUNION Magnitude
Definition: cluster.h:82
float LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81

◆ MakeDimUniform()

void MakeDimUniform ( uint16_t  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

This routine alters the ith dimension of the specified mixed prototype to be uniform.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
Statisticsstatistical info about prototype
Returns
None

Definition at line 1326 of file cluster.cpp.

1326  {
1327  Proto->Distrib[i] = uniform;
1328  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1329  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1330  Proto->Variance.Elliptical[i] =
1331  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1332  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1333  Proto->Variance.Elliptical[i] = MINVARIANCE;
1334 
1335  // subtract out the previous magnitude of this dimension from the total
1336  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1337  Proto->Magnitude.Elliptical[i] =
1338  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1339  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1340  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1341 
1342  // note that the proto Weight is irrelevant for uniform protos
1343 } // MakeDimUniform
float * Min
Definition: cluster.cpp:175
float * Mean
Definition: cluster.h:78
float TotalMagnitude
Definition: cluster.h:79
float * Elliptical
Definition: cluster.h:64
float * Max
Definition: cluster.cpp:176
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define MINVARIANCE
Definition: cluster.cpp:143
float Mean[1]
Definition: cluster.h:39
FLOATUNION Magnitude
Definition: cluster.h:82
CLUSTER * Cluster
Definition: cluster.h:76
float LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81

◆ MakeEllipticalProto()

PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then nullptr is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new elliptical prototype or nullptr.

Definition at line 1205 of file cluster.cpp.

1208  {
1209  PROTOTYPE *Proto = nullptr;
1210  int i;
1211 
1212  // check that each dimension is a normal distribution
1213  for (i = 0; i < Clusterer->SampleSize; i++) {
1214  if (Clusterer->ParamDesc[i].NonEssential)
1215  continue;
1216 
1217  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1218  Cluster->Mean[i],
1219  sqrt ((double) Statistics->
1220  CoVariance[i * (Clusterer->SampleSize + 1)]));
1221  if (!DistributionOK (Buckets))
1222  break;
1223  }
1224  // if all dimensions matched a normal distribution, make a proto
1225  if (i >= Clusterer->SampleSize)
1226  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1227  return (Proto);
1228 } // MakeEllipticalProto
PARAM_DESC * ParamDesc
Definition: cluster.h:88
float Mean[1]
Definition: cluster.h:39
PROTOTYPE * NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1476
int8_t NonEssential
Definition: ocrfeatures.h:45
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, float Mean, float StdDev)
Definition: cluster.cpp:1905
int16_t SampleSize
Definition: cluster.h:87
bool DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2040

◆ MakeMixedProto()

PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
double  Confidence 
)

This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then nullptr is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a prototype
Statisticsstatistical info about cluster
NormalBucketshistogram struct used to analyze distribution
Confidenceconfidence level for alternate distributions
Returns
Pointer to new mixed prototype or nullptr.

Definition at line 1245 of file cluster.cpp.

1249  {
1250  PROTOTYPE *Proto;
1251  int i;
1252  BUCKETS *UniformBuckets = nullptr;
1253  BUCKETS *RandomBuckets = nullptr;
1254 
1255  // create a mixed proto to work on - initially assume all dimensions normal*/
1256  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1257 
1258  // find the proper distribution for each dimension
1259  for (i = 0; i < Clusterer->SampleSize; i++) {
1260  if (Clusterer->ParamDesc[i].NonEssential)
1261  continue;
1262 
1263  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1264  Proto->Mean[i],
1265  sqrt ((double) Proto->Variance.Elliptical[i]));
1266  if (DistributionOK (NormalBuckets))
1267  continue;
1268 
1269  if (RandomBuckets == nullptr)
1270  RandomBuckets =
1271  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1272  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1273  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1274  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1275  if (DistributionOK (RandomBuckets))
1276  continue;
1277 
1278  if (UniformBuckets == nullptr)
1279  UniformBuckets =
1280  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1281  MakeDimUniform(i, Proto, Statistics);
1282  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1283  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1284  if (DistributionOK (UniformBuckets))
1285  continue;
1286  break;
1287  }
1288  // if any dimension failed to match a distribution, discard the proto
1289  if (i < Clusterer->SampleSize) {
1290  FreePrototype(Proto);
1291  Proto = nullptr;
1292  }
1293  return (Proto);
1294 } // MakeMixedProto
void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1304
float * Mean
Definition: cluster.h:78
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
Definition: cluster.cpp:1624
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1326
float * Elliptical
Definition: cluster.h:64
unsigned SampleCount
Definition: cluster.h:35
void FreePrototype(void *arg)
Definition: cluster.cpp:575
int8_t NonEssential
Definition: ocrfeatures.h:45
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, float Mean, float StdDev)
Definition: cluster.cpp:1905
int16_t SampleSize
Definition: cluster.h:87
bool DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2040
FLOATUNION Variance
Definition: cluster.h:81
PROTOTYPE * NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1518

◆ MakeNewCluster()

CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree.

Parameters
Clusterercurrent clustering environment
TempClusterpotential cluster to make permanent
Returns
Pointer to the new permanent cluster

Definition at line 810 of file cluster.cpp.

810  {
811  CLUSTER *Cluster;
812 
813  // allocate the new cluster and initialize it
814  Cluster = (CLUSTER *) Emalloc(
815  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(float));
816  Cluster->Clustered = FALSE;
817  Cluster->Prototype = FALSE;
818  Cluster->Left = TempCluster->Cluster;
819  Cluster->Right = TempCluster->Neighbor;
820  Cluster->CharID = -1;
821 
822  // mark the old clusters as "clustered" and delete them from the kd-tree
823  Cluster->Left->Clustered = TRUE;
824  Cluster->Right->Clustered = TRUE;
825  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
826  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
827 
828  // compute the mean and sample count for the new cluster
829  Cluster->SampleCount =
830  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
831  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
832  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
833 
834  // add the new cluster to the KD tree
835  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
836  return Cluster;
837 } // MakeNewCluster
#define TRUE
Definition: capi.h:51
Definition: cluster.h:32
PARAM_DESC * ParamDesc
Definition: cluster.h:88
struct sample * Left
Definition: cluster.h:36
void * Emalloc(int Size)
Definition: emalloc.cpp:31
int32_t CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, float *Key, void *Data)
Definition: kdtree.cpp:213
KDTREE * KDTree
Definition: cluster.h:90
unsigned SampleCount
Definition: cluster.h:35
void KDDelete(KDTREE *Tree, float Key[], void *Data)
Definition: kdtree.cpp:254
float Mean[1]
Definition: cluster.h:39
#define FALSE
Definition: capi.h:52
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:852
struct sample * Right
Definition: cluster.h:37
unsigned Clustered
Definition: cluster.h:33
int16_t SampleSize
Definition: cluster.h:87
CLUSTER * Cluster
Definition: cluster.cpp:165
CLUSTER * Neighbor
Definition: cluster.cpp:166

◆ MakePotentialClusters()

void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
int32_t  Level 
)

This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Parameters
contextClusteringContext (see definition above)
Clustercurrent cluster being visited in kd-tree walk
Levellevel of this cluster in the kd-tree

Definition at line 745 of file cluster.cpp.

746  {
747  ClusterPair HeapEntry;
748  int next = context->next;
749  context->candidates[next].Cluster = Cluster;
750  HeapEntry.data = &(context->candidates[next]);
751  context->candidates[next].Neighbor =
752  FindNearestNeighbor(context->tree,
753  context->candidates[next].Cluster,
754  &HeapEntry.key);
755  if (context->candidates[next].Neighbor != nullptr) {
756  context->heap->Push(&HeapEntry);
757  context->next++;
758  }
759 } // MakePotentialClusters
ClusterHeap * heap
Definition: cluster.cpp:198
void Push(Pair *entry)
Definition: genericheap.h:95
TEMPCLUSTER * candidates
Definition: cluster.cpp:199
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
Definition: cluster.cpp:775
CLUSTER * Cluster
Definition: cluster.cpp:165
CLUSTER * Neighbor
Definition: cluster.cpp:166

◆ MakePrototype()

PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and nullptr is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise nullptr is returned.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Clustercluster to be made into a prototype
Returns
Pointer to new prototype or nullptr

Definition at line 937 of file cluster.cpp.

939  {
940  STATISTICS *Statistics;
941  PROTOTYPE *Proto;
942  BUCKETS *Buckets;
943 
944  // filter out clusters which contain samples from the same character
945  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
946  return nullptr;
947 
948  // compute the covariance matrix and ranges for the cluster
949  Statistics =
950  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
951 
952  // check for degenerate clusters which need not be analyzed further
953  // note that the MinSamples test assumes that all clusters with multiple
954  // character samples have been removed (as above)
955  Proto = MakeDegenerateProto(
956  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
957  (int32_t) (Config->MinSamples * Clusterer->NumChar));
958  if (Proto != nullptr) {
959  FreeStatistics(Statistics);
960  return Proto;
961  }
962  // check to ensure that all dimensions are independent
963  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
964  Statistics->CoVariance, Config->Independence)) {
965  FreeStatistics(Statistics);
966  return nullptr;
967  }
968 
969  if (HOTELLING && Config->ProtoStyle == elliptical) {
970  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
971  if (Proto != nullptr) {
972  FreeStatistics(Statistics);
973  return Proto;
974  }
975  }
976 
977  // create a histogram data structure used to evaluate distributions
978  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
979  Config->Confidence);
980 
981  // create a prototype based on the statistics and test it
982  switch (Config->ProtoStyle) {
983  case spherical:
984  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
985  break;
986  case elliptical:
987  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
988  break;
989  case mixed:
990  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
991  Config->Confidence);
992  break;
993  case automatic:
994  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
995  if (Proto != nullptr)
996  break;
997  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
998  if (Proto != nullptr)
999  break;
1000  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1001  Config->Confidence);
1002  break;
1003  }
1004  FreeStatistics(Statistics);
1005  return Proto;
1006 } // MakePrototype
PROTOTYPE * MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples)
Definition: cluster.cpp:1027
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1205
bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
Definition: cluster.cpp:2353
CLUSTERCONFIG Config
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
Definition: cluster.cpp:1624
float MinSamples
Definition: cluster.h:50
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2066
bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence)
Definition: cluster.cpp:1579
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1170
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
STATISTICS * ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1360
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, double Confidence)
Definition: cluster.cpp:1245
float MaxIllegal
Definition: cluster.h:51
float * CoVariance
Definition: cluster.cpp:174
double Confidence
Definition: cluster.h:54
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1069
float Independence
Definition: cluster.h:53
Definition: cluster.h:59
int32_t NumChar
Definition: cluster.h:93
#define HOTELLING
Definition: cluster.cpp:31
int16_t SampleSize
Definition: cluster.h:87
Definition: cluster.h:45

◆ MakeSample()

SAMPLE* MakeSample ( CLUSTERER Clusterer,
const float *  Feature,
int32_t  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure

Definition at line 452 of file cluster.cpp.

453  {
454  SAMPLE *Sample;
455  int i;
456 
457  // see if the samples have already been clustered - if so trap an error
458  // Can't add samples after they have been clustered.
459  ASSERT_HOST(Clusterer->Root == nullptr);
460 
461  // allocate the new sample and initialize it
462  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
463  (Clusterer->SampleSize -
464  1) * sizeof (float));
465  Sample->Clustered = FALSE;
466  Sample->Prototype = FALSE;
467  Sample->SampleCount = 1;
468  Sample->Left = nullptr;
469  Sample->Right = nullptr;
470  Sample->CharID = CharID;
471 
472  for (i = 0; i < Clusterer->SampleSize; i++)
473  Sample->Mean[i] = Feature[i];
474 
475  // add the sample to the KD tree - keep track of the total # of samples
476  Clusterer->NumberOfSamples++;
477  KDStore(Clusterer->KDTree, Sample->Mean, Sample);
478  if (CharID >= Clusterer->NumChar)
479  Clusterer->NumChar = CharID + 1;
480 
481  // execute hook for monitoring clustering operation
482  // (*SampleCreationHook)(Sample);
483 
484  return (Sample);
485 } // MakeSample
Definition: cluster.h:32
struct sample * Left
Definition: cluster.h:36
void * Emalloc(int Size)
Definition: emalloc.cpp:31
int32_t CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, float *Key, void *Data)
Definition: kdtree.cpp:213
KDTREE * KDTree
Definition: cluster.h:90
unsigned SampleCount
Definition: cluster.h:35
float Mean[1]
Definition: cluster.h:39
#define FALSE
Definition: capi.h:52
CLUSTER * Root
Definition: cluster.h:91
int32_t NumChar
Definition: cluster.h:93
struct sample * Right
Definition: cluster.h:37
unsigned Clustered
Definition: cluster.h:33
int32_t NumberOfSamples
Definition: cluster.h:89
int16_t SampleSize
Definition: cluster.h:87
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ MakeSphericalProto()

PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by a spherical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then nullptr is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new spherical prototype or nullptr.

Definition at line 1170 of file cluster.cpp.

1173  {
1174  PROTOTYPE *Proto = nullptr;
1175  int i;
1176 
1177  // check that each dimension is a normal distribution
1178  for (i = 0; i < Clusterer->SampleSize; i++) {
1179  if (Clusterer->ParamDesc[i].NonEssential)
1180  continue;
1181 
1182  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1183  Cluster->Mean[i],
1184  sqrt ((double) (Statistics->AvgVariance)));
1185  if (!DistributionOK (Buckets))
1186  break;
1187  }
1188  // if all dimensions matched a normal distribution, make a proto
1189  if (i >= Clusterer->SampleSize)
1190  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1191  return (Proto);
1192 } // MakeSphericalProto
PROTOTYPE * NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1445
PARAM_DESC * ParamDesc
Definition: cluster.h:88
float Mean[1]
Definition: cluster.h:39
int8_t NonEssential
Definition: ocrfeatures.h:45
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, float Mean, float StdDev)
Definition: cluster.cpp:1905
float AvgVariance
Definition: cluster.cpp:173
int16_t SampleSize
Definition: cluster.h:87
bool DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2040

◆ Mean()

float Mean ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension

Definition at line 628 of file cluster.cpp.

628  {
629  return (Proto->Mean[Dimension]);
630 } // Mean
float * Mean
Definition: cluster.h:78

◆ MergeClusters()

int32_t MergeClusters ( int16_t  N,
PARAM_DESC  ParamDesc[],
int32_t  n1,
int32_t  n2,
float  m[],
float  m1[],
float  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.

Definition at line 852 of file cluster.cpp.

857  {
858  int32_t i, n;
859 
860  n = n1 + n2;
861  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
862  if (ParamDesc->Circular) {
863  // if distance between means is greater than allowed
864  // reduce upper point by one "rotation" to compute mean
865  // then normalize the mean back into the accepted range
866  if ((*m2 - *m1) > ParamDesc->HalfRange) {
867  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
868  if (*m < ParamDesc->Min)
869  *m += ParamDesc->Range;
870  }
871  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
872  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
873  if (*m < ParamDesc->Min)
874  *m += ParamDesc->Range;
875  }
876  else
877  *m = (n1 * *m1 + n2 * *m2) / n;
878  }
879  else
880  *m = (n1 * *m1 + n2 * *m2) / n;
881  }
882  return n;
883 } // MergeClusters

◆ MultipleCharSamples()

bool MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
float  MaxIllegal 
)

This routine looks at all samples in the specified cluster. It computes a running estimate of the percentage of the characters which have more than 1 sample in the cluster. When this percentage exceeds MaxIllegal, TRUE is returned. Otherwise FALSE is returned. The CharID fields must contain integers which identify the training characters which were used to generate the sample. One integer is used for each sample. The NumChar field in the Clusterer must contain the number of characters in the training set. All CharID fields must be between 0 and NumChar-1. The main function of this routine is to help identify clusters which need to be split further, i.e. if numerous training characters have 2 or more features which are contained in the same cluster, then the cluster should be split.

Parameters
Clustererdata structure holding cluster tree
Clustercluster containing samples to be tested
MaxIllegalmax percentage of samples allowed to have more than 1 feature in the cluster
Returns
TRUE if the cluster should be split, FALSE otherwise.

Definition at line 2353 of file cluster.cpp.

2356 {
2357  static BOOL8 *CharFlags = nullptr;
2358  static int32_t NumFlags = 0;
2359  int i;
2360  LIST SearchState;
2361  SAMPLE *Sample;
2362  int32_t CharID;
2363  int32_t NumCharInCluster;
2364  int32_t NumIllegalInCluster;
2365  float PercentIllegal;
2366 
2367  // initial estimate assumes that no illegal chars exist in the cluster
2368  NumCharInCluster = Cluster->SampleCount;
2369  NumIllegalInCluster = 0;
2370 
2371  if (Clusterer->NumChar > NumFlags) {
2372  free(CharFlags);
2373  NumFlags = Clusterer->NumChar;
2374  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2375  }
2376 
2377  for (i = 0; i < NumFlags; i++)
2378  CharFlags[i] = FALSE;
2379 
2380  // find each sample in the cluster and check if we have seen it before
2381  InitSampleSearch(SearchState, Cluster);
2382  while ((Sample = NextSample (&SearchState)) != nullptr) {
2383  CharID = Sample->CharID;
2384  if (CharFlags[CharID] == FALSE) {
2385  CharFlags[CharID] = TRUE;
2386  }
2387  else {
2388  if (CharFlags[CharID] == TRUE) {
2389  NumIllegalInCluster++;
2390  CharFlags[CharID] = ILLEGAL_CHAR;
2391  }
2392  NumCharInCluster--;
2393  PercentIllegal = (float) NumIllegalInCluster / NumCharInCluster;
2394  if (PercentIllegal > MaxIllegal) {
2395  destroy(SearchState);
2396  return true;
2397  }
2398  }
2399  }
2400  return false;
2401 
2402 } // MultipleCharSamples
#define TRUE
Definition: capi.h:51
Definition: cluster.h:32
void * Emalloc(int Size)
Definition: emalloc.cpp:31
int32_t CharID
Definition: cluster.h:38
LIST destroy(LIST list)
Definition: oldlist.cpp:170
unsigned SampleCount
Definition: cluster.h:35
#define FALSE
Definition: capi.h:52
unsigned char BOOL8
Definition: host.h:34
#define ILLEGAL_CHAR
int32_t NumChar
Definition: cluster.h:93
#define InitSampleSearch(S, C)
Definition: cluster.h:105
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:606

◆ NewChiStruct()

CHISTRUCT * NewChiStruct ( uint16_t  DegreesOfFreedom,
double  Alpha 
)

This routine allocates a new data structure which is used to hold a chi-squared value along with its associated number of degrees of freedom and alpha value.

Parameters
DegreesOfFreedomdegrees of freedom for new chi value
Alphaconfidence level for new chi value
Returns
none

Definition at line 2222 of file cluster.cpp.

2222  {
2224 
2225  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2227  NewChiStruct->Alpha = Alpha;
2228  return (NewChiStruct);
2229 
2230 } // NewChiStruct
double Alpha
Definition: cluster.cpp:192
void * Emalloc(int Size)
Definition: emalloc.cpp:31
CHISTRUCT * NewChiStruct(uint16_t DegreesOfFreedom, double Alpha)
Definition: cluster.cpp:2222
uint16_t DegreesOfFreedom
Definition: cluster.cpp:191
uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets)
Definition: cluster.cpp:2113

◆ NewEllipticalProto()

PROTOTYPE * NewEllipticalProto ( int16_t  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new elliptical prototype data structure

Definition at line 1476 of file cluster.cpp.

1478  {
1479  PROTOTYPE *Proto;
1480  float *CoVariance;
1481  int i;
1482 
1483  Proto = NewSimpleProto (N, Cluster);
1484  Proto->Variance.Elliptical = (float *) Emalloc (N * sizeof (float));
1485  Proto->Magnitude.Elliptical = (float *) Emalloc (N * sizeof (float));
1486  Proto->Weight.Elliptical = (float *) Emalloc (N * sizeof (float));
1487 
1488  CoVariance = Statistics->CoVariance;
1489  Proto->TotalMagnitude = 1.0;
1490  for (i = 0; i < N; i++, CoVariance += N + 1) {
1491  Proto->Variance.Elliptical[i] = *CoVariance;
1492  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1493  Proto->Variance.Elliptical[i] = MINVARIANCE;
1494 
1495  Proto->Magnitude.Elliptical[i] =
1496  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);
1497  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1498  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1499  }
1500  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1501  Proto->Style = elliptical;
1502  return (Proto);
1503 } // NewEllipticalProto
void * Emalloc(int Size)
Definition: emalloc.cpp:31
float TotalMagnitude
Definition: cluster.h:79
float * Elliptical
Definition: cluster.h:64
PROTOTYPE * NewSimpleProto(int16_t N, CLUSTER *Cluster)
Definition: cluster.cpp:1540
FLOATUNION Weight
Definition: cluster.h:83
unsigned Style
Definition: cluster.h:74
#define MINVARIANCE
Definition: cluster.cpp:143
float * CoVariance
Definition: cluster.cpp:174
FLOATUNION Magnitude
Definition: cluster.h:82
float LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81

◆ NewMixedProto()

PROTOTYPE * NewMixedProto ( int16_t  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines.

Parameters
Nnumber of dimensions
Clustercluster to be made into a mixed prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new mixed prototype data structure

Definition at line 1518 of file cluster.cpp.

1518  {
1519  PROTOTYPE *Proto;
1520  int i;
1521 
1522  Proto = NewEllipticalProto (N, Cluster, Statistics);
1523  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1524 
1525  for (i = 0; i < N; i++) {
1526  Proto->Distrib[i] = normal;
1527  }
1528  Proto->Style = mixed;
1529  return (Proto);
1530 } // NewMixedProto
void * Emalloc(int Size)
Definition: emalloc.cpp:31
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
Definition: cluster.h:59
PROTOTYPE * NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1476
DISTRIBUTION
Definition: cluster.h:58
Definition: cluster.h:45

◆ NewSimpleProto()

PROTOTYPE * NewSimpleProto ( int16_t  N,
CLUSTER Cluster 
)

This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension.

Parameters
Nnumber of dimensions
Clustercluster to be made into a prototype
Returns
Pointer to new simple prototype

Definition at line 1540 of file cluster.cpp.

1540  {
1541  PROTOTYPE *Proto;
1542  int i;
1543 
1544  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1545  Proto->Mean = (float *) Emalloc (N * sizeof (float));
1546 
1547  for (i = 0; i < N; i++)
1548  Proto->Mean[i] = Cluster->Mean[i];
1549  Proto->Distrib = nullptr;
1550 
1551  Proto->Significant = TRUE;
1552  Proto->Merged = FALSE;
1553  Proto->Style = spherical;
1554  Proto->NumSamples = Cluster->SampleCount;
1555  Proto->Cluster = Cluster;
1556  Proto->Cluster->Prototype = TRUE;
1557  return (Proto);
1558 } // NewSimpleProto
float * Mean
Definition: cluster.h:78
#define TRUE
Definition: capi.h:51
void * Emalloc(int Size)
Definition: emalloc.cpp:31
unsigned Prototype
Definition: cluster.h:34
unsigned Merged
Definition: cluster.h:69
unsigned SampleCount
Definition: cluster.h:35
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
float Mean[1]
Definition: cluster.h:39
#define FALSE
Definition: capi.h:52
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
unsigned NumSamples
Definition: cluster.h:75

◆ NewSphericalProto()

PROTOTYPE * NewSphericalProto ( uint16_t  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new spherical prototype data structure

Definition at line 1445 of file cluster.cpp.

1447  {
1448  PROTOTYPE *Proto;
1449 
1450  Proto = NewSimpleProto (N, Cluster);
1451 
1452  Proto->Variance.Spherical = Statistics->AvgVariance;
1453  if (Proto->Variance.Spherical < MINVARIANCE)
1454  Proto->Variance.Spherical = MINVARIANCE;
1455 
1456  Proto->Magnitude.Spherical =
1457  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
1458  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1459  (double) N);
1460  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1461  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1462 
1463  return (Proto);
1464 } // NewSphericalProto
float TotalMagnitude
Definition: cluster.h:79
float Spherical
Definition: cluster.h:63
PROTOTYPE * NewSimpleProto(int16_t N, CLUSTER *Cluster)
Definition: cluster.cpp:1540
FLOATUNION Weight
Definition: cluster.h:83
#define MINVARIANCE
Definition: cluster.cpp:143
FLOATUNION Magnitude
Definition: cluster.h:82
float LogMagnitude
Definition: cluster.h:80
float AvgVariance
Definition: cluster.cpp:173
FLOATUNION Variance
Definition: cluster.h:81

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, nullptr is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or nullptr.

Definition at line 606 of file cluster.cpp.

606  {
607  CLUSTER *Cluster;
608 
609  if (*SearchState == NIL_LIST)
610  return (nullptr);
611  Cluster = (CLUSTER *) first_node (*SearchState);
612  *SearchState = pop (*SearchState);
613  while (TRUE) {
614  if (Cluster->Left == nullptr)
615  return (Cluster);
616  *SearchState = push (*SearchState, Cluster->Right);
617  Cluster = Cluster->Left;
618  }
619 } // NextSample
#define TRUE
Definition: capi.h:51
Definition: cluster.h:32
struct sample * Left
Definition: cluster.h:36
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
LIST pop(LIST list)
Definition: oldlist.cpp:266
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
struct sample * Right
Definition: cluster.h:37

◆ NormalBucket()

uint16_t NormalBucket ( PARAM_DESC ParamDesc,
float  x,
float  Mean,
float  StdDev 
)

This routine determines which bucket x falls into in the discrete normal distribution defined by kNormalMean and kNormalStdDev. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meanmean of normal distribution
StdDevstandard deviation of normal distribution
Returns
Bucket number into which x falls

Definition at line 1975 of file cluster.cpp.

1978  {
1979  float X;
1980 
1981  // wraparound circular parameters if necessary
1982  if (ParamDesc->Circular) {
1983  if (x - Mean > ParamDesc->HalfRange)
1984  x -= ParamDesc->Range;
1985  else if (x - Mean < -ParamDesc->HalfRange)
1986  x += ParamDesc->Range;
1987  }
1988 
1989  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
1990  if (X < 0)
1991  return 0;
1992  if (X > BUCKETTABLESIZE - 1)
1993  return ((uint16_t) (BUCKETTABLESIZE - 1));
1994  return (uint16_t) floor((double) X);
1995 } // NormalBucket
int8_t Circular
Definition: ocrfeatures.h:44
float HalfRange
Definition: ocrfeatures.h:49
#define BUCKETTABLESIZE
Definition: cluster.cpp:161
float Range
Definition: ocrfeatures.h:48
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:628

◆ NormalDensity()

double NormalDensity ( int32_t  x)

This routine computes the probability density function of a discrete normal distribution defined by the global variables kNormalMean, kNormalVariance, and kNormalMagnitude. Normal magnitude could, of course, be computed in terms of the normal variance but it is precomputed for efficiency.

Parameters
xnumber to compute the normal probability density for
Note
Globals: kNormalMean mean of a discrete normal distribution kNormalVariance variance of a discrete normal distribution kNormalMagnitude magnitude of a discrete normal distribution
Returns
The value of the normal distribution at x.

Definition at line 1850 of file cluster.cpp.

1850  {
1851  double Distance;
1852 
1853  Distance = x - kNormalMean;
1854  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1855 } // NormalDensity

◆ NumBucketsMatch()

int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of histogram data structures to find one with the specified number of buckets. It is called by the list search routines.

Parameters
arg1current histogram being tested for a match
arg2match key
Returns
TRUE if arg1 matches arg2

Definition at line 2133 of file cluster.cpp.

2134  { // uint16_t *DesiredNumberOfBuckets)
2135  BUCKETS *Histogram = (BUCKETS *) arg1;
2136  uint16_t *DesiredNumberOfBuckets = (uint16_t *) arg2;
2137 
2138  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2139 
2140 } // NumBucketsMatch
uint16_t NumberOfBuckets
Definition: cluster.cpp:184

◆ OptimumNumberOfBuckets()

uint16_t OptimumNumberOfBuckets ( uint32_t  SampleCount)

This routine computes the optimum number of histogram buckets that should be used in a chi-squared goodness of fit test for the specified number of samples. The optimum number is computed based on Table 4.1 on pg. 147 of "Measurement and Analysis of Random Data" by Bendat & Piersol. Linear interpolation is used to interpolate between table values. The table is intended for a 0.05 level of significance (alpha). This routine assumes that it is equally valid for other alpha's, which may not be true.

Parameters
SampleCountnumber of samples to be tested
Returns
Optimum number of histogram buckets

Definition at line 1764 of file cluster.cpp.

1764  {
1765  uint8_t Last, Next;
1766  float Slope;
1767 
1768  if (SampleCount < kCountTable[0])
1769  return kBucketsTable[0];
1770 
1771  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1772  if (SampleCount <= kCountTable[Next]) {
1773  Slope = (float) (kBucketsTable[Next] - kBucketsTable[Last]) /
1774  (float) (kCountTable[Next] - kCountTable[Last]);
1775  return ((uint16_t) (kBucketsTable[Last] +
1776  Slope * (SampleCount - kCountTable[Last])));
1777  }
1778  }
1779  return kBucketsTable[Last];
1780 } // OptimumNumberOfBuckets
#define LOOKUPTABLESIZE
Definition: cluster.cpp:229

◆ Solve()

double Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
double  InitialGuess,
double  Accuracy 
)

This routine attempts to find an x value at which Function goes to zero (i.e. a root of the function). It will only work correctly if a solution actually exists and there are no extrema between the solution and the InitialGuess. The algorithms used are extremely primitive.

Parameters
Functionfunction whose zero is to be found
FunctionParamsarbitrary data to pass to function
InitialGuesspoint to start solution search at
Accuracymaximum allowed error
Returns
Solution of function (x for which f(x) = 0).

Definition at line 2246 of file cluster.cpp.

2250 {
2251  double x;
2252  double f;
2253  double Slope;
2254  double Delta;
2255  double NewDelta;
2256  double xDelta;
2257  double LastPosX, LastNegX;
2258 
2259  x = InitialGuess;
2260  Delta = INITIALDELTA;
2261  LastPosX = FLT_MAX;
2262  LastNegX = -FLT_MAX;
2263  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2264  while (Abs (LastPosX - LastNegX) > Accuracy) {
2265  // keep track of outer bounds of current estimate
2266  if (f < 0)
2267  LastNegX = x;
2268  else
2269  LastPosX = x;
2270 
2271  // compute the approx. slope of f(x) at the current point
2272  Slope =
2273  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2274 
2275  // compute the next solution guess */
2276  xDelta = f / Slope;
2277  x -= xDelta;
2278 
2279  // reduce the delta used for computing slope to be a fraction of
2280  //the amount moved to get to the new guess
2281  NewDelta = Abs (xDelta) * DELTARATIO;
2282  if (NewDelta < Delta)
2283  Delta = NewDelta;
2284 
2285  // compute the value of the function at the new guess
2286  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2287  }
2288  return (x);
2289 
2290 } // Solve
#define INITIALDELTA
#define DELTARATIO
#define Abs(N)
Definition: cluster.cpp:209

◆ StandardDeviation()

float StandardDeviation ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension

Definition at line 639 of file cluster.cpp.

639  {
640  switch (Proto->Style) {
641  case spherical:
642  return ((float) sqrt ((double) Proto->Variance.Spherical));
643  case elliptical:
644  return ((float)
645  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
646  case mixed:
647  switch (Proto->Distrib[Dimension]) {
648  case normal:
649  return ((float)
650  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
651  case uniform:
652  case D_random:
653  return (Proto->Variance.Elliptical[Dimension]);
654  case DISTRIBUTION_COUNT:
655  ASSERT_HOST(!"Distribution count not allowed!");
656  }
657  }
658  return 0.0f;
659 } // StandardDeviation
float Spherical
Definition: cluster.h:63
float * Elliptical
Definition: cluster.h:64
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
Definition: cluster.h:59
FLOATUNION Variance
Definition: cluster.h:81
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:45

◆ TestEllipticalProto()

PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then nullptr is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Configprovides the magic number of samples that make a good cluster
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Returns
Pointer to new elliptical prototype or nullptr.

Definition at line 1069 of file cluster.cpp.

1072  {
1073  // Fraction of the number of samples used as a range around 1 within
1074  // which a cluster has the magic size that allows a boost to the
1075  // FTable by kFTableBoostMargin, thus allowing clusters near the
1076  // magic size (equal to the number of sample characters) to be more
1077  // likely to stay together.
1078  const double kMagicSampleMargin = 0.0625;
1079  const double kFTableBoostMargin = 2.0;
1080 
1081  int N = Clusterer->SampleSize;
1082  CLUSTER* Left = Cluster->Left;
1083  CLUSTER* Right = Cluster->Right;
1084  if (Left == nullptr || Right == nullptr)
1085  return nullptr;
1086  int TotalDims = Left->SampleCount + Right->SampleCount;
1087  if (TotalDims < N + 1 || TotalDims < 2)
1088  return nullptr;
1089  std::vector<float> Covariance(static_cast<size_t>(N) * N);
1090  std::vector<float> Inverse(static_cast<size_t>(N) * N);
1091  std::vector<float> Delta(N);
1092  // Compute a new covariance matrix that only uses essential features.
1093  for (int i = 0; i < N; ++i) {
1094  int row_offset = i * N;
1095  if (!Clusterer->ParamDesc[i].NonEssential) {
1096  for (int j = 0; j < N; ++j) {
1097  if (!Clusterer->ParamDesc[j].NonEssential)
1098  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1099  else
1100  Covariance[j + row_offset] = 0.0f;
1101  }
1102  } else {
1103  for (int j = 0; j < N; ++j) {
1104  if (i == j)
1105  Covariance[j + row_offset] = 1.0f;
1106  else
1107  Covariance[j + row_offset] = 0.0f;
1108  }
1109  }
1110  }
1111  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
1112  if (err > 1) {
1113  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1114  }
1115  int EssentialN = 0;
1116  for (int dim = 0; dim < N; ++dim) {
1117  if (!Clusterer->ParamDesc[dim].NonEssential) {
1118  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1119  ++EssentialN;
1120  } else {
1121  Delta[dim] = 0.0f;
1122  }
1123  }
1124  // Compute Hotelling's T-squared.
1125  double Tsq = 0.0;
1126  for (int x = 0; x < N; ++x) {
1127  double temp = 0.0;
1128  for (int y = 0; y < N; ++y) {
1129  temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
1130  }
1131  Tsq += Delta[x] * temp;
1132  }
1133  // Changed this function to match the formula in
1134  // Statistical Methods in Medical Research p 473
1135  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1136  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1137  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1138  int Fx = EssentialN;
1139  if (Fx > FTABLE_X)
1140  Fx = FTABLE_X;
1141  --Fx;
1142  int Fy = TotalDims - EssentialN - 1;
1143  if (Fy > FTABLE_Y)
1144  Fy = FTABLE_Y;
1145  --Fy;
1146  double FTarget = FTable[Fy][Fx];
1147  if (Config->MagicSamples > 0 &&
1148  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1149  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1150  // Give magic-sized clusters a magic FTable boost.
1151  FTarget += kFTableBoostMargin;
1152  }
1153  if (F < FTarget) {
1154  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1155  }
1156  return nullptr;
1157 }
CLUSTERCONFIG Config
#define FTABLE_X
Definition: cluster.cpp:32
Definition: cluster.h:32
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:36
PARAM_DESC * ParamDesc
Definition: cluster.h:88
struct sample * Left
Definition: cluster.h:36
unsigned SampleCount
Definition: cluster.h:35
float Mean[1]
Definition: cluster.h:39
int MagicSamples
Definition: cluster.h:55
float * CoVariance
Definition: cluster.cpp:174
#define FTABLE_Y
Definition: cluster.cpp:33
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
PROTOTYPE * NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1476
int8_t NonEssential
Definition: ocrfeatures.h:45
struct sample * Right
Definition: cluster.h:37
int16_t SampleSize
Definition: cluster.h:87
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2409

◆ UniformBucket()

uint16_t UniformBucket ( PARAM_DESC ParamDesc,
float  x,
float  Mean,
float  StdDev 
)

This routine determines which bucket x falls into in the discrete uniform distribution defined by BUCKETTABLESIZE. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meancenter of range of uniform distribution
StdDev1/2 the range of the uniform distribution
Returns
Bucket number into which x falls

Definition at line 2008 of file cluster.cpp.

2011  {
2012  float X;
2013 
2014  // wraparound circular parameters if necessary
2015  if (ParamDesc->Circular) {
2016  if (x - Mean > ParamDesc->HalfRange)
2017  x -= ParamDesc->Range;
2018  else if (x - Mean < -ParamDesc->HalfRange)
2019  x += ParamDesc->Range;
2020  }
2021 
2022  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2023  if (X < 0)
2024  return 0;
2025  if (X > BUCKETTABLESIZE - 1)
2026  return (uint16_t) (BUCKETTABLESIZE - 1);
2027  return (uint16_t) floor((double) X);
2028 } // UniformBucket
int8_t Circular
Definition: ocrfeatures.h:44
float HalfRange
Definition: ocrfeatures.h:49
#define BUCKETTABLESIZE
Definition: cluster.cpp:161
float Range
Definition: ocrfeatures.h:48
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:628

◆ UniformDensity()

double UniformDensity ( int32_t  x)

This routine computes the probability density function of a uniform distribution at the specified point. The range of the distribution is from 0 to BUCKETTABLESIZE.

Parameters
xnumber to compute the uniform probability density for
Returns
The value of the uniform distribution at x.

Definition at line 1864 of file cluster.cpp.

1864  {
1865  static double UniformDistributionDensity = (double) 1.0 / BUCKETTABLESIZE;
1866 
1867  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1868  return UniformDistributionDensity;
1869  else
1870  return (double) 0.0;
1871 } // UniformDensity
#define BUCKETTABLESIZE
Definition: cluster.cpp:161

Variable Documentation

◆ FTable

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 36 of file cluster.cpp.