tesseract  5.0.0-alpha-619-ge9db
cluster.cpp File Reference
#include <cfloat>
#include <cmath>
#include <vector>
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include <tesseract/helpers.h>
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define _USE_MATH_DEFINES
 
#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   (((N) < 0) ? (-(N)) : (N))
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   FLT_MAX
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

using ClusterPair = tesseract::KDPairInc< float, TEMPCLUSTER * >
 
using ClusterHeap = tesseract::GenericHeap< ClusterPair >
 
using DENSITYFUNC = double(*)(int32_t)
 
using SOLVEFUNC = double(*)(CHISTRUCT *, double)
 

Functions

CLUSTERERMakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
float Mean (PROTOTYPE *Proto, uint16_t Dimension)
 
float StandardDeviation (PROTOTYPE *Proto, uint16_t Dimension)
 
int32_t MergeClusters (int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

◆ _USE_MATH_DEFINES

#define _USE_MATH_DEFINES

Definition at line 17 of file cluster.cpp.

◆ Abs

#define Abs (   N)    (((N) < 0) ? (-(N)) : (N))

Definition at line 209 of file cluster.cpp.

◆ BUCKETTABLESIZE

#define BUCKETTABLESIZE   1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 161 of file cluster.cpp.

◆ CHIACCURACY

#define CHIACCURACY   0.01

◆ DELTARATIO

#define DELTARATIO   0.1

◆ FTABLE_X

#define FTABLE_X   10

Definition at line 32 of file cluster.cpp.

◆ FTABLE_Y

#define FTABLE_Y   100

Definition at line 33 of file cluster.cpp.

◆ HOTELLING

#define HOTELLING   1

Definition at line 31 of file cluster.cpp.

◆ ILLEGAL_CHAR

#define ILLEGAL_CHAR   2

◆ INITIALDELTA

#define INITIALDELTA   0.1

◆ LOOKUPTABLESIZE

#define LOOKUPTABLESIZE   8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 229 of file cluster.cpp.

◆ MAXDEGREESOFFREEDOM

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 230 of file cluster.cpp.

◆ MAXDISTANCE

#define MAXDISTANCE   FLT_MAX

◆ MAXNEIGHBORS

#define MAXNEIGHBORS   2

◆ MINALPHA

#define MINALPHA   (1e-200)

◆ MINSAMPLES

#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 152 of file cluster.cpp.

◆ MINSAMPLESNEEDED

#define MINSAMPLESNEEDED   1

Definition at line 153 of file cluster.cpp.

◆ MINSAMPLESPERBUCKET

#define MINSAMPLESPERBUCKET   5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 151 of file cluster.cpp.

◆ MINVARIANCE

#define MINVARIANCE   0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 143 of file cluster.cpp.

◆ Mirror

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 208 of file cluster.cpp.

◆ NORMALEXTENT

#define NORMALEXTENT   3.0

Definition at line 162 of file cluster.cpp.

◆ Odd

#define Odd (   N)    ((N)%2)

Definition at line 207 of file cluster.cpp.

◆ SqrtOf2Pi

#define SqrtOf2Pi   2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 219 of file cluster.cpp.

Typedef Documentation

◆ ClusterHeap

Definition at line 170 of file cluster.cpp.

◆ ClusterPair

Definition at line 169 of file cluster.cpp.

◆ DENSITYFUNC

using DENSITYFUNC = double (*)(int32_t)

Definition at line 204 of file cluster.cpp.

◆ SOLVEFUNC

using SOLVEFUNC = double (*)(CHISTRUCT*, double)

Definition at line 205 of file cluster.cpp.

Function Documentation

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes

Definition at line 483 of file cluster.cpp.

483  {
484  //only create cluster tree if samples have never been clustered before
485  if (Clusterer->Root == nullptr)
486  CreateClusterTree(Clusterer);
487 
488  //deallocate the old prototype list if one exists
489  FreeProtoList (&Clusterer->ProtoList);
490  Clusterer->ProtoList = NIL_LIST;
491 
492  //compute prototypes starting at the root node in the tree
493  ComputePrototypes(Clusterer, Config);
494  // We don't need the cluster pointers in the protos any more, so null them
495  // out, which makes it safe to delete the clusterer.
496  LIST proto_list = Clusterer->ProtoList;
497  iterate(proto_list) {
498  auto *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
499  proto->Cluster = nullptr;
500  }
501  return Clusterer->ProtoList;
502 } // ClusterSamples

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to nullptr to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed

Definition at line 514 of file cluster.cpp.

514  {
515  if (Clusterer != nullptr) {
516  free(Clusterer->ParamDesc);
517  if (Clusterer->KDTree != nullptr)
518  FreeKDTree (Clusterer->KDTree);
519  if (Clusterer->Root != nullptr)
520  FreeCluster (Clusterer->Root);
521  // Free up all used buckets structures.
522  for (auto & d : Clusterer->bucket_cache) {
523  for (auto & c : d)
524  if (c != nullptr)
525  FreeBuckets(c);
526  }
527 
528  free(Clusterer);
529  }
530 } // FreeClusterer

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed

Definition at line 538 of file cluster.cpp.

538  {
539  destroy_nodes(*ProtoList, FreePrototype);
540 } // FreeProtoList

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated

Definition at line 549 of file cluster.cpp.

549  { //PROTOTYPE *Prototype)
550  auto *Prototype = static_cast<PROTOTYPE *>(arg);
551 
552  // unmark the corresponding cluster (if there is one
553  if (Prototype->Cluster != nullptr)
554  Prototype->Cluster->Prototype = false;
555 
556  // deallocate the prototype statistics and then the prototype itself
557  free(Prototype->Distrib);
558  free(Prototype->Mean);
559  if (Prototype->Style != spherical) {
560  free(Prototype->Variance.Elliptical);
561  free(Prototype->Magnitude.Elliptical);
562  free(Prototype->Weight.Elliptical);
563  }
564  free(Prototype);
565 } // FreePrototype

◆ MakeClusterer()

CLUSTERER* MakeClusterer ( int16_t  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure

Definition at line 376 of file cluster.cpp.

376  {
377  CLUSTERER *Clusterer;
378  int i;
379 
380  // allocate main clusterer data structure and init simple fields
381  Clusterer = static_cast<CLUSTERER *>(Emalloc (sizeof (CLUSTERER)));
382  Clusterer->SampleSize = SampleSize;
383  Clusterer->NumberOfSamples = 0;
384  Clusterer->NumChar = 0;
385 
386  // init fields which will not be used initially
387  Clusterer->Root = nullptr;
388  Clusterer->ProtoList = NIL_LIST;
389 
390  // maintain a copy of param descriptors in the clusterer data structure
391  Clusterer->ParamDesc =
392  static_cast<PARAM_DESC *>(Emalloc (SampleSize * sizeof (PARAM_DESC)));
393  for (i = 0; i < SampleSize; i++) {
394  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
395  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
396  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
397  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
398  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
399  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
400  Clusterer->ParamDesc[i].MidRange =
401  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
402  }
403 
404  // allocate a kd tree to hold the samples
405  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
406 
407  // Initialize cache of histogram buckets to minimize recomputing them.
408  for (auto & d : Clusterer->bucket_cache) {
409  for (auto & c : d)
410  c = nullptr;
411  }
412 
413  return Clusterer;
414 } // MakeClusterer

◆ MakeSample()

SAMPLE* MakeSample ( CLUSTERER Clusterer,
const float *  Feature,
int32_t  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure

Definition at line 429 of file cluster.cpp.

430  {
431  SAMPLE *Sample;
432  int i;
433 
434  // see if the samples have already been clustered - if so trap an error
435  // Can't add samples after they have been clustered.
436  ASSERT_HOST(Clusterer->Root == nullptr);
437 
438  // allocate the new sample and initialize it
439  Sample = static_cast<SAMPLE *>(Emalloc (sizeof (SAMPLE) +
440  (Clusterer->SampleSize -
441  1) * sizeof (float)));
442  Sample->Clustered = false;
443  Sample->Prototype = false;
444  Sample->SampleCount = 1;
445  Sample->Left = nullptr;
446  Sample->Right = nullptr;
447  Sample->CharID = CharID;
448 
449  for (i = 0; i < Clusterer->SampleSize; i++)
450  Sample->Mean[i] = Feature[i];
451 
452  // add the sample to the KD tree - keep track of the total # of samples
453  Clusterer->NumberOfSamples++;
454  KDStore(Clusterer->KDTree, Sample->Mean, Sample);
455  if (CharID >= Clusterer->NumChar)
456  Clusterer->NumChar = CharID + 1;
457 
458  // execute hook for monitoring clustering operation
459  // (*SampleCreationHook)(Sample);
460 
461  return (Sample);
462 } // MakeSample

◆ Mean()

float Mean ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension

Definition at line 602 of file cluster.cpp.

602  {
603  return (Proto->Mean[Dimension]);
604 } // Mean

◆ MergeClusters()

int32_t MergeClusters ( int16_t  N,
PARAM_DESC  ParamDesc[],
int32_t  n1,
int32_t  n2,
float  m[],
float  m1[],
float  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.

Definition at line 824 of file cluster.cpp.

829  {
830  int32_t i, n;
831 
832  n = n1 + n2;
833  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
834  if (ParamDesc->Circular) {
835  // if distance between means is greater than allowed
836  // reduce upper point by one "rotation" to compute mean
837  // then normalize the mean back into the accepted range
838  if ((*m2 - *m1) > ParamDesc->HalfRange) {
839  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
840  if (*m < ParamDesc->Min)
841  *m += ParamDesc->Range;
842  }
843  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
844  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
845  if (*m < ParamDesc->Min)
846  *m += ParamDesc->Range;
847  }
848  else
849  *m = (n1 * *m1 + n2 * *m2) / n;
850  }
851  else
852  *m = (n1 * *m1 + n2 * *m2) / n;
853  }
854  return n;
855 } // MergeClusters

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, nullptr is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or nullptr.

Definition at line 580 of file cluster.cpp.

580  {
581  CLUSTER *Cluster;
582 
583  if (*SearchState == NIL_LIST)
584  return (nullptr);
585  Cluster = reinterpret_cast<CLUSTER *>first_node (*SearchState);
586  *SearchState = pop (*SearchState);
587  for (;;) {
588  if (Cluster->Left == nullptr)
589  return (Cluster);
590  *SearchState = push (*SearchState, Cluster->Right);
591  Cluster = Cluster->Left;
592  }
593 } // NextSample

◆ StandardDeviation()

float StandardDeviation ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension

Definition at line 613 of file cluster.cpp.

613  {
614  switch (Proto->Style) {
615  case spherical:
616  return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Spherical))));
617  case elliptical:
618  return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
619  case mixed:
620  switch (Proto->Distrib[Dimension]) {
621  case normal:
622  return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
623  case uniform:
624  case D_random:
625  return (Proto->Variance.Elliptical[Dimension]);
626  case DISTRIBUTION_COUNT:
627  ASSERT_HOST(!"Distribution count not allowed!");
628  }
629  }
630  return 0.0f;
631 } // StandardDeviation

Variable Documentation

◆ FTable

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 36 of file cluster.cpp.

CLUSTERER::NumChar
int32_t NumChar
Definition: cluster.h:88
sample::SampleCount
unsigned SampleCount
Definition: cluster.h:34
first_node
#define first_node(l)
Definition: oldlist.h:84
PARAM_DESC::Circular
bool Circular
Definition: ocrfeatures.h:42
Emalloc
void * Emalloc(int Size)
Definition: emalloc.cpp:31
destroy_nodes
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:138
elliptical
Definition: cluster.h:43
MakeKDTree
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:179
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
list_rec
Definition: oldlist.h:73
sample::Clustered
bool Clustered
Definition: cluster.h:32
sample::Mean
float Mean[1]
Definition: cluster.h:38
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
normal
Definition: cluster.h:55
sample::Prototype
bool Prototype
Definition: cluster.h:33
Config
CLUSTERCONFIG Config
Definition: commontraining.cpp:88
CLUSTERER::SampleSize
int16_t SampleSize
Definition: cluster.h:82
PARAM_DESC::Min
float Min
Definition: ocrfeatures.h:44
CLUSTERER::Root
CLUSTER * Root
Definition: cluster.h:86
PARAM_DESC::Range
float Range
Definition: ocrfeatures.h:46
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
PARAM_DESC::MidRange
float MidRange
Definition: ocrfeatures.h:48
sample::Right
struct sample * Right
Definition: cluster.h:36
CLUSTERER::ParamDesc
PARAM_DESC * ParamDesc
Definition: cluster.h:83
CLUSTERER::NumberOfSamples
int32_t NumberOfSamples
Definition: cluster.h:84
uniform
Definition: cluster.h:55
FLOATUNION::Elliptical
float * Elliptical
Definition: cluster.h:59
FreeKDTree
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:330
KDStore
void KDStore(KDTREE *Tree, float *Key, void *Data)
Definition: kdtree.cpp:211
mixed
Definition: cluster.h:43
PARAM_DESC::Max
float Max
Definition: ocrfeatures.h:45
push
LIST push(LIST list, void *element)
Definition: oldlist.cpp:172
sample::CharID
int32_t CharID
Definition: cluster.h:37
sample
Definition: cluster.h:31
PARAM_DESC
Definition: ocrfeatures.h:41
PARAM_DESC::NonEssential
bool NonEssential
Definition: ocrfeatures.h:43
PROTOTYPE::Mean
float * Mean
Definition: cluster.h:73
FLOATUNION::Spherical
float Spherical
Definition: cluster.h:58
CLUSTERER
Definition: cluster.h:81
PARAM_DESC::HalfRange
float HalfRange
Definition: ocrfeatures.h:47
iterate
#define iterate(l)
Definition: oldlist.h:92
FreePrototype
void FreePrototype(void *arg)
Definition: cluster.cpp:549
spherical
Definition: cluster.h:43
PROTOTYPE::Style
unsigned Style
Definition: cluster.h:69
pop
LIST pop(LIST list)
Definition: oldlist.cpp:161
PROTOTYPE::Variance
FLOATUNION Variance
Definition: cluster.h:76
CLUSTERER::bucket_cache
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:90
sample::Left
struct sample * Left
Definition: cluster.h:35
DISTRIBUTION_COUNT
Definition: cluster.h:55
CLUSTERER::ProtoList
LIST ProtoList
Definition: cluster.h:87
D_random
Definition: cluster.h:55
CLUSTERER::KDTree
KDTREE * KDTree
Definition: cluster.h:85
PROTOTYPE::Distrib
DISTRIBUTION * Distrib
Definition: cluster.h:72