tessapi/5.x/a00644_source.html

/******************************************************************************

 ** Filename: cluster.cpp

 ** Purpose:  Routines for clustering points in N-D space

 ** Author:   Dan Johnson

 **

 ** (c) Copyright Hewlett-Packard Company, 1988.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *****************************************************************************/


#define _USE_MATH_DEFINES // for M_PI

#include <cfloat>       // for FLT_MAX

#include <cmath>        // for M_PI

#include <vector>       // for std::vector


#include "cluster.h"

#include "emalloc.h"

#include "genericheap.h"

#include <tesseract/helpers.h>

#include "kdpair.h"

#include "matrix.h"

#include "tprintf.h"


#define HOTELLING 1  // If true use Hotelling's test to decide where to split.

#define FTABLE_X 10  // Size of FTable.

#define FTABLE_Y 100  // Size of FTable.


// Table of values approximating the cumulative F-distribution for a confidence of 1%.

const double FTable[FTABLE_Y][FTABLE_X] = {

 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},

  {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},

  {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},

  {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},

  {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},

  {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},

  {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},

  {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},

  {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},

  {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},

  { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},

  { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},

  { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},

  { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},

  { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},

  { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},

  { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},

  { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},

  { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},

  { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},

  { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},

  { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},

  { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},

  { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},

  { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},

  { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},

  { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},

  { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},

  { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},

  { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},

  { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},

  { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},

  { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},

  { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},

  { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},

  { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},

  { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},

  { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},

  { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},

  { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},

  { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},

  { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},

  { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},

  { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},

  { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},

  { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},

  { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},

  { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},

  { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},

  { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},

  { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},

  { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},

  { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},

  { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},

  { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},

  { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},

  { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},

  { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},

  { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},

  { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},

  { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},

  { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},

  { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},

  { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},

  { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},

  { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},

  { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},

  { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},

  { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},

  { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},

  { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},

  { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},

  { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},

  { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},

  { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},

  { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},

  { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},

  { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},

  { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},

  { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},

  { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},

  { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},

  { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},

  { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},

  { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},

  { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},

  { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},

  { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},

  { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},

  { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},

  { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},

  { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},

  { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},

  { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},

  { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},

  { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},

  { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},

  { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},

  { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},

  { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}

};


#define MINVARIANCE     0.0004


#define MINSAMPLESPERBUCKET 5

#define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)

#define MINSAMPLESNEEDED  1


#define BUCKETTABLESIZE   1024

#define NORMALEXTENT    3.0


struct TEMPCLUSTER {

  CLUSTER *Cluster;

  CLUSTER *Neighbor;

};


using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER*>;

using ClusterHeap = tesseract::GenericHeap<ClusterPair>;


struct STATISTICS {

  float AvgVariance;

  float *CoVariance;

  float *Min;                  // largest negative distance from the mean

  float *Max;                  // largest positive distance from the mean

};


struct BUCKETS {

  DISTRIBUTION Distribution;        // distribution being tested for

  uint32_t SampleCount;             // # of samples in histogram

  double Confidence;                // confidence level of test

  double ChiSquared;                // test threshold

  uint16_t NumberOfBuckets;         // number of cells in histogram

  uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets

  uint32_t *Count;                  // frequency of occurrence histogram

  float *ExpectedCount;             // expected histogram

};


struct CHISTRUCT{

  uint16_t DegreesOfFreedom;

  double Alpha;

  double ChiSquared;

};


// For use with KDWalk / MakePotentialClusters

struct ClusteringContext {

  ClusterHeap *heap;  // heap used to hold temp clusters, "best" on top

  TEMPCLUSTER *candidates;  // array of potential clusters

  KDTREE *tree;  // kd-tree to be searched for neighbors

  int32_t next;  // next candidate to be used

};


using DENSITYFUNC = double (*)(int32_t);

using SOLVEFUNC = double (*)(CHISTRUCT*, double);


#define Odd(N) ((N)%2)

#define Mirror(N,R) ((R) - (N) - 1)

#define Abs(N) (((N) < 0) ? (-(N)) : (N))


//--------------Global Data Definitions and Declarations----------------------

#define SqrtOf2Pi     2.506628275

static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);

static const double kNormalVariance =

    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);

static const double kNormalMagnitude =

    (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);

static const double kNormalMean = BUCKETTABLESIZE / 2;


#define LOOKUPTABLESIZE   8

#define MAXDEGREESOFFREEDOM MAXBUCKETS


static const uint32_t kCountTable[LOOKUPTABLESIZE] = {

  MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000

};  // number of samples


static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {

  MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS

};  // number of buckets


/*-------------------------------------------------------------------------

          Private Function Prototypes

--------------------------------------------------------------------------*/

static void CreateClusterTree(CLUSTERER* Clusterer);


static void MakePotentialClusters(ClusteringContext* context, CLUSTER* Cluster,

                                  int32_t Level);


static CLUSTER* FindNearestNeighbor(KDTREE*Tree, CLUSTER* Cluster,

                                    float* Distance);


static CLUSTER* MakeNewCluster(CLUSTERER* Clusterer, TEMPCLUSTER* TempCluster);


static void ComputePrototypes(CLUSTERER* Clusterer, CLUSTERCONFIG* Config);


static PROTOTYPE* MakePrototype(CLUSTERER* Clusterer, CLUSTERCONFIG* Config,

                                CLUSTER* Cluster);


static PROTOTYPE* MakeDegenerateProto(uint16_t N,

                                      CLUSTER* Cluster, STATISTICS* Statistics,

                                      PROTOSTYLE Style, int32_t MinSamples);


static PROTOTYPE* TestEllipticalProto(CLUSTERER* Clusterer,

                                      CLUSTERCONFIG* Config, CLUSTER* Cluster,

                                      STATISTICS* Statistics);


static PROTOTYPE* MakeSphericalProto(CLUSTERER* Clusterer,

                                     CLUSTER* Cluster, STATISTICS* Statistics,

                                     BUCKETS* Buckets);


static PROTOTYPE* MakeEllipticalProto(CLUSTERER* Clusterer,

                                      CLUSTER* Cluster, STATISTICS* Statistics,

                                      BUCKETS* Buckets);


static PROTOTYPE* MakeMixedProto(CLUSTERER* Clusterer,

                                 CLUSTER* Cluster, STATISTICS* Statistics,

                                 BUCKETS* NormalBuckets, double Confidence);


static void MakeDimRandom(uint16_t i, PROTOTYPE* Proto, PARAM_DESC* ParamDesc);


static void MakeDimUniform(uint16_t i, PROTOTYPE* Proto, STATISTICS* Statistics);


static STATISTICS* ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[],

                                     CLUSTER* Cluster);


static PROTOTYPE* NewSphericalProto(uint16_t N, CLUSTER* Cluster,

                                    STATISTICS* Statistics);


static PROTOTYPE* NewEllipticalProto(int16_t N, CLUSTER* Cluster,

                                     STATISTICS* Statistics);


static PROTOTYPE* NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);


static PROTOTYPE* NewSimpleProto(int16_t N, CLUSTER *Cluster);


static bool Independent(PARAM_DESC* ParamDesc,

                 int16_t N, float* CoVariance, float Independence);


static BUCKETS *GetBuckets(CLUSTERER* clusterer,

                    DISTRIBUTION Distribution,

                    uint32_t SampleCount,

                    double Confidence);


static BUCKETS *MakeBuckets(DISTRIBUTION Distribution,

                     uint32_t SampleCount,

                     double Confidence);


static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);


static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);


static double NormalDensity(int32_t x);


static double UniformDensity(int32_t x);


static double Integral(double f1, double f2, double Dx);


static void FillBuckets(BUCKETS *Buckets,

                 CLUSTER *Cluster,

                 uint16_t Dim,

                 PARAM_DESC *ParamDesc,

                 float Mean,

                 float StdDev);


static uint16_t NormalBucket(PARAM_DESC *ParamDesc,

                    float x,

                    float Mean,

                    float StdDev);


static uint16_t UniformBucket(PARAM_DESC *ParamDesc,

                     float x,

                     float Mean,

                     float StdDev);


static bool DistributionOK(BUCKETS* Buckets);


static void FreeStatistics(STATISTICS *Statistics);


static void FreeBuckets(BUCKETS *Buckets);


static void FreeCluster(CLUSTER *Cluster);


static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);


static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);


static void InitBuckets(BUCKETS *Buckets);


static int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,

               void *arg2);  // CHISTRUCT *SearchKey);


static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom, double Alpha);


static double Solve(SOLVEFUNC Function,

             void *FunctionParams,

             double InitialGuess,

             double Accuracy);


static double ChiArea(CHISTRUCT *ChiParams, double x);


static bool MultipleCharSamples(CLUSTERER* Clusterer,

                         CLUSTER* Cluster,

                         float MaxIllegal);


static double InvertMatrix(const float* input, int size, float* inv);


//--------------------------Public Code--------------------------------------

CLUSTERER *

MakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[]) {

  CLUSTERER *Clusterer;

  int i;


  // allocate main clusterer data structure and init simple fields

  Clusterer = static_cast<CLUSTERER *>(Emalloc (sizeof (CLUSTERER)));

  Clusterer->SampleSize = SampleSize;

  Clusterer->NumberOfSamples = 0;

  Clusterer->NumChar = 0;


  // init fields which will not be used initially

  Clusterer->Root = nullptr;

  Clusterer->ProtoList = NIL_LIST;


  // maintain a copy of param descriptors in the clusterer data structure

  Clusterer->ParamDesc =

    static_cast<PARAM_DESC *>(Emalloc (SampleSize * sizeof (PARAM_DESC)));

  for (i = 0; i < SampleSize; i++) {

    Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;

    Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;

    Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;

    Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;

    Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;

    Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;

    Clusterer->ParamDesc[i].MidRange =

      (ParamDesc[i].Max + ParamDesc[i].Min) / 2;

  }


  // allocate a kd tree to hold the samples

  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);


  // Initialize cache of histogram buckets to minimize recomputing them.

  for (auto & d : Clusterer->bucket_cache) {

    for (auto & c : d)

      c = nullptr;

  }


  return Clusterer;

}                                // MakeClusterer


SAMPLE* MakeSample(CLUSTERER * Clusterer, const float* Feature,

                   int32_t CharID) {

  SAMPLE *Sample;

  int i;


  // see if the samples have already been clustered - if so trap an error

  // Can't add samples after they have been clustered.

  ASSERT_HOST(Clusterer->Root == nullptr);


  // allocate the new sample and initialize it

  Sample = static_cast<SAMPLE *>(Emalloc (sizeof (SAMPLE) +

    (Clusterer->SampleSize -

    1) * sizeof (float)));

  Sample->Clustered = false;

  Sample->Prototype = false;

  Sample->SampleCount = 1;

  Sample->Left = nullptr;

  Sample->Right = nullptr;

  Sample->CharID = CharID;


  for (i = 0; i < Clusterer->SampleSize; i++)

    Sample->Mean[i] = Feature[i];


  // add the sample to the KD tree - keep track of the total # of samples

  Clusterer->NumberOfSamples++;

  KDStore(Clusterer->KDTree, Sample->Mean, Sample);

  if (CharID >= Clusterer->NumChar)

    Clusterer->NumChar = CharID + 1;


  // execute hook for monitoring clustering operation

  // (*SampleCreationHook)(Sample);


  return (Sample);

}                                // MakeSample


LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {

  //only create cluster tree if samples have never been clustered before

  if (Clusterer->Root == nullptr)

    CreateClusterTree(Clusterer);


  //deallocate the old prototype list if one exists

  FreeProtoList (&Clusterer->ProtoList);

  Clusterer->ProtoList = NIL_LIST;


  //compute prototypes starting at the root node in the tree

  ComputePrototypes(Clusterer, Config);

  // We don't need the cluster pointers in the protos any more, so null them

  // out, which makes it safe to delete the clusterer.

  LIST proto_list = Clusterer->ProtoList;

  iterate(proto_list) {

    auto *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));

    proto->Cluster = nullptr;

  }

  return Clusterer->ProtoList;

}                                // ClusterSamples


void FreeClusterer(CLUSTERER *Clusterer) {

  if (Clusterer != nullptr) {

    free(Clusterer->ParamDesc);

    if (Clusterer->KDTree != nullptr)

      FreeKDTree (Clusterer->KDTree);

    if (Clusterer->Root != nullptr)

      FreeCluster (Clusterer->Root);

    // Free up all used buckets structures.

    for (auto & d : Clusterer->bucket_cache) {

      for (auto & c : d)

        if (c != nullptr)

          FreeBuckets(c);

    }


    free(Clusterer);

  }

}                                // FreeClusterer


void FreeProtoList(LIST *ProtoList) {

  destroy_nodes(*ProtoList, FreePrototype);

}                                // FreeProtoList


void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)

  auto *Prototype = static_cast<PROTOTYPE *>(arg);


  // unmark the corresponding cluster (if there is one

  if (Prototype->Cluster != nullptr)

    Prototype->Cluster->Prototype = false;


  // deallocate the prototype statistics and then the prototype itself

  free(Prototype->Distrib);

  free(Prototype->Mean);

  if (Prototype->Style != spherical) {

    free(Prototype->Variance.Elliptical);

    free(Prototype->Magnitude.Elliptical);

    free(Prototype->Weight.Elliptical);

  }

  free(Prototype);

}                                // FreePrototype


CLUSTER *NextSample(LIST *SearchState) {

  CLUSTER *Cluster;


  if (*SearchState == NIL_LIST)

    return (nullptr);

  Cluster = reinterpret_cast<CLUSTER *>first_node (*SearchState);

  *SearchState = pop (*SearchState);

  for (;;) {

    if (Cluster->Left == nullptr)

      return (Cluster);

    *SearchState = push (*SearchState, Cluster->Right);

    Cluster = Cluster->Left;

  }

}                                // NextSample


float Mean(PROTOTYPE *Proto, uint16_t Dimension) {

  return (Proto->Mean[Dimension]);

}                                // Mean


float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {

  switch (Proto->Style) {

    case spherical:

      return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Spherical))));

    case elliptical:

      return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));

    case mixed:

      switch (Proto->Distrib[Dimension]) {

        case normal:

          return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));

        case uniform:

        case D_random:

          return (Proto->Variance.Elliptical[Dimension]);

        case DISTRIBUTION_COUNT:

          ASSERT_HOST(!"Distribution count not allowed!");

      }

  }

  return 0.0f;

}                                // StandardDeviation


/*---------------------------------------------------------------------------

            Private Code

----------------------------------------------------------------------------*/

static void CreateClusterTree(CLUSTERER *Clusterer) {

  ClusteringContext context;

  ClusterPair HeapEntry;

  TEMPCLUSTER *PotentialCluster;


  // each sample and its nearest neighbor form a "potential" cluster

  // save these in a heap with the "best" potential clusters on top

  context.tree = Clusterer->KDTree;

  context.candidates = static_cast<TEMPCLUSTER *>(Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER)));

  context.next = 0;

  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);

  KDWalk(context.tree, reinterpret_cast<void_proc>(MakePotentialClusters), &context);


  // form potential clusters into actual clusters - always do "best" first

  while (context.heap->Pop(&HeapEntry)) {

    PotentialCluster = HeapEntry.data;


    // if main cluster of potential cluster is already in another cluster

    // then we don't need to worry about it

    if (PotentialCluster->Cluster->Clustered) {

      continue;

    }


    // if main cluster is not yet clustered, but its nearest neighbor is

    // then we must find a new nearest neighbor

    else if (PotentialCluster->Neighbor->Clustered) {

      PotentialCluster->Neighbor =

        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,

                            &HeapEntry.key);

      if (PotentialCluster->Neighbor != nullptr) {

        context.heap->Push(&HeapEntry);

      }

    }


    // if neither cluster is already clustered, form permanent cluster

    else {

      PotentialCluster->Cluster =

          MakeNewCluster(Clusterer, PotentialCluster);

      PotentialCluster->Neighbor =

          FindNearestNeighbor(context.tree, PotentialCluster->Cluster,

                              &HeapEntry.key);

      if (PotentialCluster->Neighbor != nullptr) {

        context.heap->Push(&HeapEntry);

      }

    }

  }


  // the root node in the cluster tree is now the only node in the kd-tree

  Clusterer->Root = static_cast<CLUSTER *>RootOf(Clusterer->KDTree);


  // free up the memory used by the K-D tree, heap, and temp clusters

  FreeKDTree(context.tree);

  Clusterer->KDTree = nullptr;

  delete context.heap;

  free(context.candidates);

}                                // CreateClusterTree


static void MakePotentialClusters(ClusteringContext* context,

                                  CLUSTER* Cluster, int32_t /*Level*/) {

  ClusterPair HeapEntry;

  int next = context->next;

  context->candidates[next].Cluster = Cluster;

  HeapEntry.data = &(context->candidates[next]);

  context->candidates[next].Neighbor =

      FindNearestNeighbor(context->tree,

                          context->candidates[next].Cluster,

                          &HeapEntry.key);

  if (context->candidates[next].Neighbor != nullptr) {

    context->heap->Push(&HeapEntry);

    context->next++;

  }

}                                // MakePotentialClusters


static CLUSTER*

FindNearestNeighbor(KDTREE* Tree, CLUSTER* Cluster, float* Distance)

#define MAXNEIGHBORS  2

#define MAXDISTANCE   FLT_MAX

{

  CLUSTER *Neighbor[MAXNEIGHBORS];

  float Dist[MAXNEIGHBORS];

  int NumberOfNeighbors;

  int32_t i;

  CLUSTER *BestNeighbor;


  // find the 2 nearest neighbors of the cluster

  KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,

                          &NumberOfNeighbors, reinterpret_cast<void **>(Neighbor), Dist);


  // search for the nearest neighbor that is not the cluster itself

  *Distance = MAXDISTANCE;

  BestNeighbor = nullptr;

  for (i = 0; i < NumberOfNeighbors; i++) {

    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {

      *Distance = Dist[i];

      BestNeighbor = Neighbor[i];

    }

  }

  return BestNeighbor;

}                                // FindNearestNeighbor


static CLUSTER* MakeNewCluster(CLUSTERER* Clusterer,

                               TEMPCLUSTER* TempCluster) {

  CLUSTER *Cluster;


  // allocate the new cluster and initialize it

  Cluster = static_cast<CLUSTER *>(Emalloc(

      sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(float)));

  Cluster->Clustered = false;

  Cluster->Prototype = false;

  Cluster->Left = TempCluster->Cluster;

  Cluster->Right = TempCluster->Neighbor;

  Cluster->CharID = -1;


  // mark the old clusters as "clustered" and delete them from the kd-tree

  Cluster->Left->Clustered = true;

  Cluster->Right->Clustered = true;

  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);

  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);


  // compute the mean and sample count for the new cluster

  Cluster->SampleCount =

      MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,

                    Cluster->Left->SampleCount, Cluster->Right->SampleCount,

                    Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);


  // add the new cluster to the KD tree

  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);

  return Cluster;

}                                // MakeNewCluster


int32_t MergeClusters(int16_t N,

                    PARAM_DESC ParamDesc[],

                    int32_t n1,

                    int32_t n2,

                    float m[],

                    float m1[], float m2[]) {

  int32_t i, n;


  n = n1 + n2;

  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {

    if (ParamDesc->Circular) {

      // if distance between means is greater than allowed

      // reduce upper point by one "rotation" to compute mean

      // then normalize the mean back into the accepted range

      if ((*m2 - *m1) > ParamDesc->HalfRange) {

        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;

        if (*m < ParamDesc->Min)

          *m += ParamDesc->Range;

      }

      else if ((*m1 - *m2) > ParamDesc->HalfRange) {

        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;

        if (*m < ParamDesc->Min)

          *m += ParamDesc->Range;

      }

      else

        *m = (n1 * *m1 + n2 * *m2) / n;

    }

    else

      *m = (n1 * *m1 + n2 * *m2) / n;

  }

  return n;

}                                // MergeClusters


static void ComputePrototypes(CLUSTERER* Clusterer, CLUSTERCONFIG* Config) {

  LIST ClusterStack = NIL_LIST;

  CLUSTER *Cluster;

  PROTOTYPE *Prototype;


  // use a stack to keep track of clusters waiting to be processed

  // initially the only cluster on the stack is the root cluster

  if (Clusterer->Root != nullptr)

    ClusterStack = push (NIL_LIST, Clusterer->Root);


  // loop until we have analyzed all clusters which are potential prototypes

  while (ClusterStack != NIL_LIST) {

    // remove the next cluster to be analyzed from the stack

    // try to make a prototype from the cluster

    // if successful, put it on the proto list, else split the cluster

    Cluster = reinterpret_cast<CLUSTER *>first_node (ClusterStack);

    ClusterStack = pop (ClusterStack);

    Prototype = MakePrototype(Clusterer, Config, Cluster);

    if (Prototype != nullptr) {

      Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);

    }

    else {

      ClusterStack = push (ClusterStack, Cluster->Right);

      ClusterStack = push (ClusterStack, Cluster->Left);

    }

  }

}                                // ComputePrototypes


static PROTOTYPE* MakePrototype(CLUSTERER* Clusterer, CLUSTERCONFIG* Config,

                                CLUSTER* Cluster) {

  STATISTICS *Statistics;

  PROTOTYPE *Proto;

  BUCKETS *Buckets;


  // filter out clusters which contain samples from the same character

  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))

    return nullptr;


  // compute the covariance matrix and ranges for the cluster

  Statistics =

      ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);


  // check for degenerate clusters which need not be analyzed further

  // note that the MinSamples test assumes that all clusters with multiple

  // character samples have been removed (as above)

  Proto = MakeDegenerateProto(

      Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,

      static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));

  if (Proto != nullptr) {

    FreeStatistics(Statistics);

    return Proto;

  }

  // check to ensure that all dimensions are independent

  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,

                   Statistics->CoVariance, Config->Independence)) {

    FreeStatistics(Statistics);

    return nullptr;

  }


  if (HOTELLING && Config->ProtoStyle == elliptical) {

    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);

    if (Proto != nullptr) {

      FreeStatistics(Statistics);

      return Proto;

    }

  }


  // create a histogram data structure used to evaluate distributions

  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,

                       Config->Confidence);


  // create a prototype based on the statistics and test it

  switch (Config->ProtoStyle) {

    case spherical:

      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);

      break;

    case elliptical:

      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);

      break;

    case mixed:

      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,

                             Config->Confidence);

      break;

    case automatic:

      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);

      if (Proto != nullptr)

        break;

      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);

      if (Proto != nullptr)

        break;

      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,

                             Config->Confidence);

      break;

  }

  FreeStatistics(Statistics);

  return Proto;

}                                // MakePrototype


static PROTOTYPE* MakeDegenerateProto(  //this was MinSample

                               uint16_t N,

                               CLUSTER *Cluster,

                               STATISTICS *Statistics,

                               PROTOSTYLE Style,

                               int32_t MinSamples) {

  PROTOTYPE *Proto = nullptr;


  if (MinSamples < MINSAMPLESNEEDED)

    MinSamples = MINSAMPLESNEEDED;


  if (Cluster->SampleCount < MinSamples) {

    switch (Style) {

      case spherical:

        Proto = NewSphericalProto (N, Cluster, Statistics);

        break;

      case elliptical:

      case automatic:

        Proto = NewEllipticalProto (N, Cluster, Statistics);

        break;

      case mixed:

        Proto = NewMixedProto (N, Cluster, Statistics);

        break;

    }

    Proto->Significant = false;

  }

  return (Proto);

}                                // MakeDegenerateProto


static PROTOTYPE* TestEllipticalProto(CLUSTERER* Clusterer,

                                      CLUSTERCONFIG *Config, CLUSTER* Cluster,

                                      STATISTICS* Statistics) {

  // Fraction of the number of samples used as a range around 1 within

  // which a cluster has the magic size that allows a boost to the

  // FTable by kFTableBoostMargin, thus allowing clusters near the

  // magic size (equal to the number of sample characters) to be more

  // likely to stay together.

  const double kMagicSampleMargin = 0.0625;

  const double kFTableBoostMargin = 2.0;


  int N = Clusterer->SampleSize;

  CLUSTER* Left = Cluster->Left;

  CLUSTER* Right = Cluster->Right;

  if (Left == nullptr || Right == nullptr)

    return nullptr;

  int TotalDims = Left->SampleCount + Right->SampleCount;

  if (TotalDims < N + 1 || TotalDims < 2)

    return nullptr;

  std::vector<float> Covariance(static_cast<size_t>(N) * N);

  std::vector<float> Inverse(static_cast<size_t>(N) * N);

  std::vector<float> Delta(N);

  // Compute a new covariance matrix that only uses essential features.

  for (int i = 0; i < N; ++i) {

    int row_offset = i * N;

    if (!Clusterer->ParamDesc[i].NonEssential) {

      for (int j = 0; j < N; ++j) {

        if (!Clusterer->ParamDesc[j].NonEssential)

          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];

        else

          Covariance[j + row_offset] = 0.0f;

      }

    } else {

      for (int j = 0; j < N; ++j) {

        if (i == j)

          Covariance[j + row_offset] = 1.0f;

        else

          Covariance[j + row_offset] = 0.0f;

      }

    }

  }

  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);

  if (err > 1) {

    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);

  }

  int EssentialN = 0;

  for (int dim = 0; dim < N; ++dim) {

    if (!Clusterer->ParamDesc[dim].NonEssential) {

      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];

      ++EssentialN;

    } else {

      Delta[dim] = 0.0f;

    }

  }

  // Compute Hotelling's T-squared.

  double Tsq = 0.0;

  for (int x = 0; x < N; ++x) {

    double temp = 0.0;

    for (int y = 0; y < N; ++y) {

      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];

    }

    Tsq += Delta[x] * temp;

  }

  // Changed this function to match the formula in

  // Statistical Methods in Medical Research p 473

  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.

  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;

  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);

  int Fx = EssentialN;

  if (Fx > FTABLE_X)

    Fx = FTABLE_X;

  --Fx;

  int Fy = TotalDims - EssentialN - 1;

  if (Fy > FTABLE_Y)

    Fy = FTABLE_Y;

  --Fy;

  double FTarget = FTable[Fy][Fx];

  if (Config->MagicSamples > 0 &&

      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&

      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {

    // Give magic-sized clusters a magic FTable boost.

    FTarget += kFTableBoostMargin;

  }

  if (F < FTarget) {

    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);

  }

  return nullptr;

}


static PROTOTYPE* MakeSphericalProto(CLUSTERER* Clusterer,

                                     CLUSTER* Cluster, STATISTICS* Statistics,

                                     BUCKETS* Buckets) {

  PROTOTYPE *Proto = nullptr;

  int i;


  // check that each dimension is a normal distribution

  for (i = 0; i < Clusterer->SampleSize; i++) {

    if (Clusterer->ParamDesc[i].NonEssential)

      continue;


    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),

      Cluster->Mean[i],

      sqrt (static_cast<double>(Statistics->AvgVariance)));

    if (!DistributionOK (Buckets))

      break;

  }

  // if all dimensions matched a normal distribution, make a proto

  if (i >= Clusterer->SampleSize)

    Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);

  return (Proto);

}                                // MakeSphericalProto


static PROTOTYPE* MakeEllipticalProto(CLUSTERER* Clusterer,

                                      CLUSTER* Cluster, STATISTICS* Statistics,

                                      BUCKETS* Buckets) {

  PROTOTYPE *Proto = nullptr;

  int i;


  // check that each dimension is a normal distribution

  for (i = 0; i < Clusterer->SampleSize; i++) {

    if (Clusterer->ParamDesc[i].NonEssential)

      continue;


    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),

      Cluster->Mean[i],

      sqrt (static_cast<double>(Statistics->

      CoVariance[i * (Clusterer->SampleSize + 1)])));

    if (!DistributionOK (Buckets))

      break;

  }

  // if all dimensions matched a normal distribution, make a proto

  if (i >= Clusterer->SampleSize)

    Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);

  return (Proto);

}                                // MakeEllipticalProto


static PROTOTYPE* MakeMixedProto(CLUSTERER* Clusterer,

                                 CLUSTER* Cluster, STATISTICS* Statistics,

                                 BUCKETS* NormalBuckets, double Confidence) {

  PROTOTYPE *Proto;

  int i;

  BUCKETS *UniformBuckets = nullptr;

  BUCKETS *RandomBuckets = nullptr;


  // create a mixed proto to work on - initially assume all dimensions normal*/

  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);


  // find the proper distribution for each dimension

  for (i = 0; i < Clusterer->SampleSize; i++) {

    if (Clusterer->ParamDesc[i].NonEssential)

      continue;


    FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),

      Proto->Mean[i],

      sqrt (static_cast<double>(Proto->Variance.Elliptical[i])));

    if (DistributionOK (NormalBuckets))

      continue;


    if (RandomBuckets == nullptr)

      RandomBuckets =

        GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);

    MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));

    FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),

      Proto->Mean[i], Proto->Variance.Elliptical[i]);

    if (DistributionOK (RandomBuckets))

      continue;


    if (UniformBuckets == nullptr)

      UniformBuckets =

        GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);

    MakeDimUniform(i, Proto, Statistics);

    FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),

      Proto->Mean[i], Proto->Variance.Elliptical[i]);

    if (DistributionOK (UniformBuckets))

      continue;

    break;

  }

  // if any dimension failed to match a distribution, discard the proto

  if (i < Clusterer->SampleSize) {

    FreePrototype(Proto);

    Proto = nullptr;

  }

  return (Proto);

}                                // MakeMixedProto


static void MakeDimRandom(uint16_t i, PROTOTYPE* Proto, PARAM_DESC* ParamDesc) {

  Proto->Distrib[i] = D_random;

  Proto->Mean[i] = ParamDesc->MidRange;

  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;


  // subtract out the previous magnitude of this dimension from the total

  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];

  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;

  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];

  Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));


  // note that the proto Weight is irrelevant for D_random protos

}                                // MakeDimRandom


static void MakeDimUniform(uint16_t i, PROTOTYPE* Proto, STATISTICS* Statistics) {

  Proto->Distrib[i] = uniform;

  Proto->Mean[i] = Proto->Cluster->Mean[i] +

    (Statistics->Min[i] + Statistics->Max[i]) / 2;

  Proto->Variance.Elliptical[i] =

    (Statistics->Max[i] - Statistics->Min[i]) / 2;

  if (Proto->Variance.Elliptical[i] < MINVARIANCE)

    Proto->Variance.Elliptical[i] = MINVARIANCE;


  // subtract out the previous magnitude of this dimension from the total

  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];

  Proto->Magnitude.Elliptical[i] =

    1.0 / (2.0 * Proto->Variance.Elliptical[i]);

  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];

  Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));


  // note that the proto Weight is irrelevant for uniform protos

}                                // MakeDimUniform


static STATISTICS*

ComputeStatistics (int16_t N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {

  STATISTICS *Statistics;

  int i, j;

  float *CoVariance;

  float *Distance;

  LIST SearchState;

  SAMPLE *Sample;

  uint32_t SampleCountAdjustedForBias;


  // allocate memory to hold the statistics results

  Statistics = static_cast<STATISTICS *>(Emalloc (sizeof (STATISTICS)));

  Statistics->CoVariance = static_cast<float *>(Emalloc(sizeof(float) * N * N));

  Statistics->Min = static_cast<float *>(Emalloc (N * sizeof (float)));

  Statistics->Max = static_cast<float *>(Emalloc (N * sizeof (float)));


  // allocate temporary memory to hold the sample to mean distances

  Distance = static_cast<float *>(Emalloc (N * sizeof (float)));


  // initialize the statistics

  Statistics->AvgVariance = 1.0;

  CoVariance = Statistics->CoVariance;

  for (i = 0; i < N; i++) {

    Statistics->Min[i] = 0.0;

    Statistics->Max[i] = 0.0;

    for (j = 0; j < N; j++, CoVariance++)

      *CoVariance = 0;

  }

  // find each sample in the cluster and merge it into the statistics

  InitSampleSearch(SearchState, Cluster);

  while ((Sample = NextSample (&SearchState)) != nullptr) {

    for (i = 0; i < N; i++) {

      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];

      if (ParamDesc[i].Circular) {

        if (Distance[i] > ParamDesc[i].HalfRange)

          Distance[i] -= ParamDesc[i].Range;

        if (Distance[i] < -ParamDesc[i].HalfRange)

          Distance[i] += ParamDesc[i].Range;

      }

      if (Distance[i] < Statistics->Min[i])

        Statistics->Min[i] = Distance[i];

      if (Distance[i] > Statistics->Max[i])

        Statistics->Max[i] = Distance[i];

    }

    CoVariance = Statistics->CoVariance;

    for (i = 0; i < N; i++)

      for (j = 0; j < N; j++, CoVariance++)

        *CoVariance += Distance[i] * Distance[j];

  }

  // normalize the variances by the total number of samples

  // use SampleCount-1 instead of SampleCount to get an unbiased estimate

  // also compute the geometic mean of the diagonal variances

  // ensure that clusters with only 1 sample are handled correctly

  if (Cluster->SampleCount > 1)

    SampleCountAdjustedForBias = Cluster->SampleCount - 1;

  else

    SampleCountAdjustedForBias = 1;

  CoVariance = Statistics->CoVariance;

  for (i = 0; i < N; i++)

  for (j = 0; j < N; j++, CoVariance++) {

    *CoVariance /= SampleCountAdjustedForBias;

    if (j == i) {

      if (*CoVariance < MINVARIANCE)

        *CoVariance = MINVARIANCE;

      Statistics->AvgVariance *= *CoVariance;

    }

  }

  Statistics->AvgVariance = static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance),

                                       1.0 / N));


  // release temporary memory and return

  free(Distance);

  return (Statistics);

}                                // ComputeStatistics


static PROTOTYPE* NewSphericalProto(uint16_t N, CLUSTER* Cluster,

                                    STATISTICS* Statistics) {

  PROTOTYPE *Proto;


  Proto = NewSimpleProto (N, Cluster);


  Proto->Variance.Spherical = Statistics->AvgVariance;

  if (Proto->Variance.Spherical < MINVARIANCE)

    Proto->Variance.Spherical = MINVARIANCE;


  Proto->Magnitude.Spherical =

    1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);

  Proto->TotalMagnitude = static_cast<float>(pow(static_cast<double>(Proto->Magnitude.Spherical),

                                     static_cast<double>(N)));

  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;

  Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));


  return (Proto);

}                                // NewSphericalProto


static PROTOTYPE* NewEllipticalProto(int16_t N, CLUSTER* Cluster,

                                     STATISTICS* Statistics) {

  PROTOTYPE *Proto;

  float *CoVariance;

  int i;


  Proto = NewSimpleProto (N, Cluster);

  Proto->Variance.Elliptical = static_cast<float *>(Emalloc (N * sizeof (float)));

  Proto->Magnitude.Elliptical = static_cast<float *>(Emalloc (N * sizeof (float)));

  Proto->Weight.Elliptical = static_cast<float *>(Emalloc (N * sizeof (float)));


  CoVariance = Statistics->CoVariance;

  Proto->TotalMagnitude = 1.0;

  for (i = 0; i < N; i++, CoVariance += N + 1) {

    Proto->Variance.Elliptical[i] = *CoVariance;

    if (Proto->Variance.Elliptical[i] < MINVARIANCE)

      Proto->Variance.Elliptical[i] = MINVARIANCE;


    Proto->Magnitude.Elliptical[i] =

      1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);

    Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];

    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];

  }

  Proto->LogMagnitude = log (static_cast<double>(Proto->TotalMagnitude));

  Proto->Style = elliptical;

  return (Proto);

}                                // NewEllipticalProto


static PROTOTYPE* NewMixedProto(int16_t N, CLUSTER* Cluster,

                                STATISTICS* Statistics) {

  PROTOTYPE *Proto;

  int i;


  Proto = NewEllipticalProto (N, Cluster, Statistics);

  Proto->Distrib = static_cast<DISTRIBUTION *>(Emalloc (N * sizeof (DISTRIBUTION)));


  for (i = 0; i < N; i++) {

    Proto->Distrib[i] = normal;

  }

  Proto->Style = mixed;

  return (Proto);

}                                // NewMixedProto


static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {

  PROTOTYPE *Proto;

  int i;


  Proto = static_cast<PROTOTYPE *>(Emalloc (sizeof (PROTOTYPE)));

  Proto->Mean = static_cast<float *>(Emalloc (N * sizeof (float)));


  for (i = 0; i < N; i++)

    Proto->Mean[i] = Cluster->Mean[i];

  Proto->Distrib = nullptr;


  Proto->Significant = true;

  Proto->Merged = false;

  Proto->Style = spherical;

  Proto->NumSamples = Cluster->SampleCount;

  Proto->Cluster = Cluster;

  Proto->Cluster->Prototype = true;

  return (Proto);

}                                // NewSimpleProto


static bool

Independent(PARAM_DESC* ParamDesc,

            int16_t N, float* CoVariance, float Independence) {

  int i, j;

  float *VARii;                // points to ith on-diagonal element

  float *VARjj;                // points to jth on-diagonal element

  float CorrelationCoeff;


  VARii = CoVariance;

  for (i = 0; i < N; i++, VARii += N + 1) {

    if (ParamDesc[i].NonEssential)

      continue;


    VARjj = VARii + N + 1;

    CoVariance = VARii + 1;

    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {

      if (ParamDesc[j].NonEssential)

        continue;


      if ((*VARii == 0.0) || (*VARjj == 0.0))

        CorrelationCoeff = 0.0;

      else

        CorrelationCoeff =

          sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));

      if (CorrelationCoeff > Independence)

        return false;

    }

  }

  return true;

}                                // Independent


static BUCKETS *GetBuckets(CLUSTERER* clusterer,

                    DISTRIBUTION Distribution,

                    uint32_t SampleCount,

                    double Confidence) {

  // Get an old bucket structure with the same number of buckets.

  uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);

  BUCKETS *Buckets =

      clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];


  // If a matching bucket structure is not found, make one and save it.

  if (Buckets == nullptr) {

    Buckets = MakeBuckets(Distribution, SampleCount, Confidence);

    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =

        Buckets;

  } else {

    // Just adjust the existing buckets.

    if (SampleCount != Buckets->SampleCount)

      AdjustBuckets(Buckets, SampleCount);

    if (Confidence != Buckets->Confidence) {

      Buckets->Confidence = Confidence;

      Buckets->ChiSquared = ComputeChiSquared(

          DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),

          Confidence);

    }

    InitBuckets(Buckets);

  }

  return Buckets;

}                                // GetBuckets


static BUCKETS *MakeBuckets(DISTRIBUTION Distribution,

                     uint32_t SampleCount,

                     double Confidence) {

  const DENSITYFUNC DensityFunction[] =

    { NormalDensity, UniformDensity, UniformDensity };

  int i, j;

  BUCKETS *Buckets;

  double BucketProbability;

  double NextBucketBoundary;

  double Probability;

  double ProbabilityDelta;

  double LastProbDensity;

  double ProbDensity;

  uint16_t CurrentBucket;

  bool Symmetrical;


  // allocate memory needed for data structure

  Buckets = static_cast<BUCKETS *>(Emalloc(sizeof(BUCKETS)));

  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);

  Buckets->SampleCount = SampleCount;

  Buckets->Confidence = Confidence;

  Buckets->Count =

      static_cast<uint32_t *>(Emalloc(Buckets->NumberOfBuckets * sizeof(uint32_t)));

  Buckets->ExpectedCount = static_cast<float *>(

      Emalloc(Buckets->NumberOfBuckets * sizeof(float)));


  // initialize simple fields

  Buckets->Distribution = Distribution;

  for (i = 0; i < Buckets->NumberOfBuckets; i++) {

    Buckets->Count[i] = 0;

    Buckets->ExpectedCount[i] = 0.0;

  }


  // all currently defined distributions are symmetrical

  Symmetrical = true;

  Buckets->ChiSquared = ComputeChiSquared(

      DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);


  if (Symmetrical) {

    // allocate buckets so that all have approx. equal probability

    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);


    // distribution is symmetric so fill in upper half then copy

    CurrentBucket = Buckets->NumberOfBuckets / 2;

    if (Odd (Buckets->NumberOfBuckets))

      NextBucketBoundary = BucketProbability / 2;

    else

      NextBucketBoundary = BucketProbability;


    Probability = 0.0;

    LastProbDensity =

      (*DensityFunction[static_cast<int>(Distribution)]) (BUCKETTABLESIZE / 2);

    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {

      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)]) (i + 1);

      ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);

      Probability += ProbabilityDelta;

      if (Probability > NextBucketBoundary) {

        if (CurrentBucket < Buckets->NumberOfBuckets - 1)

          CurrentBucket++;

        NextBucketBoundary += BucketProbability;

      }

      Buckets->Bucket[i] = CurrentBucket;

      Buckets->ExpectedCount[CurrentBucket] +=

        static_cast<float>(ProbabilityDelta * SampleCount);

      LastProbDensity = ProbDensity;

    }

    // place any leftover probability into the last bucket

    Buckets->ExpectedCount[CurrentBucket] +=

      static_cast<float>((0.5 - Probability) * SampleCount);


    // copy upper half of distribution to lower half

    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)

      Buckets->Bucket[i] =

        Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);


    // copy upper half of expected counts to lower half

    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)

      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];

  }

  return Buckets;

}                                // MakeBuckets


static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {

  uint8_t Last, Next;

  float Slope;


  if (SampleCount < kCountTable[0])

    return kBucketsTable[0];


  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {

    if (SampleCount <= kCountTable[Next]) {

      Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /

          static_cast<float>(kCountTable[Next] - kCountTable[Last]);

      return (static_cast<uint16_t>(kBucketsTable[Last] +

          Slope * (SampleCount - kCountTable[Last])));

    }

  }

  return kBucketsTable[Last];

}                                // OptimumNumberOfBuckets


static double

ComputeChiSquared (uint16_t DegreesOfFreedom, double Alpha)

#define CHIACCURACY     0.01

#define MINALPHA  (1e-200)

{

  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];


  CHISTRUCT *OldChiSquared;

  CHISTRUCT SearchKey;


  // limit the minimum alpha that can be used - if alpha is too small

  //      it may not be possible to compute chi-squared.

  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);

  if (Odd (DegreesOfFreedom))

    DegreesOfFreedom++;


  /* find the list of chi-squared values which have already been computed

     for the specified number of degrees of freedom.  Search the list for

     the desired chi-squared. */

  SearchKey.Alpha = Alpha;

  OldChiSquared = reinterpret_cast<CHISTRUCT *>first_node (search (ChiWith[DegreesOfFreedom],

    &SearchKey, AlphaMatch));


  if (OldChiSquared == nullptr) {

    OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);

    OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,

      static_cast<double>(DegreesOfFreedom),

      CHIACCURACY);

    ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],

      OldChiSquared);

  }

  else {

    // further optimization might move OldChiSquared to front of list

  }


  return (OldChiSquared->ChiSquared);


}                                // ComputeChiSquared


static double NormalDensity(int32_t x) {

  double Distance;


  Distance = x - kNormalMean;

  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);

}                                // NormalDensity


static double UniformDensity(int32_t x) {

  constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;


  if ((x >= 0) && (x <= BUCKETTABLESIZE)) {

    return UniformDistributionDensity;

  } else {

    return 0.0;

  }

}                                // UniformDensity


static double Integral(double f1, double f2, double Dx) {

  return (f1 + f2) * Dx / 2.0;

}                                // Integral


static void FillBuckets(BUCKETS *Buckets,

                 CLUSTER *Cluster,

                 uint16_t Dim,

                 PARAM_DESC *ParamDesc,

                 float Mean,

                 float StdDev) {

  uint16_t BucketID;

  int i;

  LIST SearchState;

  SAMPLE *Sample;


  // initialize the histogram bucket counts to 0

  for (i = 0; i < Buckets->NumberOfBuckets; i++)

    Buckets->Count[i] = 0;


  if (StdDev == 0.0) {

    /* if the standard deviation is zero, then we can't statistically

       analyze the cluster.  Use a pseudo-analysis: samples exactly on

       the mean are distributed evenly across all buckets.  Samples greater

       than the mean are placed in the last bucket; samples less than the

       mean are placed in the first bucket. */


    InitSampleSearch(SearchState, Cluster);

    i = 0;

    while ((Sample = NextSample (&SearchState)) != nullptr) {

      if (Sample->Mean[Dim] > Mean)

        BucketID = Buckets->NumberOfBuckets - 1;

      else if (Sample->Mean[Dim] < Mean)

        BucketID = 0;

      else

        BucketID = i;

      Buckets->Count[BucketID] += 1;

      i++;

      if (i >= Buckets->NumberOfBuckets)

        i = 0;

    }

  }

  else {

    // search for all samples in the cluster and add to histogram buckets

    InitSampleSearch(SearchState, Cluster);

    while ((Sample = NextSample (&SearchState)) != nullptr) {

      switch (Buckets->Distribution) {

        case normal:

          BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],

            Mean, StdDev);

          break;

        case D_random:

        case uniform:

          BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],

            Mean, StdDev);

          break;

        default:

          BucketID = 0;

      }

      Buckets->Count[Buckets->Bucket[BucketID]] += 1;

    }

  }

}                                // FillBuckets


static uint16_t NormalBucket(PARAM_DESC *ParamDesc,

                      float x,

                      float Mean,

                      float StdDev) {

  float X;


  // wraparound circular parameters if necessary

  if (ParamDesc->Circular) {

    if (x - Mean > ParamDesc->HalfRange)

      x -= ParamDesc->Range;

    else if (x - Mean < -ParamDesc->HalfRange)

      x += ParamDesc->Range;

  }


  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;

  if (X < 0)

    return 0;

  if (X > BUCKETTABLESIZE - 1)

    return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));

  return static_cast<uint16_t>(floor(static_cast<double>(X)));

}                                // NormalBucket


static uint16_t UniformBucket(PARAM_DESC *ParamDesc,

                       float x,

                       float Mean,

                       float StdDev) {

  float X;


  // wraparound circular parameters if necessary

  if (ParamDesc->Circular) {

    if (x - Mean > ParamDesc->HalfRange)

      x -= ParamDesc->Range;

    else if (x - Mean < -ParamDesc->HalfRange)

      x += ParamDesc->Range;

  }


  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);

  if (X < 0)

    return 0;

  if (X > BUCKETTABLESIZE - 1)

    return static_cast<uint16_t>(BUCKETTABLESIZE - 1);

  return static_cast<uint16_t>(floor(static_cast<double>(X)));

}                                // UniformBucket


static bool DistributionOK(BUCKETS* Buckets) {

  float FrequencyDifference;

  float TotalDifference;

  int i;


  // compute how well the histogram matches the expected histogram

  TotalDifference = 0.0;

  for (i = 0; i < Buckets->NumberOfBuckets; i++) {

    FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];

    TotalDifference += (FrequencyDifference * FrequencyDifference) /

      Buckets->ExpectedCount[i];

  }


  // test to see if the difference is more than expected

  if (TotalDifference > Buckets->ChiSquared)

    return false;

  else

    return true;

}                                // DistributionOK


static void FreeStatistics(STATISTICS *Statistics) {

  free(Statistics->CoVariance);

  free(Statistics->Min);

  free(Statistics->Max);

  free(Statistics);

}                                // FreeStatistics


static void FreeBuckets(BUCKETS *buckets) {

  Efree(buckets->Count);

  Efree(buckets->ExpectedCount);

  Efree(buckets);

}                                // FreeBuckets


static void FreeCluster(CLUSTER *Cluster) {

  if (Cluster != nullptr) {

    FreeCluster (Cluster->Left);

    FreeCluster (Cluster->Right);

    free(Cluster);

  }

}                                // FreeCluster


static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {

  static uint8_t DegreeOffsets[] = { 3, 3, 1 };


  uint16_t AdjustedNumBuckets;


  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];

  if (Odd (AdjustedNumBuckets))

    AdjustedNumBuckets++;

  return (AdjustedNumBuckets);


}                                // DegreesOfFreedom


static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {

  int i;

  double AdjustFactor;


  AdjustFactor = ((static_cast<double>(NewSampleCount)) /

    (static_cast<double>(Buckets->SampleCount)));


  for (i = 0; i < Buckets->NumberOfBuckets; i++) {

    Buckets->ExpectedCount[i] *= AdjustFactor;

  }


  Buckets->SampleCount = NewSampleCount;


}                                // AdjustBuckets


static void InitBuckets(BUCKETS *Buckets) {

  int i;


  for (i = 0; i < Buckets->NumberOfBuckets; i++) {

    Buckets->Count[i] = 0;

  }


}                                // InitBuckets


static int AlphaMatch(void *arg1,    //CHISTRUCT                             *ChiStruct,

               void *arg2) {  //CHISTRUCT                             *SearchKey)

  auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);

  auto *SearchKey = static_cast<CHISTRUCT *>(arg2);


  return (ChiStruct->Alpha == SearchKey->Alpha);


}                                // AlphaMatch


static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom, double Alpha) {

  CHISTRUCT *NewChiStruct;


  NewChiStruct = static_cast<CHISTRUCT *>(Emalloc (sizeof (CHISTRUCT)));

  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;

  NewChiStruct->Alpha = Alpha;

  return (NewChiStruct);


}                                // NewChiStruct


static double

Solve (SOLVEFUNC Function,

void *FunctionParams, double InitialGuess, double Accuracy)

#define INITIALDELTA    0.1

#define  DELTARATIO     0.1

{

  double x;

  double f;

  double Slope;

  double Delta;

  double NewDelta;

  double xDelta;

  double LastPosX, LastNegX;


  x = InitialGuess;

  Delta = INITIALDELTA;

  LastPosX = FLT_MAX;

  LastNegX = -FLT_MAX;

  f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);

  while (Abs (LastPosX - LastNegX) > Accuracy) {

    // keep track of outer bounds of current estimate

    if (f < 0)

      LastNegX = x;

    else

      LastPosX = x;


    // compute the approx. slope of f(x) at the current point

    Slope =

      ((*Function) (static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;


    // compute the next solution guess */

    xDelta = f / Slope;

    x -= xDelta;


    // reduce the delta used for computing slope to be a fraction of

    //the amount moved to get to the new guess

    NewDelta = Abs (xDelta) * DELTARATIO;

    if (NewDelta < Delta)

      Delta = NewDelta;


    // compute the value of the function at the new guess

    f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);

  }

  return (x);


}                                // Solve


static double ChiArea(CHISTRUCT *ChiParams, double x) {

  int i, N;

  double SeriesTotal;

  double Denominator;

  double PowerOfx;


  N = ChiParams->DegreesOfFreedom / 2 - 1;

  SeriesTotal = 1;

  Denominator = 1;

  PowerOfx = 1;

  for (i = 1; i <= N; i++) {

    Denominator *= 2 * i;

    PowerOfx *= x;

    SeriesTotal += PowerOfx / Denominator;

  }

  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);


}                                // ChiArea


static bool

MultipleCharSamples(CLUSTERER* Clusterer,

                    CLUSTER* Cluster, float MaxIllegal)

#define ILLEGAL_CHAR    2

{

  static std::vector<uint8_t> CharFlags;

  LIST SearchState;

  SAMPLE *Sample;

  int32_t CharID;

  int32_t NumCharInCluster;

  int32_t NumIllegalInCluster;

  float PercentIllegal;


  // initial estimate assumes that no illegal chars exist in the cluster

  NumCharInCluster = Cluster->SampleCount;

  NumIllegalInCluster = 0;


  if (Clusterer->NumChar > CharFlags.size()) {

    CharFlags.resize(Clusterer->NumChar);

  }


  for (auto& CharFlag : CharFlags)

    CharFlag = false;


  // find each sample in the cluster and check if we have seen it before

  InitSampleSearch(SearchState, Cluster);

  while ((Sample = NextSample (&SearchState)) != nullptr) {

    CharID = Sample->CharID;

    if (CharFlags[CharID] == false) {

      CharFlags[CharID] = true;

    }

    else {

      if (CharFlags[CharID] == true) {

        NumIllegalInCluster++;

        CharFlags[CharID] = ILLEGAL_CHAR;

      }

      NumCharInCluster--;

      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;

      if (PercentIllegal > MaxIllegal) {

        destroy(SearchState);

        return true;

      }

    }

  }

  return false;


}                                // MultipleCharSamples


static double InvertMatrix(const float* input, int size, float* inv) {

  // Allocate memory for the 2D arrays.

  GENERIC_2D_ARRAY<double> U(size, size, 0.0);

  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);

  GENERIC_2D_ARRAY<double> L(size, size, 0.0);


  // Initialize the working matrices. U starts as input, L as I and U_inv as O.

  int row;

  int col;

  for (row = 0; row < size; row++) {

    for (col = 0; col < size; col++) {

      U[row][col] = input[row*size + col];

      L[row][col] = row == col ? 1.0 : 0.0;

      U_inv[row][col] = 0.0;

    }

  }


  // Compute forward matrix by inversion by LU decomposition of input.

  for (col = 0; col < size; ++col) {

    // Find best pivot

    int best_row = 0;

    double best_pivot = -1.0;

    for (row = col; row < size; ++row) {

      if (Abs(U[row][col]) > best_pivot) {

        best_pivot = Abs(U[row][col]);

        best_row = row;

      }

    }

    // Exchange pivot rows.

    if (best_row != col) {

      for (int k = 0; k < size; ++k) {

        double tmp = U[best_row][k];

        U[best_row][k] = U[col][k];

        U[col][k] = tmp;

        tmp = L[best_row][k];

        L[best_row][k] = L[col][k];

        L[col][k] = tmp;

      }

    }

    // Now do the pivot itself.

    for (row = col + 1; row < size; ++row) {

      double ratio = -U[row][col] / U[col][col];

      for (int j = col; j < size; ++j) {

        U[row][j] += U[col][j] * ratio;

      }

      for (int k = 0; k < size; ++k) {

        L[row][k] += L[col][k] * ratio;

      }

    }

  }

  // Next invert U.

  for (col = 0; col < size; ++col) {

    U_inv[col][col] = 1.0 / U[col][col];

    for (row = col - 1; row >= 0; --row) {

      double total = 0.0;

      for (int k = col; k > row; --k) {

        total += U[row][k] * U_inv[k][col];

      }

      U_inv[row][col] = -total / U[row][row];

    }

  }

  // Now the answer is U_inv.L.

  for (row = 0; row < size; row++) {

    for (col = 0; col < size; col++) {

      double sum = 0.0;

      for (int k = row; k < size; ++k) {

        sum += U_inv[row][k] * L[k][col];

      }

      inv[row*size + col] = sum;

    }

  }

  // Check matrix product.

  double error_sum = 0.0;

  for (row = 0; row < size; row++) {

    for (col = 0; col < size; col++) {

      double sum = 0.0;

      for (int k = 0; k < size; ++k) {

        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];

      }

      if (row != col) {

        error_sum += Abs(sum);

      }

    }

  }

  return error_sum;

}