18 #define _USE_MATH_DEFINES // for M_PI
31 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
32 #define FTABLE_X 10 // Size of FTable.
33 #define FTABLE_Y 100 // Size of FTable.
37 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
38 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
39 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
40 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
41 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
42 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
43 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
44 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
45 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
46 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
47 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
48 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
49 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
50 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
51 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
52 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
53 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
54 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
55 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
56 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
57 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
58 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
59 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
60 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
61 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
62 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
63 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
64 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
65 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
66 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
67 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
68 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
69 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
70 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
71 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
72 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
73 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
74 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
75 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
76 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
77 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
78 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
79 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
80 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
81 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
82 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
83 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
84 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
85 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
86 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
87 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
88 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
89 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
90 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
91 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
92 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
93 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
94 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
95 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
96 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
97 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
98 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
99 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
100 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
101 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
102 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
103 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
104 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
105 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
106 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
107 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
108 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
109 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
110 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
111 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
112 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
113 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
114 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
115 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
116 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
117 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
118 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
119 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
120 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
121 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
122 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
123 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
124 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
125 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
126 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
127 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
128 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
129 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
130 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
131 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
132 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
133 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
134 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
135 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
136 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
143 #define MINVARIANCE 0.0004
151 #define MINSAMPLESPERBUCKET 5
152 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
153 #define MINSAMPLESNEEDED 1
161 #define BUCKETTABLESIZE 1024
162 #define NORMALEXTENT 3.0
207 #define Odd(N) ((N)%2)
208 #define Mirror(N,R) ((R) - (N) - 1)
209 #define Abs(N) (((N) < 0) ? (-(N)) : (N))
219 #define SqrtOf2Pi 2.506628275
221 static const double kNormalVariance =
223 static const double kNormalMagnitude =
229 #define LOOKUPTABLESIZE 8
230 #define MAXDEGREESOFFREEDOM MAXBUCKETS
233 MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
243 static void CreateClusterTree(
CLUSTERER* Clusterer);
258 static PROTOTYPE* MakeDegenerateProto(uint16_t N,
276 BUCKETS* NormalBuckets,
double Confidence);
295 static bool Independent(
PARAM_DESC* ParamDesc,
296 int16_t N,
float* CoVariance,
float Independence);
300 uint32_t SampleCount,
304 uint32_t SampleCount,
307 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);
309 static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha);
311 static double NormalDensity(int32_t x);
313 static double UniformDensity(int32_t x);
315 static double Integral(
double f1,
double f2,
double Dx);
317 static void FillBuckets(
BUCKETS *Buckets,
324 static uint16_t NormalBucket(
PARAM_DESC *ParamDesc,
329 static uint16_t UniformBucket(
PARAM_DESC *ParamDesc,
334 static bool DistributionOK(
BUCKETS* Buckets);
336 static void FreeStatistics(
STATISTICS *Statistics);
338 static void FreeBuckets(
BUCKETS *Buckets);
340 static void FreeCluster(
CLUSTER *Cluster);
342 static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets);
344 static void AdjustBuckets(
BUCKETS *Buckets, uint32_t NewSampleCount);
346 static void InitBuckets(
BUCKETS *Buckets);
348 static int AlphaMatch(
void *arg1,
351 static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom,
double Alpha);
354 void *FunctionParams,
358 static double ChiArea(
CHISTRUCT *ChiParams,
double x);
360 static bool MultipleCharSamples(
CLUSTERER* Clusterer,
364 static double InvertMatrix(
const float* input,
int size,
float* inv);
387 Clusterer->
Root =
nullptr;
393 for (i = 0; i < SampleSize; i++) {
401 (ParamDesc[i].
Max + ParamDesc[i].
Min) / 2;
441 1) * sizeof (
float)));
445 Sample->
Left =
nullptr;
446 Sample->
Right =
nullptr;
450 Sample->
Mean[i] = Feature[i];
455 if (CharID >= Clusterer->
NumChar)
456 Clusterer->
NumChar = CharID + 1;
485 if (Clusterer->
Root ==
nullptr)
486 CreateClusterTree(Clusterer);
493 ComputePrototypes(Clusterer,
Config);
498 auto *proto = reinterpret_cast<PROTOTYPE *>(
first_node(proto_list));
499 proto->Cluster =
nullptr;
515 if (Clusterer !=
nullptr) {
517 if (Clusterer->
KDTree !=
nullptr)
519 if (Clusterer->
Root !=
nullptr)
520 FreeCluster (Clusterer->
Root);
550 auto *Prototype = static_cast<PROTOTYPE *>(arg);
553 if (Prototype->Cluster !=
nullptr)
554 Prototype->Cluster->Prototype =
false;
557 free(Prototype->Distrib);
558 free(Prototype->Mean);
560 free(Prototype->Variance.Elliptical);
561 free(Prototype->Magnitude.Elliptical);
562 free(Prototype->Weight.Elliptical);
586 *SearchState =
pop (*SearchState);
588 if (Cluster->
Left ==
nullptr)
590 *SearchState =
push (*SearchState, Cluster->
Right);
591 Cluster = Cluster->
Left;
603 return (Proto->
Mean[Dimension]);
614 switch (Proto->
Style) {
616 return (static_cast<float>(sqrt (static_cast<double>(Proto->
Variance.
Spherical))));
618 return (static_cast<float>(sqrt (static_cast<double>(Proto->
Variance.
Elliptical[Dimension]))));
620 switch (Proto->
Distrib[Dimension]) {
622 return (static_cast<float>(sqrt (static_cast<double>(Proto->
Variance.
Elliptical[Dimension]))));
650 static void CreateClusterTree(
CLUSTERER *Clusterer) {
661 KDWalk(context.
tree, reinterpret_cast<void_proc>(MakePotentialClusters), &context);
664 while (context.
heap->
Pop(&HeapEntry)) {
665 PotentialCluster = HeapEntry.
data;
677 FindNearestNeighbor(context.
tree, PotentialCluster->
Cluster,
679 if (PotentialCluster->
Neighbor !=
nullptr) {
687 MakeNewCluster(Clusterer, PotentialCluster);
689 FindNearestNeighbor(context.
tree, PotentialCluster->
Cluster,
691 if (PotentialCluster->
Neighbor !=
nullptr) {
702 Clusterer->
KDTree =
nullptr;
719 int next = context->
next;
723 FindNearestNeighbor(context->
tree,
746 FindNearestNeighbor(
KDTREE* Tree,
CLUSTER* Cluster,
float* Distance)
747 #define MAXNEIGHBORS 2
748 #define MAXDISTANCE FLT_MAX
752 int NumberOfNeighbors;
758 &NumberOfNeighbors, reinterpret_cast<void **>(Neighbor), Dist);
762 BestNeighbor =
nullptr;
763 for (i = 0; i < NumberOfNeighbors; i++) {
764 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
766 BestNeighbor = Neighbor[i];
786 Cluster = static_cast<CLUSTER *>(
Emalloc(
829 float m1[],
float m2[]) {
833 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
834 if (ParamDesc->Circular) {
838 if ((*m2 - *m1) > ParamDesc->HalfRange) {
839 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
840 if (*m < ParamDesc->Min)
841 *m += ParamDesc->Range;
843 else if ((*m1 - *m2) > ParamDesc->HalfRange) {
844 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
845 if (*m < ParamDesc->Min)
846 *m += ParamDesc->Range;
849 *m = (n1 * *m1 + n2 * *m2) / n;
852 *m = (n1 * *m1 + n2 * *m2) / n;
872 if (Clusterer->
Root !=
nullptr)
881 ClusterStack =
pop (ClusterStack);
882 Prototype = MakePrototype(Clusterer,
Config, Cluster);
883 if (Prototype !=
nullptr) {
887 ClusterStack =
push (ClusterStack, Cluster->
Right);
888 ClusterStack =
push (ClusterStack, Cluster->
Left);
925 Proto = MakeDegenerateProto(
928 if (Proto !=
nullptr) {
929 FreeStatistics(Statistics);
935 FreeStatistics(Statistics);
940 Proto = TestEllipticalProto(Clusterer,
Config, Cluster, Statistics);
941 if (Proto !=
nullptr) {
942 FreeStatistics(Statistics);
954 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
957 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
960 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
964 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
965 if (Proto !=
nullptr)
967 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
968 if (Proto !=
nullptr)
970 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
974 FreeStatistics(Statistics);
1002 int32_t MinSamples) {
1011 Proto = NewSphericalProto (N, Cluster, Statistics);
1015 Proto = NewEllipticalProto (N, Cluster, Statistics);
1018 Proto = NewMixedProto (N, Cluster, Statistics);
1047 const double kMagicSampleMargin = 0.0625;
1048 const double kFTableBoostMargin = 2.0;
1053 if (Left ==
nullptr || Right ==
nullptr)
1056 if (TotalDims < N + 1 || TotalDims < 2)
1058 std::vector<float> Covariance(static_cast<size_t>(N) * N);
1059 std::vector<float> Inverse(static_cast<size_t>(N) * N);
1060 std::vector<float> Delta(N);
1062 for (
int i = 0; i < N; ++i) {
1063 int row_offset = i * N;
1065 for (
int j = 0; j < N; ++j) {
1067 Covariance[j + row_offset] = Statistics->
CoVariance[j + row_offset];
1069 Covariance[j + row_offset] = 0.0f;
1072 for (
int j = 0; j < N; ++j) {
1074 Covariance[j + row_offset] = 1.0f;
1076 Covariance[j + row_offset] = 0.0f;
1080 double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
1082 tprintf(
"Clustering error: Matrix inverse failed with error %g\n", err);
1085 for (
int dim = 0; dim < N; ++dim) {
1087 Delta[dim] = Left->
Mean[dim] - Right->
Mean[dim];
1095 for (
int x = 0; x < N; ++x) {
1097 for (
int y = 0; y < N; ++y) {
1098 temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
1100 Tsq += Delta[x] * temp;
1106 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1107 int Fx = EssentialN;
1111 int Fy = TotalDims - EssentialN - 1;
1115 double FTarget =
FTable[Fy][Fx];
1120 FTarget += kFTableBoostMargin;
1123 return NewEllipticalProto (Clusterer->
SampleSize, Cluster, Statistics);
1146 for (i = 0; i < Clusterer->
SampleSize; i++) {
1150 FillBuckets (Buckets, Cluster, i, &(Clusterer->
ParamDesc[i]),
1152 sqrt (static_cast<double>(Statistics->
AvgVariance)));
1153 if (!DistributionOK (Buckets))
1158 Proto = NewSphericalProto (Clusterer->
SampleSize, Cluster, Statistics);
1180 for (i = 0; i < Clusterer->
SampleSize; i++) {
1184 FillBuckets (Buckets, Cluster, i, &(Clusterer->
ParamDesc[i]),
1186 sqrt (static_cast<double>(Statistics->
1187 CoVariance[i * (Clusterer->
SampleSize + 1)])));
1188 if (!DistributionOK (Buckets))
1193 Proto = NewEllipticalProto (Clusterer->
SampleSize, Cluster, Statistics);
1214 BUCKETS* NormalBuckets,
double Confidence) {
1217 BUCKETS *UniformBuckets =
nullptr;
1218 BUCKETS *RandomBuckets =
nullptr;
1221 Proto = NewMixedProto (Clusterer->
SampleSize, Cluster, Statistics);
1224 for (i = 0; i < Clusterer->
SampleSize; i++) {
1228 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->
ParamDesc[i]),
1231 if (DistributionOK (NormalBuckets))
1234 if (RandomBuckets ==
nullptr)
1237 MakeDimRandom (i, Proto, &(Clusterer->
ParamDesc[i]));
1238 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->
ParamDesc[i]),
1240 if (DistributionOK (RandomBuckets))
1243 if (UniformBuckets ==
nullptr)
1246 MakeDimUniform(i, Proto, Statistics);
1247 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->
ParamDesc[i]),
1249 if (DistributionOK (UniformBuckets))
1254 if (i < Clusterer->SampleSize) {
1292 (Statistics->
Min[i] + Statistics->
Max[i]) / 2;
1294 (Statistics->
Max[i] - Statistics->
Min[i]) / 2;
1330 uint32_t SampleCountAdjustedForBias;
1335 Statistics->
Min = static_cast<float *>(
Emalloc (N *
sizeof (
float)));
1336 Statistics->
Max = static_cast<float *>(
Emalloc (N *
sizeof (
float)));
1339 Distance = static_cast<float *>(
Emalloc (N *
sizeof (
float)));
1344 for (i = 0; i < N; i++) {
1345 Statistics->
Min[i] = 0.0;
1346 Statistics->
Max[i] = 0.0;
1347 for (j = 0; j < N; j++, CoVariance++)
1352 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1353 for (i = 0; i < N; i++) {
1354 Distance[i] = Sample->
Mean[i] - Cluster->
Mean[i];
1355 if (ParamDesc[i].Circular) {
1356 if (Distance[i] > ParamDesc[i].HalfRange)
1357 Distance[i] -= ParamDesc[i].
Range;
1358 if (Distance[i] < -ParamDesc[i].HalfRange)
1359 Distance[i] += ParamDesc[i].
Range;
1361 if (Distance[i] < Statistics->
Min[i])
1362 Statistics->
Min[i] = Distance[i];
1363 if (Distance[i] > Statistics->
Max[i])
1364 Statistics->
Max[i] = Distance[i];
1367 for (i = 0; i < N; i++)
1368 for (j = 0; j < N; j++, CoVariance++)
1369 *CoVariance += Distance[i] * Distance[j];
1376 SampleCountAdjustedForBias = Cluster->
SampleCount - 1;
1378 SampleCountAdjustedForBias = 1;
1380 for (i = 0; i < N; i++)
1381 for (j = 0; j < N; j++, CoVariance++) {
1382 *CoVariance /= SampleCountAdjustedForBias;
1394 return (Statistics);
1412 Proto = NewSimpleProto (N, Cluster);
1421 static_cast<double>(N)));
1444 Proto = NewSimpleProto (N, Cluster);
1451 for (i = 0; i < N; i++, CoVariance += N + 1) {
1484 Proto = NewEllipticalProto (N, Cluster, Statistics);
1487 for (i = 0; i < N; i++) {
1507 Proto->
Mean = static_cast<float *>(
Emalloc (N *
sizeof (
float)));
1509 for (i = 0; i < N; i++)
1542 int16_t N,
float* CoVariance,
float Independence) {
1546 float CorrelationCoeff;
1549 for (i = 0; i < N; i++, VARii += N + 1) {
1550 if (ParamDesc[i].NonEssential)
1553 VARjj = VARii + N + 1;
1554 CoVariance = VARii + 1;
1555 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1556 if (ParamDesc[j].NonEssential)
1559 if ((*VARii == 0.0) || (*VARjj == 0.0))
1560 CorrelationCoeff = 0.0;
1563 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1564 if (CorrelationCoeff > Independence)
1588 uint32_t SampleCount,
1589 double Confidence) {
1591 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1596 if (Buckets ==
nullptr) {
1597 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1603 AdjustBuckets(Buckets, SampleCount);
1610 InitBuckets(Buckets);
1632 uint32_t SampleCount,
1633 double Confidence) {
1635 { NormalDensity, UniformDensity, UniformDensity };
1638 double BucketProbability;
1639 double NextBucketBoundary;
1641 double ProbabilityDelta;
1642 double LastProbDensity;
1644 uint16_t CurrentBucket;
1660 Buckets->
Count[i] = 0;
1667 DegreesOfFreedom(Distribution, Buckets->
NumberOfBuckets), Confidence);
1671 BucketProbability = 1.0 / static_cast<double>(Buckets->
NumberOfBuckets);
1676 NextBucketBoundary = BucketProbability / 2;
1678 NextBucketBoundary = BucketProbability;
1682 (*DensityFunction[static_cast<int>(Distribution)]) (
BUCKETTABLESIZE / 2);
1684 ProbDensity = (*DensityFunction[static_cast<int>(Distribution)]) (i + 1);
1685 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1686 Probability += ProbabilityDelta;
1687 if (Probability > NextBucketBoundary) {
1688 if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1690 NextBucketBoundary += BucketProbability;
1692 Buckets->
Bucket[i] = CurrentBucket;
1694 static_cast<float>(ProbabilityDelta * SampleCount);
1695 LastProbDensity = ProbDensity;
1699 static_cast<float>((0.5 - Probability) * SampleCount);
1726 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
1730 if (SampleCount < kCountTable[0])
1731 return kBucketsTable[0];
1734 if (SampleCount <= kCountTable[Next]) {
1735 Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
1736 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
1737 return (static_cast<uint16_t>(kBucketsTable[Last] +
1738 Slope * (SampleCount - kCountTable[Last])));
1741 return kBucketsTable[Last];
1761 ComputeChiSquared (uint16_t DegreesOfFreedom,
double Alpha)
1762 #define CHIACCURACY 0.01
1763 #define MINALPHA (1e-200)
1773 if (
Odd (DegreesOfFreedom))
1779 SearchKey.
Alpha = Alpha;
1781 &SearchKey, AlphaMatch));
1783 if (OldChiSquared ==
nullptr) {
1784 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1785 OldChiSquared->
ChiSquared = Solve (ChiArea, OldChiSquared,
1786 static_cast<double>(DegreesOfFreedom),
1788 ChiWith[DegreesOfFreedom] =
push (ChiWith[DegreesOfFreedom],
1812 static double NormalDensity(int32_t x) {
1815 Distance = x - kNormalMean;
1816 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1826 static double UniformDensity(int32_t x) {
1830 return UniformDistributionDensity;
1844 static double Integral(
double f1,
double f2,
double Dx) {
1845 return (f1 + f2) * Dx / 2.0;
1868 static void FillBuckets(
BUCKETS *Buckets,
1881 Buckets->
Count[i] = 0;
1883 if (StdDev == 0.0) {
1892 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1899 Buckets->
Count[BucketID] += 1;
1908 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1911 BucketID = NormalBucket (ParamDesc, Sample->
Mean[Dim],
1916 BucketID = UniformBucket (ParamDesc, Sample->
Mean[Dim],
1938 static uint16_t NormalBucket(
PARAM_DESC *ParamDesc,
1947 x -= ParamDesc->
Range;
1948 else if (x - Mean < -ParamDesc->HalfRange)
1949 x += ParamDesc->
Range;
1952 X = ((x -
Mean) / StdDev) * kNormalStdDev + kNormalMean;
1957 return static_cast<uint16_t>(floor(static_cast<double>(X)));
1971 static uint16_t UniformBucket(
PARAM_DESC *ParamDesc,
1980 x -= ParamDesc->
Range;
1981 else if (x - Mean < -ParamDesc->HalfRange)
1982 x += ParamDesc->
Range;
1990 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2003 static bool DistributionOK(
BUCKETS* Buckets) {
2004 float FrequencyDifference;
2005 float TotalDifference;
2009 TotalDifference = 0.0;
2012 TotalDifference += (FrequencyDifference * FrequencyDifference) /
2028 static void FreeStatistics(
STATISTICS *Statistics) {
2030 free(Statistics->
Min);
2031 free(Statistics->
Max);
2040 static void FreeBuckets(
BUCKETS *buckets) {
2053 static void FreeCluster(
CLUSTER *Cluster) {
2054 if (Cluster !=
nullptr) {
2055 FreeCluster (Cluster->
Left);
2056 FreeCluster (Cluster->
Right);
2073 static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
2074 static uint8_t DegreeOffsets[] = { 3, 3, 1 };
2076 uint16_t AdjustedNumBuckets;
2078 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
2079 if (
Odd (AdjustedNumBuckets))
2080 AdjustedNumBuckets++;
2081 return (AdjustedNumBuckets);
2092 static void AdjustBuckets(
BUCKETS *Buckets, uint32_t NewSampleCount) {
2094 double AdjustFactor;
2096 AdjustFactor = ((static_cast<double>(NewSampleCount)) /
2112 static void InitBuckets(
BUCKETS *Buckets) {
2116 Buckets->
Count[i] = 0;
2133 static int AlphaMatch(
void *arg1,
2135 auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
2136 auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
2138 return (ChiStruct->Alpha == SearchKey->
Alpha);
2151 static CHISTRUCT *NewChiStruct(uint16_t DegreesOfFreedom,
double Alpha) {
2156 NewChiStruct->
Alpha = Alpha;
2157 return (NewChiStruct);
2176 void *FunctionParams,
double InitialGuess,
double Accuracy)
2177 #define INITIALDELTA 0.1
2178 #define DELTARATIO 0.1
2186 double LastPosX, LastNegX;
2191 LastNegX = -FLT_MAX;
2192 f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);
2193 while (
Abs (LastPosX - LastNegX) > Accuracy) {
2202 ((*Function) (static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;
2211 if (NewDelta < Delta)
2215 f = (*Function) (static_cast<CHISTRUCT *>(FunctionParams), x);
2239 static double ChiArea(
CHISTRUCT *ChiParams,
double x) {
2249 for (i = 1; i <= N; i++) {
2250 Denominator *= 2 * i;
2252 SeriesTotal += PowerOfx / Denominator;
2254 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->
Alpha);
2282 MultipleCharSamples(
CLUSTERER* Clusterer,
2283 CLUSTER* Cluster,
float MaxIllegal)
2284 #define ILLEGAL_CHAR 2
2286 static std::vector<uint8_t> CharFlags;
2290 int32_t NumCharInCluster;
2291 int32_t NumIllegalInCluster;
2292 float PercentIllegal;
2296 NumIllegalInCluster = 0;
2298 if (Clusterer->
NumChar > CharFlags.size()) {
2299 CharFlags.resize(Clusterer->
NumChar);
2302 for (
auto& CharFlag : CharFlags)
2307 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
2309 if (CharFlags[CharID] ==
false) {
2310 CharFlags[CharID] =
true;
2313 if (CharFlags[CharID] ==
true) {
2314 NumIllegalInCluster++;
2318 PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
2319 if (PercentIllegal > MaxIllegal) {
2334 static double InvertMatrix(
const float* input,
int size,
float* inv) {
2343 for (row = 0; row < size; row++) {
2344 for (col = 0; col < size; col++) {
2345 U[row][col] = input[row*size + col];
2346 L[row][col] = row == col ? 1.0 : 0.0;
2347 U_inv[row][col] = 0.0;
2352 for (col = 0; col < size; ++col) {
2355 double best_pivot = -1.0;
2356 for (row = col; row < size; ++row) {
2357 if (
Abs(U[row][col]) > best_pivot) {
2358 best_pivot =
Abs(U[row][col]);
2363 if (best_row != col) {
2364 for (
int k = 0; k < size; ++k) {
2365 double tmp = U[best_row][k];
2366 U[best_row][k] = U[col][k];
2368 tmp = L[best_row][k];
2369 L[best_row][k] = L[col][k];
2374 for (row = col + 1; row < size; ++row) {
2375 double ratio = -U[row][col] / U[col][col];
2376 for (
int j = col; j < size; ++j) {
2377 U[row][j] += U[col][j] * ratio;
2379 for (
int k = 0; k < size; ++k) {
2380 L[row][k] += L[col][k] * ratio;
2385 for (col = 0; col < size; ++col) {
2386 U_inv[col][col] = 1.0 / U[col][col];
2387 for (row = col - 1; row >= 0; --row) {
2389 for (
int k = col; k > row; --k) {
2390 total += U[row][k] * U_inv[k][col];
2392 U_inv[row][col] = -total / U[row][row];
2396 for (row = 0; row < size; row++) {
2397 for (col = 0; col < size; col++) {
2399 for (
int k = row; k < size; ++k) {
2400 sum += U_inv[row][k] * L[k][col];
2402 inv[row*size + col] = sum;
2406 double error_sum = 0.0;
2407 for (row = 0; row < size; row++) {
2408 for (col = 0; col < size; col++) {
2410 for (
int k = 0; k < size; ++k) {
2411 sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
2414 error_sum +=
Abs(sum);