31 #define HOTELLING 1 // If true use Hotelling's test to decide where to split. 32 #define FTABLE_X 10 // Size of FTable. 33 #define FTABLE_Y 100 // Size of FTable. 37 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
38 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
39 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
40 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
41 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
42 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
43 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
44 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
45 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
46 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
47 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
48 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
49 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
50 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
51 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
52 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
53 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
54 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
55 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
56 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
57 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
58 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
59 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
60 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
61 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
62 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
63 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
64 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
65 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
66 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
67 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
68 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
69 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
70 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
71 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
72 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
73 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
74 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
75 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
76 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
77 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
78 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
79 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
80 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
81 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
82 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
83 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
84 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
85 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
86 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
87 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
88 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
89 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
90 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
91 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
92 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
93 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
94 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
95 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
96 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
97 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
98 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
99 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
100 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
101 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
102 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
103 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
104 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
105 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
106 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
107 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
108 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
109 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
110 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
111 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
112 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
113 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
114 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
115 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
116 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
117 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
118 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
119 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
120 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
121 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
122 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
123 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
124 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
125 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
126 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
127 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
128 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
129 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
130 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
131 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
132 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
133 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
134 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
135 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
136 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
143 #define MINVARIANCE 0.0004 151 #define MINSAMPLESPERBUCKET 5 152 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) 153 #define MINSAMPLESNEEDED 1 161 #define BUCKETTABLESIZE 1024 162 #define NORMALEXTENT 3.0 207 #define Odd(N) ((N)%2) 208 #define Mirror(N,R) ((R) - (N) - 1) 209 #define Abs(N) (((N) < 0) ? (-(N)) : (N)) 219 #define SqrtOf2Pi 2.506628275 221 static const double kNormalVariance =
223 static const double kNormalMagnitude =
229 #define LOOKUPTABLESIZE 8 230 #define MAXDEGREESOFFREEDOM MAXBUCKETS 233 MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
259 float m1[],
float m2[]);
314 int16_t N,
float* CoVariance,
float Independence);
318 uint32_t SampleCount,
322 uint32_t SampleCount,
333 double Integral(
double f1,
double f2,
double Dx);
377 void *FunctionParams,
387 double InvertMatrix(
const float* input,
int size,
float* inv);
410 Clusterer->
Root =
nullptr;
416 for (i = 0; i < SampleSize; i++) {
424 (ParamDesc[i].
Max + ParamDesc[i].
Min) / 2;
464 1) *
sizeof (
float));
468 Sample->
Left =
nullptr;
469 Sample->
Right =
nullptr;
473 Sample->
Mean[i] = Feature[i];
478 if (CharID >= Clusterer->
NumChar)
479 Clusterer->
NumChar = CharID + 1;
508 if (Clusterer->
Root ==
nullptr)
539 if (Clusterer !=
nullptr) {
541 if (Clusterer->
KDTree !=
nullptr)
543 if (Clusterer->
Root !=
nullptr)
579 if (Prototype->
Cluster !=
nullptr)
584 free(Prototype->
Mean);
612 *SearchState =
pop (*SearchState);
614 if (Cluster->
Left ==
nullptr)
616 *SearchState =
push (*SearchState, Cluster->
Right);
617 Cluster = Cluster->
Left;
629 return (Proto->
Mean[Dimension]);
640 switch (Proto->
Style) {
647 switch (Proto->
Distrib[Dimension]) {
693 while (context.
heap->
Pop(&HeapEntry)) {
694 PotentialCluster = HeapEntry.
data;
708 if (PotentialCluster->
Neighbor !=
nullptr) {
720 if (PotentialCluster->
Neighbor !=
nullptr) {
731 Clusterer->
KDTree =
nullptr;
746 CLUSTER *Cluster, int32_t Level) {
748 int next = context->
next;
776 #define MAXNEIGHBORS 2 777 #define MAXDISTANCE FLT_MAX 781 int NumberOfNeighbors;
787 &NumberOfNeighbors, (
void **)Neighbor, Dist);
791 BestNeighbor =
nullptr;
792 for (i = 0; i < NumberOfNeighbors; i++) {
793 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
795 BestNeighbor = Neighbor[i];
857 float m1[],
float m2[]) {
861 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
862 if (ParamDesc->Circular) {
866 if ((*m2 - *m1) > ParamDesc->HalfRange) {
867 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
868 if (*m < ParamDesc->Min)
869 *m += ParamDesc->Range;
871 else if ((*m1 - *m2) > ParamDesc->HalfRange) {
872 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
873 if (*m < ParamDesc->Min)
874 *m += ParamDesc->Range;
877 *m = (n1 * *m1 + n2 * *m2) / n;
880 *m = (n1 * *m1 + n2 * *m2) / n;
901 if (Clusterer->
Root !=
nullptr)
910 ClusterStack =
pop (ClusterStack);
912 if (Prototype !=
nullptr) {
916 ClusterStack =
push (ClusterStack, Cluster->
Right);
917 ClusterStack =
push (ClusterStack, Cluster->
Left);
958 if (Proto !=
nullptr) {
971 if (Proto !=
nullptr) {
995 if (Proto !=
nullptr)
998 if (Proto !=
nullptr)
1032 int32_t MinSamples) {
1078 const double kMagicSampleMargin = 0.0625;
1079 const double kFTableBoostMargin = 2.0;
1084 if (Left ==
nullptr || Right ==
nullptr)
1087 if (TotalDims < N + 1 || TotalDims < 2)
1089 std::vector<float> Covariance(static_cast<size_t>(N) * N);
1090 std::vector<float> Inverse(static_cast<size_t>(N) * N);
1091 std::vector<float> Delta(N);
1093 for (
int i = 0; i < N; ++i) {
1094 int row_offset = i * N;
1096 for (
int j = 0; j < N; ++j) {
1098 Covariance[j + row_offset] = Statistics->
CoVariance[j + row_offset];
1100 Covariance[j + row_offset] = 0.0f;
1103 for (
int j = 0; j < N; ++j) {
1105 Covariance[j + row_offset] = 1.0f;
1107 Covariance[j + row_offset] = 0.0f;
1111 double err =
InvertMatrix(&Covariance[0], N, &Inverse[0]);
1113 tprintf(
"Clustering error: Matrix inverse failed with error %g\n", err);
1116 for (
int dim = 0; dim < N; ++dim) {
1118 Delta[dim] = Left->
Mean[dim] - Right->
Mean[dim];
1126 for (
int x = 0; x < N; ++x) {
1128 for (
int y = 0; y < N; ++y) {
1129 temp +=
static_cast<double>(Inverse[y + N * x]) * Delta[y];
1131 Tsq += Delta[x] * temp;
1137 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1138 int Fx = EssentialN;
1142 int Fy = TotalDims - EssentialN - 1;
1146 double FTarget =
FTable[Fy][Fx];
1151 FTarget += kFTableBoostMargin;
1178 for (i = 0; i < Clusterer->
SampleSize; i++) {
1213 for (i = 0; i < Clusterer->
SampleSize; i++) {
1219 sqrt ((
double) Statistics->
1220 CoVariance[i * (Clusterer->
SampleSize + 1)]));
1249 double Confidence) {
1252 BUCKETS *UniformBuckets =
nullptr;
1253 BUCKETS *RandomBuckets =
nullptr;
1259 for (i = 0; i < Clusterer->
SampleSize; i++) {
1269 if (RandomBuckets ==
nullptr)
1278 if (UniformBuckets ==
nullptr)
1289 if (i < Clusterer->SampleSize) {
1329 (Statistics->
Min[i] + Statistics->
Max[i]) / 2;
1331 (Statistics->
Max[i] - Statistics->
Min[i]) / 2;
1367 uint32_t SampleCountAdjustedForBias;
1372 Statistics->
Min = (
float *)
Emalloc (N *
sizeof (
float));
1373 Statistics->
Max = (
float *)
Emalloc (N *
sizeof (
float));
1376 Distance = (
float *)
Emalloc (N *
sizeof (
float));
1381 for (i = 0; i < N; i++) {
1382 Statistics->
Min[i] = 0.0;
1383 Statistics->
Max[i] = 0.0;
1384 for (j = 0; j < N; j++, CoVariance++)
1389 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1390 for (i = 0; i < N; i++) {
1391 Distance[i] = Sample->
Mean[i] - Cluster->
Mean[i];
1392 if (ParamDesc[i].Circular) {
1393 if (Distance[i] > ParamDesc[i].HalfRange)
1394 Distance[i] -= ParamDesc[i].
Range;
1395 if (Distance[i] < -ParamDesc[i].HalfRange)
1396 Distance[i] += ParamDesc[i].
Range;
1398 if (Distance[i] < Statistics->
Min[i])
1399 Statistics->
Min[i] = Distance[i];
1400 if (Distance[i] > Statistics->
Max[i])
1401 Statistics->
Max[i] = Distance[i];
1404 for (i = 0; i < N; i++)
1405 for (j = 0; j < N; j++, CoVariance++)
1406 *CoVariance += Distance[i] * Distance[j];
1413 SampleCountAdjustedForBias = Cluster->
SampleCount - 1;
1415 SampleCountAdjustedForBias = 1;
1417 for (i = 0; i < N; i++)
1418 for (j = 0; j < N; j++, CoVariance++) {
1419 *CoVariance /= SampleCountAdjustedForBias;
1431 return (Statistics);
1490 for (i = 0; i < N; i++, CoVariance += N + 1) {
1525 for (i = 0; i < N; i++) {
1545 Proto->
Mean = (
float *)
Emalloc (N *
sizeof (
float));
1547 for (i = 0; i < N; i++)
1580 int16_t N,
float* CoVariance,
float Independence) {
1584 float CorrelationCoeff;
1587 for (i = 0; i < N; i++, VARii += N + 1) {
1588 if (ParamDesc[i].NonEssential)
1591 VARjj = VARii + N + 1;
1592 CoVariance = VARii + 1;
1593 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1594 if (ParamDesc[j].NonEssential)
1597 if ((*VARii == 0.0) || (*VARjj == 0.0))
1598 CorrelationCoeff = 0.0;
1601 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1602 if (CorrelationCoeff > Independence)
1626 uint32_t SampleCount,
1627 double Confidence) {
1634 if (Buckets ==
nullptr) {
1635 Buckets =
MakeBuckets(Distribution, SampleCount, Confidence);
1670 uint32_t SampleCount,
1671 double Confidence) {
1676 double BucketProbability;
1677 double NextBucketBoundary;
1679 double ProbabilityDelta;
1680 double LastProbDensity;
1682 uint16_t CurrentBucket;
1698 Buckets->
Count[i] = 0;
1714 NextBucketBoundary = BucketProbability / 2;
1716 NextBucketBoundary = BucketProbability;
1722 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1723 ProbabilityDelta =
Integral (LastProbDensity, ProbDensity, 1.0);
1724 Probability += ProbabilityDelta;
1725 if (Probability > NextBucketBoundary) {
1726 if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1728 NextBucketBoundary += BucketProbability;
1730 Buckets->
Bucket[i] = CurrentBucket;
1732 (float) (ProbabilityDelta * SampleCount);
1733 LastProbDensity = ProbDensity;
1737 (float) ((0.5 - Probability) * SampleCount);
1768 if (SampleCount < kCountTable[0])
1769 return kBucketsTable[0];
1772 if (SampleCount <= kCountTable[Next]) {
1773 Slope = (float) (kBucketsTable[Next] - kBucketsTable[Last]) /
1774 (float) (kCountTable[Next] - kCountTable[Last]);
1775 return ((uint16_t) (kBucketsTable[Last] +
1776 Slope * (SampleCount - kCountTable[Last])));
1779 return kBucketsTable[Last];
1800 #define CHIACCURACY 0.01 1801 #define MINALPHA (1e-200) 1817 SearchKey.
Alpha = Alpha;
1821 if (OldChiSquared ==
nullptr) {
1853 Distance = x - kNormalMean;
1854 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1865 static double UniformDistributionDensity = (double) 1.0 /
BUCKETTABLESIZE;
1868 return UniformDistributionDensity;
1870 return (
double) 0.0;
1882 return (f1 + f2) * Dx / 2.0;
1918 Buckets->
Count[i] = 0;
1920 if (StdDev == 0.0) {
1929 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1936 Buckets->
Count[BucketID] += 1;
1945 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
1984 x -= ParamDesc->
Range;
1985 else if (x - Mean < -ParamDesc->HalfRange)
1986 x += ParamDesc->
Range;
1989 X = ((x -
Mean) / StdDev) * kNormalStdDev + kNormalMean;
1994 return (uint16_t) floor((
double) X);
2017 x -= ParamDesc->
Range;
2018 else if (x - Mean < -ParamDesc->HalfRange)
2019 x += ParamDesc->
Range;
2027 return (uint16_t) floor((
double) X);
2041 float FrequencyDifference;
2042 float TotalDifference;
2046 TotalDifference = 0.0;
2049 TotalDifference += (FrequencyDifference * FrequencyDifference) /
2068 free(Statistics->
Min);
2069 free(Statistics->
Max);
2094 if (Cluster !=
nullptr) {
2114 static uint8_t DegreeOffsets[] = { 3, 3, 1 };
2116 uint16_t AdjustedNumBuckets;
2118 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2119 if (
Odd (AdjustedNumBuckets))
2120 AdjustedNumBuckets++;
2121 return (AdjustedNumBuckets);
2136 uint16_t *DesiredNumberOfBuckets = (uint16_t *) arg2;
2150 return (arg1 == arg2);
2164 double AdjustFactor;
2166 AdjustFactor = (((double) NewSampleCount) /
2187 Buckets->
Count[i] = 0;
2209 return (ChiStruct->
Alpha == SearchKey->
Alpha);
2247 void *FunctionParams,
double InitialGuess,
double Accuracy)
2248 #define INITIALDELTA 0.1 2249 #define DELTARATIO 0.1 2257 double LastPosX, LastNegX;
2262 LastNegX = -FLT_MAX;
2263 f = (*Function) ((
CHISTRUCT *) FunctionParams, x);
2264 while (
Abs (LastPosX - LastNegX) > Accuracy) {
2273 ((*Function) ((
CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2282 if (NewDelta < Delta)
2286 f = (*Function) ((
CHISTRUCT *) FunctionParams, x);
2320 for (i = 1; i <= N; i++) {
2321 Denominator *= 2 * i;
2323 SeriesTotal += PowerOfx / Denominator;
2325 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->
Alpha);
2354 CLUSTER* Cluster,
float MaxIllegal)
2355 #define ILLEGAL_CHAR 2 2357 static BOOL8 *CharFlags =
nullptr;
2358 static int32_t NumFlags = 0;
2363 int32_t NumCharInCluster;
2364 int32_t NumIllegalInCluster;
2365 float PercentIllegal;
2369 NumIllegalInCluster = 0;
2371 if (Clusterer->
NumChar > NumFlags) {
2373 NumFlags = Clusterer->
NumChar;
2377 for (i = 0; i < NumFlags; i++)
2378 CharFlags[i] =
FALSE;
2382 while ((Sample =
NextSample (&SearchState)) !=
nullptr) {
2384 if (CharFlags[CharID] ==
FALSE) {
2385 CharFlags[CharID] =
TRUE;
2388 if (CharFlags[CharID] ==
TRUE) {
2389 NumIllegalInCluster++;
2393 PercentIllegal = (float) NumIllegalInCluster / NumCharInCluster;
2394 if (PercentIllegal > MaxIllegal) {
2418 for (row = 0; row < size; row++) {
2419 for (col = 0; col < size; col++) {
2420 U[row][col] = input[row*size + col];
2421 L[row][col] = row == col ? 1.0 : 0.0;
2422 U_inv[row][col] = 0.0;
2427 for (col = 0; col < size; ++col) {
2430 double best_pivot = -1.0;
2431 for (row = col; row < size; ++row) {
2432 if (
Abs(U[row][col]) > best_pivot) {
2433 best_pivot =
Abs(U[row][col]);
2438 if (best_row != col) {
2439 for (
int k = 0; k < size; ++k) {
2440 double tmp = U[best_row][k];
2441 U[best_row][k] = U[col][k];
2443 tmp = L[best_row][k];
2444 L[best_row][k] = L[col][k];
2449 for (row = col + 1; row < size; ++row) {
2450 double ratio = -U[row][col] / U[col][col];
2451 for (
int j = col; j < size; ++j) {
2452 U[row][j] += U[col][j] * ratio;
2454 for (
int k = 0; k < size; ++k) {
2455 L[row][k] += L[col][k] * ratio;
2460 for (col = 0; col < size; ++col) {
2461 U_inv[col][col] = 1.0 / U[col][col];
2462 for (row = col - 1; row >= 0; --row) {
2464 for (
int k = col; k > row; --k) {
2465 total += U[row][k] * U_inv[k][col];
2467 U_inv[row][col] = -total / U[row][row];
2471 for (row = 0; row < size; row++) {
2472 for (col = 0; col < size; col++) {
2474 for (
int k = row; k < size; ++k) {
2475 sum += U_inv[row][k] * L[k][col];
2477 inv[row*size + col] = sum;
2481 double error_sum = 0.0;
2482 for (row = 0; row < size; row++) {
2483 for (col = 0; col < size; col++) {
2485 for (
int k = 0; k < size; ++k) {
2486 sum +=
static_cast<double>(input[row * size + k]) * inv[k * size + col];
2489 error_sum +=
Abs(sum);
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
double Integral(double f1, double f2, double Dx)
PROTOTYPE * NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
PROTOTYPE * MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples)
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence)
double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
const double FTable[FTABLE_Y][FTABLE_X]
double(* SOLVEFUNC)(CHISTRUCT *, double)
int ListEntryMatch(void *arg1, void *arg2)
void FreeKDTree(KDTREE *Tree)
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
uint16_t Bucket[BUCKETTABLESIZE]
void FreeStatistics(STATISTICS *Statistics)
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics)
void KDStore(KDTREE *Tree, float *Key, void *Data)
void CreateClusterTree(CLUSTERER *Clusterer)
uint16_t OptimumNumberOfBuckets(uint32_t SampleCount)
LIST push(LIST list, void *element)
void FreeBuckets(BUCKETS *Buckets)
bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence)
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
DISTRIBUTION Distribution
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
void InitBuckets(BUCKETS *Buckets)
STATISTICS * ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
PROTOTYPE * NewSimpleProto(int16_t N, CLUSTER *Cluster)
void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount)
CHISTRUCT * NewChiStruct(uint16_t DegreesOfFreedom, double Alpha)
void FreePrototype(void *arg)
uint16_t DegreesOfFreedom
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, double Confidence)
double ChiArea(CHISTRUCT *ChiParams, double x)
#define MAXDEGREESOFFREEDOM
void FreeProtoList(LIST *ProtoList)
void destroy_nodes(LIST list, void_dest destructor)
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level)
void KDDelete(KDTREE *Tree, float Key[], void *Data)
int NumBucketsMatch(void *arg1, void *arg2)
void FreeCluster(CLUSTER *Cluster)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
LIST search(LIST list, void *key, int_compare is_equal)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
DLLSYM void tprintf(const char *format,...)
void KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance, int *NumberOfResults, void **NBuffer, float DBuffer[])
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
PROTOTYPE * NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
double(* DENSITYFUNC)(int32_t)
#define InitSampleSearch(S, C)
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
tesseract::GenericHeap< ClusterPair > ClusterHeap
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, float Mean, float StdDev)
double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
double UniformDensity(int32_t x)
void FreeClusterer(CLUSTERER *Clusterer)
uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev)
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
int AlphaMatch(void *arg1, void *arg2)
uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets)
double NormalDensity(int32_t x)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
bool DistributionOK(BUCKETS *Buckets)
void KDWalk(KDTREE *Tree, void_proc action, void *context)
double InvertMatrix(const float *input, int size, float *inv)
CLUSTER * NextSample(LIST *SearchState)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
PROTOTYPE * NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics)