tesseract  5.0.0-alpha-619-ge9db
statistc.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.cpp (Formerly stats.c)
3  * Description: Simple statistical package for integer values.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "statistc.h"
25 #include <cstring>
26 #include <cmath>
27 #include <cstdlib>
28 #include "errcode.h"
29 #include <tesseract/helpers.h>
30 #include "scrollview.h"
31 #include "tprintf.h"
32 
34 
35 /**********************************************************************
36  * STATS::STATS
37  *
38  * Construct a new stats element by allocating and zeroing the memory.
39  **********************************************************************/
40 STATS::STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1) {
41  if (max_bucket_value_plus_1 <= min_bucket_value) {
42  min_bucket_value = 0;
43  max_bucket_value_plus_1 = 1;
44  }
45  rangemin_ = min_bucket_value; // setup
46  rangemax_ = max_bucket_value_plus_1;
47  buckets_ = new int32_t[rangemax_ - rangemin_];
48  clear();
49 }
50 
51 /**********************************************************************
52  * STATS::set_range
53  *
54  * Alter the range on an existing stats element.
55  **********************************************************************/
56 bool STATS::set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1) {
57  if (max_bucket_value_plus_1 <= min_bucket_value) {
58  return false;
59  }
60  if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
61  delete [] buckets_;
62  buckets_ = new int32_t[max_bucket_value_plus_1 - min_bucket_value];
63  }
64  rangemin_ = min_bucket_value; // setup
65  rangemax_ = max_bucket_value_plus_1;
66  clear(); // zero it
67  return true;
68 }
69 
70 /**********************************************************************
71  * STATS::clear
72  *
73  * Clear out the STATS class by zeroing all the buckets.
74  **********************************************************************/
75 void STATS::clear() { // clear out buckets
76  total_count_ = 0;
77  if (buckets_ != nullptr)
78  memset(buckets_, 0, (rangemax_ - rangemin_) * sizeof(buckets_[0]));
79 }
80 
81 /**********************************************************************
82  * STATS::~STATS
83  *
84  * Destructor for a stats class.
85  **********************************************************************/
86 STATS::~STATS() { delete[] buckets_; }
87 
88 /**********************************************************************
89  * STATS::add
90  *
91  * Add a set of samples to (or delete from) a pile.
92  **********************************************************************/
93 void STATS::add(int32_t value, int32_t count) {
94  if (buckets_ == nullptr) {
95  return;
96  }
97  value = ClipToRange(value, rangemin_, rangemax_ - 1);
98  buckets_[value - rangemin_] += count;
99  total_count_ += count; // keep count of total
100 }
101 
102 /**********************************************************************
103  * STATS::mode
104  *
105  * Find the mode of a stats class.
106  **********************************************************************/
107 int32_t STATS::mode() const { // get mode of samples
108  if (buckets_ == nullptr) {
109  return rangemin_;
110  }
111  int32_t max = buckets_[0]; // max cell count
112  int32_t maxindex = 0; // index of max
113  for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {
114  if (buckets_[index] > max) {
115  max = buckets_[index]; // find biggest
116  maxindex = index;
117  }
118  }
119  return maxindex + rangemin_; // index of biggest
120 }
121 
122 /**********************************************************************
123  * STATS::mean
124  *
125  * Find the mean of a stats class.
126  **********************************************************************/
127 double STATS::mean() const { //get mean of samples
128  if (buckets_ == nullptr || total_count_ <= 0) {
129  return static_cast<double>(rangemin_);
130  }
131  int64_t sum = 0;
132  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
133  sum += static_cast<int64_t>(index) * buckets_[index];
134  }
135  return static_cast<double>(sum) / total_count_ + rangemin_;
136 }
137 
138 /**********************************************************************
139  * STATS::sd
140  *
141  * Find the standard deviation of a stats class.
142  **********************************************************************/
143 double STATS::sd() const { //standard deviation
144  if (buckets_ == nullptr || total_count_ <= 0) {
145  return 0.0;
146  }
147  int64_t sum = 0;
148  double sqsum = 0.0;
149  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
150  sum += static_cast<int64_t>(index) * buckets_[index];
151  sqsum += static_cast<double>(index) * index * buckets_[index];
152  }
153  double variance = static_cast<double>(sum) / total_count_;
154  variance = sqsum / total_count_ - variance * variance;
155  if (variance > 0.0)
156  return sqrt(variance);
157  return 0.0;
158 }
159 
160 /**********************************************************************
161  * STATS::ile
162  *
163  * Returns the fractile value such that frac fraction (in [0,1]) of samples
164  * has a value less than the return value.
165  **********************************************************************/
166 double STATS::ile(double frac) const {
167  if (buckets_ == nullptr || total_count_ == 0) {
168  return static_cast<double>(rangemin_);
169  }
170 #if 0
171  // TODO(rays) The existing code doesn't seem to be doing the right thing
172  // with target a double but this substitute crashes the code that uses it.
173  // Investigate and fix properly.
174  int target = IntCastRounded(frac * total_count_);
175  target = ClipToRange(target, 1, total_count_);
176 #else
177  double target = frac * total_count_;
178  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
179 #endif
180  int sum = 0;
181  int index = 0;
182  for (index = 0; index < rangemax_ - rangemin_ && sum < target;
183  sum += buckets_[index++]);
184  if (index > 0) {
185  ASSERT_HOST(buckets_[index - 1] > 0);
186  return rangemin_ + index -
187  static_cast<double>(sum - target) / buckets_[index - 1];
188  } else {
189  return static_cast<double>(rangemin_);
190  }
191 }
192 
193 /**********************************************************************
194  * STATS::min_bucket
195  *
196  * Find REAL minimum bucket - ile(0.0) isn't necessarily correct
197  **********************************************************************/
198 int32_t STATS::min_bucket() const { // Find min
199  if (buckets_ == nullptr || total_count_ == 0) {
200  return rangemin_;
201  }
202  int32_t min = 0;
203  for (min = 0; (min < rangemax_ - rangemin_) && (buckets_[min] == 0); min++);
204  return rangemin_ + min;
205 }
206 
207 /**********************************************************************
208  * STATS::max_bucket
209  *
210  * Find REAL maximum bucket - ile(1.0) isn't necessarily correct
211  **********************************************************************/
212 
213 int32_t STATS::max_bucket() const { // Find max
214  if (buckets_ == nullptr || total_count_ == 0) {
215  return rangemin_;
216  }
217  int32_t max;
218  for (max = rangemax_ - rangemin_ - 1; max > 0 && buckets_[max] == 0; max--);
219  return rangemin_ + max;
220 }
221 
222 /**********************************************************************
223  * STATS::median
224  *
225  * Finds a more useful estimate of median than ile(0.5).
226  *
227  * Overcomes a problem with ile() - if the samples are, for example,
228  * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
229  * between 6 and 13 = 9.5
230  **********************************************************************/
231 double STATS::median() const { //get median
232  if (buckets_ == nullptr) {
233  return static_cast<double>(rangemin_);
234  }
235  double median = ile(0.5);
236  int median_pile = static_cast<int>(floor(median));
237  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
238  int32_t min_pile;
239  int32_t max_pile;
240  /* Find preceding non zero pile */
241  for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
242  /* Find following non zero pile */
243  for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);
244  median = (min_pile + max_pile) / 2.0;
245  }
246  return median;
247 }
248 
249 /**********************************************************************
250  * STATS::local_min
251  *
252  * Return true if this point is a local min.
253  **********************************************************************/
254 bool STATS::local_min(int32_t x) const {
255  if (buckets_ == nullptr) {
256  return false;
257  }
258  x = ClipToRange(x, rangemin_, rangemax_ - 1) - rangemin_;
259  if (buckets_[x] == 0)
260  return true;
261  int32_t index; // table index
262  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index);
263  if (index >= 0 && buckets_[index] < buckets_[x])
264  return false;
265  for (index = x + 1; index < rangemax_ - rangemin_ &&
266  buckets_[index] == buckets_[x]; ++index);
267  if (index < rangemax_ - rangemin_ && buckets_[index] < buckets_[x])
268  return false;
269  else
270  return true;
271 }
272 
273 /**********************************************************************
274  * STATS::smooth
275  *
276  * Apply a triangular smoothing filter to the stats.
277  * This makes the modes a bit more useful.
278  * The factor gives the height of the triangle, i.e. the weight of the
279  * centre.
280  **********************************************************************/
281 void STATS::smooth(int32_t factor) {
282  if (buckets_ == nullptr || factor < 2) {
283  return;
284  }
285  STATS result(rangemin_, rangemax_);
286  int entrycount = rangemax_ - rangemin_;
287  for (int entry = 0; entry < entrycount; entry++) {
288  //centre weight
289  int count = buckets_[entry] * factor;
290  for (int offset = 1; offset < factor; offset++) {
291  if (entry - offset >= 0)
292  count += buckets_[entry - offset] * (factor - offset);
293  if (entry + offset < entrycount)
294  count += buckets_[entry + offset] * (factor - offset);
295  }
296  result.add(entry + rangemin_, count);
297  }
298  total_count_ = result.total_count_;
299  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
300 }
301 
302 /**********************************************************************
303  * STATS::cluster
304  *
305  * Cluster the samples into max_cluster clusters.
306  * Each call runs one iteration. The array of clusters must be
307  * max_clusters+1 in size as cluster 0 is used to indicate which samples
308  * have been used.
309  * The return value is the current number of clusters.
310  **********************************************************************/
311 
312 int32_t STATS::cluster(float lower, // thresholds
313  float upper,
314  float multiple, // distance threshold
315  int32_t max_clusters, // max no to make
316  STATS *clusters) { // array of clusters
317  bool new_cluster; // added one
318  float *centres; // cluster centres
319  int32_t entry; // bucket index
320  int32_t cluster; // cluster index
321  int32_t best_cluster; // one to assign to
322  int32_t new_centre = 0; // residual mode
323  int32_t new_mode; // pile count of new_centre
324  int32_t count; // pile to place
325  float dist; // from cluster
326  float min_dist; // from best_cluster
327  int32_t cluster_count; // no of clusters
328 
329  if (buckets_ == nullptr || max_clusters < 1)
330  return 0;
331  centres = new float[max_clusters + 1];
332  for (cluster_count = 1; cluster_count <= max_clusters
333  && clusters[cluster_count].buckets_ != nullptr
334  && clusters[cluster_count].total_count_ > 0;
335  cluster_count++) {
336  centres[cluster_count] =
337  static_cast<float>(clusters[cluster_count].ile(0.5));
338  new_centre = clusters[cluster_count].mode();
339  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
340  && entry >= rangemin_
341  && pile_count(entry) <= pile_count(entry + 1);
342  entry--) {
343  count = pile_count(entry) - clusters[0].pile_count(entry);
344  if (count > 0) {
345  clusters[cluster_count].add(entry, count);
346  clusters[0].add (entry, count);
347  }
348  }
349  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
350  && entry < rangemax_
351  && pile_count(entry) <= pile_count(entry - 1);
352  entry++) {
353  count = pile_count(entry) - clusters[0].pile_count(entry);
354  if (count > 0) {
355  clusters[cluster_count].add(entry, count);
356  clusters[0].add(entry, count);
357  }
358  }
359  }
360  cluster_count--;
361 
362  if (cluster_count == 0) {
363  clusters[0].set_range(rangemin_, rangemax_);
364  }
365  do {
366  new_cluster = false;
367  new_mode = 0;
368  for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
369  count = buckets_[entry] - clusters[0].buckets_[entry];
370  //remaining pile
371  if (count > 0) { //any to handle
372  min_dist = static_cast<float>(INT32_MAX);
373  best_cluster = 0;
374  for (cluster = 1; cluster <= cluster_count; cluster++) {
375  dist = entry + rangemin_ - centres[cluster];
376  //find distance
377  if (dist < 0)
378  dist = -dist;
379  if (dist < min_dist) {
380  min_dist = dist; //find least
381  best_cluster = cluster;
382  }
383  }
384  if (min_dist > upper //far enough for new
385  && (best_cluster == 0
386  || entry + rangemin_ > centres[best_cluster] * multiple
387  || entry + rangemin_ < centres[best_cluster] / multiple)) {
388  if (count > new_mode) {
389  new_mode = count;
390  new_centre = entry + rangemin_;
391  }
392  }
393  }
394  }
395  // need new and room
396  if (new_mode > 0 && cluster_count < max_clusters) {
397  cluster_count++;
398  new_cluster = true;
399  if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
400  delete [] centres;
401  return 0;
402  }
403  centres[cluster_count] = static_cast<float>(new_centre);
404  clusters[cluster_count].add(new_centre, new_mode);
405  clusters[0].add(new_centre, new_mode);
406  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
407  && entry >= rangemin_
408  && pile_count (entry) <= pile_count(entry + 1); entry--) {
409  count = pile_count(entry) - clusters[0].pile_count(entry);
410  if (count > 0) {
411  clusters[cluster_count].add(entry, count);
412  clusters[0].add(entry, count);
413  }
414  }
415  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
416  && entry < rangemax_
417  && pile_count (entry) <= pile_count(entry - 1); entry++) {
418  count = pile_count(entry) - clusters[0].pile_count(entry);
419  if (count > 0) {
420  clusters[cluster_count].add(entry, count);
421  clusters[0].add (entry, count);
422  }
423  }
424  centres[cluster_count] =
425  static_cast<float>(clusters[cluster_count].ile(0.5));
426  }
427  } while (new_cluster && cluster_count < max_clusters);
428  delete [] centres;
429  return cluster_count;
430 }
431 
432 // Helper tests that the current index is still part of the peak and gathers
433 // the data into the peak, returning false when the peak is ended.
434 // src_buckets[index] - used_buckets[index] is the unused part of the histogram.
435 // prev_count is the histogram count of the previous index on entry and is
436 // updated to the current index on return.
437 // total_count and total_value are accumulating the mean of the peak.
438 static bool GatherPeak(int index, const int* src_buckets, int* used_buckets,
439  int* prev_count, int* total_count, double* total_value) {
440  int pile_count = src_buckets[index] - used_buckets[index];
441  if (pile_count <= *prev_count && pile_count > 0) {
442  // Accumulate count and index.count product.
443  *total_count += pile_count;
444  *total_value += index * pile_count;
445  // Mark this index as used
446  used_buckets[index] = src_buckets[index];
447  *prev_count = pile_count;
448  return true;
449  } else {
450  return false;
451  }
452 }
453 
454 // Finds (at most) the top max_modes modes, well actually the whole peak around
455 // each mode, returning them in the given modes vector as a <mean of peak,
456 // total count of peak> pair in order of decreasing total count.
457 // Since the mean is the key and the count the data in the pair, a single call
458 // to sort on the output will re-sort by increasing mean of peak if that is
459 // more useful than decreasing total count.
460 // Returns the actual number of modes found.
461 int STATS::top_n_modes(int max_modes,
462  GenericVector<KDPairInc<float, int> >* modes) const {
463  if (max_modes <= 0) return 0;
464  int src_count = rangemax_ - rangemin_;
465  // Used copies the counts in buckets_ as they get used.
466  STATS used(rangemin_, rangemax_);
467  modes->truncate(0);
468  // Total count of the smallest peak found so far.
469  int least_count = 1;
470  // Mode that is used as a seed for each peak
471  int max_count = 0;
472  do {
473  // Find an unused mode.
474  max_count = 0;
475  int max_index = 0;
476  for (int src_index = 0; src_index < src_count; src_index++) {
477  int pile_count = buckets_[src_index] - used.buckets_[src_index];
478  if (pile_count > max_count) {
479  max_count = pile_count;
480  max_index = src_index;
481  }
482  }
483  if (max_count > 0) {
484  // Copy the bucket count to used so it doesn't get found again.
485  used.buckets_[max_index] = max_count;
486  // Get the entire peak.
487  double total_value = max_index * max_count;
488  int total_count = max_count;
489  int prev_pile = max_count;
490  for (int offset = 1; max_index + offset < src_count; ++offset) {
491  if (!GatherPeak(max_index + offset, buckets_, used.buckets_,
492  &prev_pile, &total_count, &total_value))
493  break;
494  }
495  prev_pile = buckets_[max_index];
496  for (int offset = 1; max_index - offset >= 0; ++offset) {
497  if (!GatherPeak(max_index - offset, buckets_, used.buckets_,
498  &prev_pile, &total_count, &total_value))
499  break;
500  }
501  if (total_count > least_count || modes->size() < max_modes) {
502  // We definitely want this mode, so if we have enough discard the least.
503  if (modes->size() == max_modes)
504  modes->truncate(max_modes - 1);
505  int target_index = 0;
506  // Linear search for the target insertion point.
507  while (target_index < modes->size() &&
508  (*modes)[target_index].data >= total_count)
509  ++target_index;
510  auto peak_mean =
511  static_cast<float>(total_value / total_count + rangemin_);
512  modes->insert(KDPairInc<float, int>(peak_mean, total_count),
513  target_index);
514  least_count = modes->back().data;
515  }
516  }
517  } while (max_count > 0);
518  return modes->size();
519 }
520 
521 /**********************************************************************
522  * STATS::print
523  *
524  * Prints a summary and table of the histogram.
525  **********************************************************************/
526 void STATS::print() const {
527  if (buckets_ == nullptr) {
528  return;
529  }
530  int32_t min = min_bucket() - rangemin_;
531  int32_t max = max_bucket() - rangemin_;
532 
533  int num_printed = 0;
534  for (int index = min; index <= max; index++) {
535  if (buckets_[index] != 0) {
536  tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
537  if (++num_printed % 8 == 0)
538  tprintf ("\n");
539  }
540  }
541  tprintf ("\n");
542  print_summary();
543 }
544 
545 
546 
547 /**********************************************************************
548  * STATS::print_summary
549  *
550  * Print a summary of the stats.
551  **********************************************************************/
552 void STATS::print_summary() const {
553  if (buckets_ == nullptr) {
554  return;
555  }
556  int32_t min = min_bucket();
557  int32_t max = max_bucket();
558  tprintf("Total count=%d\n", total_count_);
559  tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
560  tprintf("Lower quartile=%.2f\n", ile(0.25));
561  tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
562  tprintf("Upper quartile=%.2f\n", ile(0.75));
563  tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
564  tprintf("Range=%d\n", max + 1 - min);
565  tprintf("Mean= %.2f\n", mean());
566  tprintf("SD= %.2f\n", sd());
567 }
568 
569 
570 /**********************************************************************
571  * STATS::plot
572  *
573  * Draw a histogram of the stats table.
574  **********************************************************************/
575 
576 #ifndef GRAPHICS_DISABLED
577 void STATS::plot(ScrollView* window, // to draw in
578  float xorigin, // bottom left
579  float yorigin,
580  float xscale, // one x unit
581  float yscale, // one y unit
582  ScrollView::Color colour) const { // colour to draw in
583  if (buckets_ == nullptr) {
584  return;
585  }
586  window->Pen(colour);
587 
588  for (int index = 0; index < rangemax_ - rangemin_; index++) {
589  window->Rectangle(xorigin + xscale * index, yorigin,
590  xorigin + xscale * (index + 1),
591  yorigin + yscale * buckets_[index]);
592  }
593 }
594 #endif
595 
596 
597 /**********************************************************************
598  * STATS::plotline
599  *
600  * Draw a histogram of the stats table. (Line only)
601  **********************************************************************/
602 
603 #ifndef GRAPHICS_DISABLED
604 void STATS::plotline(ScrollView* window, // to draw in
605  float xorigin, // bottom left
606  float yorigin,
607  float xscale, // one x unit
608  float yscale, // one y unit
609  ScrollView::Color colour) const { // colour to draw in
610  if (buckets_ == nullptr) {
611  return;
612  }
613  window->Pen(colour);
614  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
615  for (int index = 0; index < rangemax_ - rangemin_; index++) {
616  window->DrawTo(xorigin + xscale * index,
617  yorigin + yscale * buckets_[index]);
618  }
619 }
620 #endif
621 
622 
623 /**********************************************************************
624  * choose_nth_item
625  *
626  * Returns the index of what would b the nth item in the array
627  * if the members were sorted, without actually sorting.
628  **********************************************************************/
629 
630 int32_t choose_nth_item(int32_t index, float *array, int32_t count) {
631  int32_t next_sample; // next one to do
632  int32_t next_lesser; // space for new
633  int32_t prev_greater; // last one saved
634  int32_t equal_count; // no of equal ones
635  float pivot; // proposed median
636  float sample; // current sample
637 
638  if (count <= 1)
639  return 0;
640  if (count == 2) {
641  if (array[0] < array[1]) {
642  return index >= 1 ? 1 : 0;
643  }
644  else {
645  return index >= 1 ? 0 : 1;
646  }
647  }
648  else {
649  if (index < 0)
650  index = 0; // ensure legal
651  else if (index >= count)
652  index = count - 1;
653  equal_count = static_cast<int32_t>(rand() % count);
654  pivot = array[equal_count];
655  // fill gap
656  array[equal_count] = array[0];
657  next_lesser = 0;
658  prev_greater = count;
659  equal_count = 1;
660  for (next_sample = 1; next_sample < prev_greater;) {
661  sample = array[next_sample];
662  if (sample < pivot) {
663  // shuffle
664  array[next_lesser++] = sample;
665  next_sample++;
666  }
667  else if (sample > pivot) {
668  prev_greater--;
669  // juggle
670  array[next_sample] = array[prev_greater];
671  array[prev_greater] = sample;
672  }
673  else {
674  equal_count++;
675  next_sample++;
676  }
677  }
678  for (next_sample = next_lesser; next_sample < prev_greater;)
679  array[next_sample++] = pivot;
680  if (index < next_lesser)
681  return choose_nth_item (index, array, next_lesser);
682  else if (index < prev_greater)
683  return next_lesser; // in equal bracket
684  else
685  return choose_nth_item (index - prev_greater,
686  array + prev_greater,
687  count - prev_greater) + prev_greater;
688  }
689 }
690 
691 /**********************************************************************
692  * choose_nth_item
693  *
694  * Returns the index of what would be the nth item in the array
695  * if the members were sorted, without actually sorting.
696  **********************************************************************/
697 int32_t choose_nth_item(int32_t index, void *array, int32_t count, size_t size,
698  int (*compar)(const void*, const void*)) {
699  int result; // of compar
700  int32_t next_sample; // next one to do
701  int32_t next_lesser; // space for new
702  int32_t prev_greater; // last one saved
703  int32_t equal_count; // no of equal ones
704  int32_t pivot; // proposed median
705 
706  if (count <= 1)
707  return 0;
708  if (count == 2) {
709  if (compar (array, static_cast<char *>(array) + size) < 0) {
710  return index >= 1 ? 1 : 0;
711  }
712  else {
713  return index >= 1 ? 0 : 1;
714  }
715  }
716  if (index < 0)
717  index = 0; // ensure legal
718  else if (index >= count)
719  index = count - 1;
720  pivot = static_cast<int32_t>(rand () % count);
721  swap_entries (array, size, pivot, 0);
722  next_lesser = 0;
723  prev_greater = count;
724  equal_count = 1;
725  for (next_sample = 1; next_sample < prev_greater;) {
726  result =
727  compar (static_cast<char *>(array) + size * next_sample,
728  static_cast<char *>(array) + size * next_lesser);
729  if (result < 0) {
730  swap_entries (array, size, next_lesser++, next_sample++);
731  // shuffle
732  }
733  else if (result > 0) {
734  prev_greater--;
735  swap_entries(array, size, prev_greater, next_sample);
736  }
737  else {
738  equal_count++;
739  next_sample++;
740  }
741  }
742  if (index < next_lesser)
743  return choose_nth_item (index, array, next_lesser, size, compar);
744  else if (index < prev_greater)
745  return next_lesser; // in equal bracket
746  else
747  return choose_nth_item (index - prev_greater,
748  static_cast<char *>(array) + size * prev_greater,
749  count - prev_greater, size,
750  compar) + prev_greater;
751 }
752 
753 /**********************************************************************
754  * swap_entries
755  *
756  * Swap 2 entries of arbitrary size in-place in a table.
757  **********************************************************************/
758 void swap_entries(void *array, // array of entries
759  size_t size, // size of entry
760  int32_t index1, // entries to swap
761  int32_t index2) {
762  char tmp;
763  char *ptr1; // to entries
764  char *ptr2;
765  size_t count; // of bytes
766 
767  ptr1 = static_cast<char *>(array) + index1 * size;
768  ptr2 = static_cast<char *>(array) + index2 * size;
769  for (count = 0; count < size; count++) {
770  tmp = *ptr1;
771  *ptr1++ = *ptr2;
772  *ptr2++ = tmp; // tedious!
773  }
774 }
STATS::STATS
STATS()=default
choose_nth_item
int32_t choose_nth_item(int32_t index, float *array, int32_t count)
Definition: statistc.cpp:609
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
STATS::mean
double mean() const
Definition: statistc.cpp:119
ScrollView
Definition: scrollview.h:97
STATS::min_bucket
int32_t min_bucket() const
Definition: statistc.cpp:187
STATS::print_summary
void print_summary() const
Definition: statistc.cpp:534
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
STATS::top_n_modes
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:445
STATS::plotline
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584
STATS::max_bucket
int32_t max_bucket() const
Definition: statistc.cpp:201
STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
ScrollView::Pen
void Pen(Color color)
Definition: scrollview.cpp:717
ScrollView::DrawTo
void DrawTo(int x, int y)
Definition: scrollview.cpp:524
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
STATS::smooth
void smooth(int32_t factor)
Definition: statistc.cpp:266
statistc.h
swap_entries
void swap_entries(void *array, size_t size, int32_t index1, int32_t index2)
Definition: statistc.cpp:735
STATS::sd
double sd() const
Definition: statistc.cpp:134
STATS::plot
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:558
helpers.h
STATS::median
double median() const
Definition: statistc.cpp:218
STATS
Definition: statistc.h:30
tprintf.h
STATS::~STATS
~STATS()
Definition: statistc.cpp:81
sample
Definition: cluster.h:31
GenericVector
Definition: baseapi.h:40
tesseract::KDPairInc
Definition: kdpair.h:51
STATS::mode
int32_t mode() const
Definition: statistc.cpp:100
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
STATS::local_min
bool local_min(int32_t x) const
Definition: statistc.cpp:240
count
int count(LIST var_list)
Definition: oldlist.cpp:79
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h
ScrollView::SetCursor
void SetCursor(int x, int y)
Definition: scrollview.cpp:518
ScrollView::Color
Color
Definition: scrollview.h:100
ScrollView::Rectangle
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:599
STATS::cluster
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:296
scrollview.h
STATS::set_range
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:53
STATS::print
void print() const
Definition: statistc.cpp:509
STATS::clear
void clear()
Definition: statistc.cpp:71