tesseract  5.0.0-alpha-619-ge9db
STATS Class Reference

#include <statistc.h>

Public Member Functions

 STATS (int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
 
 STATS ()=default
 
 ~STATS ()
 
bool set_range (int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
 
void clear ()
 
void add (int32_t value, int32_t count)
 
int32_t mode () const
 
double mean () const
 
double sd () const
 
double ile (double frac) const
 
int32_t min_bucket () const
 
int32_t max_bucket () const
 
double median () const
 
int32_t pile_count (int32_t value) const
 
int32_t get_total () const
 
bool local_min (int32_t x) const
 
void smooth (int32_t factor)
 
int32_t cluster (float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
 
int top_n_modes (int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
 
void print () const
 
void print_summary () const
 
void plot (ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
 
void plotline (ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
 

Detailed Description

Definition at line 30 of file statistc.h.

Constructor & Destructor Documentation

◆ STATS() [1/2]

STATS::STATS ( int32_t  min_bucket_value,
int32_t  max_bucket_value_plus_1 
)

Definition at line 38 of file statistc.cpp.

40  {
41  if (max_bucket_value_plus_1 <= min_bucket_value) {
42  min_bucket_value = 0;
43  max_bucket_value_plus_1 = 1;
44  }
45  rangemin_ = min_bucket_value; // setup
46  rangemax_ = max_bucket_value_plus_1;
47  buckets_ = new int32_t[rangemax_ - rangemin_];

◆ STATS() [2/2]

STATS::STATS ( )
default

◆ ~STATS()

STATS::~STATS ( )

Definition at line 81 of file statistc.cpp.

Member Function Documentation

◆ add()

void STATS::add ( int32_t  value,
int32_t  count 
)

Definition at line 87 of file statistc.cpp.

93  {
94  if (buckets_ == nullptr) {

◆ clear()

void STATS::clear ( )

Definition at line 71 of file statistc.cpp.

75  { // clear out buckets

◆ cluster()

int32_t STATS::cluster ( float  lower,
float  upper,
float  multiple,
int32_t  max_clusters,
STATS *  clusters 
)

Definition at line 296 of file statistc.cpp.

316  { // array of clusters
317  bool new_cluster; // added one
318  float *centres; // cluster centres
319  int32_t entry; // bucket index
320  int32_t cluster; // cluster index
321  int32_t best_cluster; // one to assign to
322  int32_t new_centre = 0; // residual mode
323  int32_t new_mode; // pile count of new_centre
324  int32_t count; // pile to place
325  float dist; // from cluster
326  float min_dist; // from best_cluster
327  int32_t cluster_count; // no of clusters
328 
329  if (buckets_ == nullptr || max_clusters < 1)
330  return 0;
331  centres = new float[max_clusters + 1];
332  for (cluster_count = 1; cluster_count <= max_clusters
333  && clusters[cluster_count].buckets_ != nullptr
334  && clusters[cluster_count].total_count_ > 0;
335  cluster_count++) {
336  centres[cluster_count] =
337  static_cast<float>(clusters[cluster_count].ile(0.5));
338  new_centre = clusters[cluster_count].mode();
339  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
340  && entry >= rangemin_
341  && pile_count(entry) <= pile_count(entry + 1);
342  entry--) {
343  count = pile_count(entry) - clusters[0].pile_count(entry);
344  if (count > 0) {
345  clusters[cluster_count].add(entry, count);
346  clusters[0].add (entry, count);
347  }
348  }
349  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
350  && entry < rangemax_
351  && pile_count(entry) <= pile_count(entry - 1);
352  entry++) {
353  count = pile_count(entry) - clusters[0].pile_count(entry);
354  if (count > 0) {
355  clusters[cluster_count].add(entry, count);
356  clusters[0].add(entry, count);
357  }
358  }
359  }
360  cluster_count--;
361 
362  if (cluster_count == 0) {
363  clusters[0].set_range(rangemin_, rangemax_);
364  }
365  do {
366  new_cluster = false;
367  new_mode = 0;
368  for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
369  count = buckets_[entry] - clusters[0].buckets_[entry];
370  //remaining pile
371  if (count > 0) { //any to handle
372  min_dist = static_cast<float>(INT32_MAX);
373  best_cluster = 0;
374  for (cluster = 1; cluster <= cluster_count; cluster++) {
375  dist = entry + rangemin_ - centres[cluster];
376  //find distance
377  if (dist < 0)
378  dist = -dist;
379  if (dist < min_dist) {
380  min_dist = dist; //find least
381  best_cluster = cluster;
382  }
383  }
384  if (min_dist > upper //far enough for new
385  && (best_cluster == 0
386  || entry + rangemin_ > centres[best_cluster] * multiple
387  || entry + rangemin_ < centres[best_cluster] / multiple)) {
388  if (count > new_mode) {
389  new_mode = count;
390  new_centre = entry + rangemin_;
391  }
392  }
393  }
394  }
395  // need new and room
396  if (new_mode > 0 && cluster_count < max_clusters) {
397  cluster_count++;
398  new_cluster = true;
399  if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
400  delete [] centres;
401  return 0;
402  }
403  centres[cluster_count] = static_cast<float>(new_centre);
404  clusters[cluster_count].add(new_centre, new_mode);
405  clusters[0].add(new_centre, new_mode);
406  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
407  && entry >= rangemin_
408  && pile_count (entry) <= pile_count(entry + 1); entry--) {
409  count = pile_count(entry) - clusters[0].pile_count(entry);
410  if (count > 0) {
411  clusters[cluster_count].add(entry, count);
412  clusters[0].add(entry, count);
413  }
414  }

◆ get_total()

int32_t STATS::get_total ( ) const
inline

Definition at line 83 of file statistc.h.

84  {
85  return total_count_; // total of all piles

◆ ile()

double STATS::ile ( double  frac) const

Definition at line 156 of file statistc.cpp.

166  {
167  if (buckets_ == nullptr || total_count_ == 0) {
168  return static_cast<double>(rangemin_);
169  }
170 #if 0
171  // TODO(rays) The existing code doesn't seem to be doing the right thing
172  // with target a double but this substitute crashes the code that uses it.
173  // Investigate and fix properly.
174  int target = IntCastRounded(frac * total_count_);
175  target = ClipToRange(target, 1, total_count_);
176 #else
177  double target = frac * total_count_;
178  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
179 #endif
180  int sum = 0;
181  int index = 0;

◆ local_min()

bool STATS::local_min ( int32_t  x) const

Definition at line 240 of file statistc.cpp.

254  {
255  if (buckets_ == nullptr) {
256  return false;
257  }

◆ max_bucket()

int32_t STATS::max_bucket ( ) const

Definition at line 201 of file statistc.cpp.

◆ mean()

double STATS::mean ( ) const

Definition at line 119 of file statistc.cpp.

127  { //get mean of samples
128  if (buckets_ == nullptr || total_count_ <= 0) {

◆ median()

double STATS::median ( ) const

Definition at line 218 of file statistc.cpp.

231  { //get median
232  if (buckets_ == nullptr) {
233  return static_cast<double>(rangemin_);
234  }

◆ min_bucket()

int32_t STATS::min_bucket ( ) const

Definition at line 187 of file statistc.cpp.

188  {
189  return static_cast<double>(rangemin_);
190  }
191 }
192 
193 /**********************************************************************
194  * STATS::min_bucket

◆ mode()

int32_t STATS::mode ( ) const

Definition at line 100 of file statistc.cpp.

107  { // get mode of samples
108  if (buckets_ == nullptr) {
109  return rangemin_;
110  }
111  int32_t max = buckets_[0]; // max cell count
112  int32_t maxindex = 0; // index of max
113  for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {

◆ pile_count()

int32_t STATS::pile_count ( int32_t  value) const
inline

Definition at line 75 of file statistc.h.

76  {
77  if (value <= rangemin_)
78  return buckets_[0];
79  if (value >= rangemax_ - 1)
80  return buckets_[rangemax_ - rangemin_ - 1];
81  return buckets_[value - rangemin_];

◆ plot()

void STATS::plot ( ScrollView *  window,
float  xorigin,
float  yorigin,
float  xscale,
float  yscale,
ScrollView::Color  colour 
) const

Definition at line 558 of file statistc.cpp.

◆ plotline()

void STATS::plotline ( ScrollView *  window,
float  xorigin,
float  yorigin,
float  xscale,
float  yscale,
ScrollView::Color  colour 
) const

Definition at line 584 of file statistc.cpp.

588  {
589  window->Rectangle(xorigin + xscale * index, yorigin,
590  xorigin + xscale * (index + 1),
591  yorigin + yscale * buckets_[index]);
592  }
593 }
594 #endif
595 
596 
597 /**********************************************************************
598  * STATS::plotline
599  *

◆ print()

void STATS::print ( ) const

Definition at line 509 of file statistc.cpp.

526  {

◆ print_summary()

void STATS::print_summary ( ) const

Definition at line 534 of file statistc.cpp.

534  {
535  if (buckets_[index] != 0) {
536  tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
537  if (++num_printed % 8 == 0)
538  tprintf ("\n");
539  }
540  }
541  tprintf ("\n");
542  print_summary();
543 }
544 
545 
546 
547 /**********************************************************************
548  * STATS::print_summary
549  *

◆ sd()

double STATS::sd ( ) const

Definition at line 134 of file statistc.cpp.

143  { //standard deviation
144  if (buckets_ == nullptr || total_count_ <= 0) {
145  return 0.0;
146  }
147  int64_t sum = 0;
148  double sqsum = 0.0;
149  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {

◆ set_range()

bool STATS::set_range ( int32_t  min_bucket_value,
int32_t  max_bucket_value_plus_1 
)

Definition at line 53 of file statistc.cpp.

56  {
57  if (max_bucket_value_plus_1 <= min_bucket_value) {
58  return false;
59  }
60  if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
61  delete [] buckets_;
62  buckets_ = new int32_t[max_bucket_value_plus_1 - min_bucket_value];
63  }
64  rangemin_ = min_bucket_value; // setup
65  rangemax_ = max_bucket_value_plus_1;

◆ smooth()

void STATS::smooth ( int32_t  factor)

Definition at line 266 of file statistc.cpp.

281  {
282  if (buckets_ == nullptr || factor < 2) {
283  return;
284  }
285  STATS result(rangemin_, rangemax_);

◆ top_n_modes()

int STATS::top_n_modes ( int  max_modes,
GenericVector< tesseract::KDPairInc< float, int > > *  modes 
) const

Definition at line 445 of file statistc.cpp.

449  {
450  return false;
451  }
452 }
453 
454 // Finds (at most) the top max_modes modes, well actually the whole peak around
455 // each mode, returning them in the given modes vector as a <mean of peak,
456 // total count of peak> pair in order of decreasing total count.
457 // Since the mean is the key and the count the data in the pair, a single call
458 // to sort on the output will re-sort by increasing mean of peak if that is
459 // more useful than decreasing total count.
460 // Returns the actual number of modes found.
461 int STATS::top_n_modes(int max_modes,
462  GenericVector<KDPairInc<float, int> >* modes) const {
463  if (max_modes <= 0) return 0;
464  int src_count = rangemax_ - rangemin_;
465  // Used copies the counts in buckets_ as they get used.
466  STATS used(rangemin_, rangemax_);
467  modes->truncate(0);
468  // Total count of the smallest peak found so far.
469  int least_count = 1;
470  // Mode that is used as a seed for each peak
471  int max_count = 0;
472  do {
473  // Find an unused mode.
474  max_count = 0;
475  int max_index = 0;
476  for (int src_index = 0; src_index < src_count; src_index++) {
477  int pile_count = buckets_[src_index] - used.buckets_[src_index];
478  if (pile_count > max_count) {
479  max_count = pile_count;
480  max_index = src_index;
481  }
482  }
483  if (max_count > 0) {
484  // Copy the bucket count to used so it doesn't get found again.
485  used.buckets_[max_index] = max_count;
486  // Get the entire peak.
487  double total_value = max_index * max_count;
488  int total_count = max_count;
489  int prev_pile = max_count;
490  for (int offset = 1; max_index + offset < src_count; ++offset) {
491  if (!GatherPeak(max_index + offset, buckets_, used.buckets_,
492  &prev_pile, &total_count, &total_value))
493  break;
494  }
495  prev_pile = buckets_[max_index];
496  for (int offset = 1; max_index - offset >= 0; ++offset) {
497  if (!GatherPeak(max_index - offset, buckets_, used.buckets_,
498  &prev_pile, &total_count, &total_value))
499  break;
500  }
501  if (total_count > least_count || modes->size() < max_modes) {
502  // We definitely want this mode, so if we have enough discard the least.
503  if (modes->size() == max_modes)

The documentation for this class was generated from the following files:
statistc.h
statistc.cpp
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
STATS::print_summary
void print_summary() const
Definition: statistc.cpp:534
STATS::top_n_modes
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:445
STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
STATS
Definition: statistc.h:30
GenericVector
Definition: baseapi.h:40
tesseract::KDPairInc
Definition: kdpair.h:51
STATS::mode
int32_t mode() const
Definition: statistc.cpp:100
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
count
int count(LIST var_list)
Definition: oldlist.cpp:79
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
ScrollView::Rectangle
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:599
GenericVector::size
int size() const
Definition: genericvector.h:71
STATS::cluster
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:296
STATS::set_range
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:53