tesseract  5.0.0-alpha-619-ge9db
statistc.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.h (Formerly stats.h)
3  * Description: Class description for STATS class.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
20 #define TESSERACT_CCSTRUCT_STATISTC_H_
21 
22 #include <cstdio>
23 #include "kdpair.h"
24 #include "scrollview.h"
25 
26 template <typename T> class GenericVector;
27 
28 
29 // Simple histogram-based statistics for integer values in a known
30 // range, such that the range is small compared to the number of samples.
31 class STATS {
32  public:
33  // The histogram buckets are in the range
34  // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
35  // [min_bucket_value, max_bucket_value].
36  // Any data under min_bucket value is silently mapped to min_bucket_value,
37  // and likewise, any data over max_bucket_value is silently mapped to
38  // max_bucket_value.
39  // In the internal array, min_bucket_value maps to 0 and
40  // max_bucket_value_plus_1 - min_bucket_value to the array size.
41  // TODO(rays) This is ugly. Convert the second argument to
42  // max_bucket_value and all the code that uses it.
43  STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
44  STATS() = default; // empty for arrays
45 
46  ~STATS();
47 
48  // (Re)Sets the range and clears the counts.
49  // See the constructor for info on max and min values.
50  bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
51 
52  void clear(); // empty buckets
53 
54  void add(int32_t value, int32_t count);
55 
56  // "Accessors" return various statistics on the data.
57  int32_t mode() const; // get mode of samples
58  double mean() const; // get mean of samples
59  double sd() const; // standard deviation
60  // Returns the fractile value such that frac fraction (in [0,1]) of samples
61  // has a value less than the return value.
62  double ile(double frac) const;
63  // Returns the minimum used entry in the histogram (ie the minimum of the
64  // data, NOT the minimum of the supplied range, nor is it an index.)
65  // Would normally be called min(), but that is a reserved word in VC++.
66  int32_t min_bucket() const; // Find min
67  // Returns the maximum used entry in the histogram (ie the maximum of the
68  // data, NOT the maximum of the supplied range, nor is it an index.)
69  int32_t max_bucket() const; // Find max
70  // Finds a more useful estimate of median than ile(0.5).
71  // Overcomes a problem with ile() - if the samples are, for example,
72  // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
73  // between 6 and 13 = 9.5
74  double median() const; // get median of samples
75  // Returns the count of the given value.
76  int32_t pile_count(int32_t value) const {
77  if (value <= rangemin_)
78  return buckets_[0];
79  if (value >= rangemax_ - 1)
80  return buckets_[rangemax_ - rangemin_ - 1];
81  return buckets_[value - rangemin_];
82  }
83  // Returns the total count of all buckets.
84  int32_t get_total() const {
85  return total_count_; // total of all piles
86  }
87  // Returns true if x is a local min.
88  bool local_min(int32_t x) const;
89 
90  // Apply a triangular smoothing filter to the stats.
91  // This makes the modes a bit more useful.
92  // The factor gives the height of the triangle, i.e. the weight of the
93  // centre.
94  void smooth(int32_t factor);
95 
96  // Cluster the samples into max_cluster clusters.
97  // Each call runs one iteration. The array of clusters must be
98  // max_clusters+1 in size as cluster 0 is used to indicate which samples
99  // have been used.
100  // The return value is the current number of clusters.
101  int32_t cluster(float lower, // thresholds
102  float upper,
103  float multiple, // distance threshold
104  int32_t max_clusters, // max no to make
105  STATS *clusters); // array of clusters
106 
107 // Finds (at most) the top max_modes modes, well actually the whole peak around
108 // each mode, returning them in the given modes vector as a <mean of peak,
109 // total count of peak> pair in order of decreasing total count.
110 // Since the mean is the key and the count the data in the pair, a single call
111 // to sort on the output will re-sort by increasing mean of peak if that is
112 // more useful than decreasing total count.
113 // Returns the actual number of modes found.
114  int top_n_modes(
115  int max_modes,
117 
118  // Prints a summary and table of the histogram.
119  void print() const;
120  // Prints summary stats only of the histogram.
121  void print_summary() const;
122 
123  #ifndef GRAPHICS_DISABLED
124  // Draws the histogram as a series of rectangles.
125  void plot(ScrollView* window, // window to draw in
126  float xorigin, // origin of histo
127  float yorigin, // gram
128  float xscale, // size of one unit
129  float yscale, // size of one uint
130  ScrollView::Color colour) const; // colour to draw in
131 
132  // Draws a line graph of the histogram.
133  void plotline(ScrollView* window, // window to draw in
134  float xorigin, // origin of histo
135  float yorigin, // gram
136  float xscale, // size of one unit
137  float yscale, // size of one uint
138  ScrollView::Color colour) const; // colour to draw in
139  #endif // GRAPHICS_DISABLED
140 
141  private:
142  int32_t rangemin_ = 0; // min of range
143  // rangemax_ is not well named as it is really one past the max.
144  int32_t rangemax_ = 0; // max of range
145  int32_t total_count_ = 0; // no of samples
146  int32_t* buckets_ = nullptr; // array of cells
147 };
148 
// Selects the item that would occupy position `index` if `array` were
// sorted, without fully sorting it, in linear time.
// Side effect: the elements of `array` are shuffled.
//   index - index to choose
//   array - array of items
//   count - no of items
// NOTE(review): the return type is int32_t although the items are floats;
// presumably a position within the shuffled array - confirm against the
// definition.
int32_t choose_nth_item(int32_t index, float* array, int32_t count);
// Generic overload of choose_nth_item for untyped arrays; ordering is given
// by a caller-supplied comparator with qsort semantics.
//   index  - index to choose
//   array  - array of items
//   count  - no of items
//   size   - element size
//   compar - comparator
int32_t choose_nth_item(int32_t index, void* array, int32_t count, size_t size,
                        int (*compar)(const void*, const void*));
// Exchanges two entries of an array in place.
//   array          - array of entries
//   size           - size of entry
//   index1, index2 - entries to swap
void swap_entries(void* array, size_t size, int32_t index1, int32_t index2);
166 
167 #endif // TESSERACT_CCSTRUCT_STATISTC_H_
STATS::STATS
STATS()=default
STATS::get_total
int32_t get_total() const
Definition: statistc.h:83
STATS::mean
double mean() const
Definition: statistc.cpp:119
ScrollView
Definition: scrollview.h:97
STATS::min_bucket
int32_t min_bucket() const
Definition: statistc.cpp:187
STATS::print_summary
void print_summary() const
Definition: statistc.cpp:534
choose_nth_item
int32_t choose_nth_item(int32_t index, float *array, int32_t count)
Definition: statistc.cpp:609
STATS::top_n_modes
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:445
STATS::plotline
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584
STATS::max_bucket
int32_t max_bucket() const
Definition: statistc.cpp:201
STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
STATS::smooth
void smooth(int32_t factor)
Definition: statistc.cpp:266
swap_entries
void swap_entries(void *array, size_t size, int32_t index1, int32_t index2)
Definition: statistc.cpp:735
STATS::sd
double sd() const
Definition: statistc.cpp:134
STATS::plot
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:558
kdpair.h
STATS::median
double median() const
Definition: statistc.cpp:218
STATS
Definition: statistc.h:30
STATS::~STATS
~STATS()
Definition: statistc.cpp:81
GenericVector
Definition: baseapi.h:40
tesseract::KDPairInc
Definition: kdpair.h:51
STATS::mode
int32_t mode() const
Definition: statistc.cpp:100
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
STATS::local_min
bool local_min(int32_t x) const
Definition: statistc.cpp:240
count
int count(LIST var_list)
Definition: oldlist.cpp:79
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
ScrollView::Color
Color
Definition: scrollview.h:100
STATS::cluster
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:296
scrollview.h
STATS::set_range
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:53
STATS::print
void print() const
Definition: statistc.cpp:509
STATS::clear
void clear()
Definition: statistc.cpp:71