tesseract  5.0.0-alpha-619-ge9db
detlinefit.cpp
Go to the documentation of this file.
1 // File: detlinefit.cpp
3 // Description: Deterministic least median squares line fitting.
4 // Author: Ray Smith
5 // Created: Thu Feb 28 14:45:01 PDT 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "detlinefit.h"
21 #include "statistc.h"
22 #include "tprintf.h"
23 
24 #include <algorithm>
25 #include <cfloat> // for FLT_MAX
26 
27 namespace tesseract {
28 
29 // The number of points to consider at each end.
30 const int kNumEndPoints = 3;
31 // The minimum number of points at which to switch to number of points
32 // for badly fitted lines.
33 // To ensure a sensible error metric, kMinPointsForErrorCount should be at
34 // least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
35 // ComputeUpperQuartileError.
36 const int kMinPointsForErrorCount = 16;
37 // The maximum real distance to use before switching to number of
38 // mis-fitted points, which will get square-rooted for true distance.
39 const int kMaxRealDistance = 2.0;
40 
41 DetLineFit::DetLineFit() : square_length_(0.0) {
42 }
43 
44 // Delete all Added points.
46  pts_.clear();
47  distances_.clear();
48 }
49 
50 // Add a new point. Takes a copy - the pt doesn't need to stay in scope.
51 void DetLineFit::Add(const ICOORD& pt) {
52  pts_.push_back(PointWidth(pt, 0));
53 }
54 // Associates a half-width with the given point if a point overlaps the
55 // previous point by more than half the width, and its distance is further
56 // than the previous point, then the more distant point is ignored in the
57 // distance calculation. Useful for ignoring i dots and other diacritics.
58 void DetLineFit::Add(const ICOORD& pt, int halfwidth) {
59  pts_.push_back(PointWidth(pt, halfwidth));
60 }
61 
62 // Fits a line to the points, ignoring the skip_first initial points and the
63 // skip_last final points, returning the fitted line as a pair of points,
64 // and the upper quartile error.
65 double DetLineFit::Fit(int skip_first, int skip_last,
66  ICOORD* pt1, ICOORD* pt2) {
67  // Do something sensible with no points.
68  if (pts_.empty()) {
69  pt1->set_x(0);
70  pt1->set_y(0);
71  *pt2 = *pt1;
72  return 0.0;
73  }
74  // Count the points and find the first and last kNumEndPoints.
75  int pt_count = pts_.size();
76  ICOORD* starts[kNumEndPoints];
77  if (skip_first >= pt_count) skip_first = pt_count - 1;
78  int start_count = 0;
79  int end_i = std::min(skip_first + kNumEndPoints, pt_count);
80  for (int i = skip_first; i < end_i; ++i) {
81  starts[start_count++] = &pts_[i].pt;
82  }
83  ICOORD* ends[kNumEndPoints];
84  if (skip_last >= pt_count) skip_last = pt_count - 1;
85  int end_count = 0;
86  end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
87  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
88  ends[end_count++] = &pts_[i].pt;
89  }
90  // 1 or 2 points need special treatment.
91  if (pt_count <= 2) {
92  *pt1 = *starts[0];
93  if (pt_count > 1)
94  *pt2 = *ends[0];
95  else
96  *pt2 = *pt1;
97  return 0.0;
98  }
99  // Although with between 2 and 2*kNumEndPoints-1 points, there will be
100  // overlap in the starts, ends sets, this is OK and taken care of by the
101  // if (*start != *end) test below, which also tests for equal input points.
102  double best_uq = -1.0;
103  // Iterate each pair of points and find the best fitting line.
104  for (int i = 0; i < start_count; ++i) {
105  ICOORD* start = starts[i];
106  for (int j = 0; j < end_count; ++j) {
107  ICOORD* end = ends[j];
108  if (*start != *end) {
109  ComputeDistances(*start, *end);
110  // Compute the upper quartile error from the line.
111  double dist = EvaluateLineFit();
112  if (dist < best_uq || best_uq < 0.0) {
113  best_uq = dist;
114  *pt1 = *start;
115  *pt2 = *end;
116  }
117  }
118  }
119  }
120  // Finally compute the square root to return the true distance.
121  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
122 }
123 
124 // Constrained fit with a supplied direction vector. Finds the best line_pt,
125 // that is one of the supplied points having the median cross product with
126 // direction, ignoring points that have a cross product outside of the range
127 // [min_dist, max_dist]. Returns the resulting error metric using the same
128 // reduced set of points.
129 // *Makes use of floating point arithmetic*
130 double DetLineFit::ConstrainedFit(const FCOORD& direction,
131  double min_dist, double max_dist,
132  bool debug, ICOORD* line_pt) {
133  ComputeConstrainedDistances(direction, min_dist, max_dist);
134  // Do something sensible with no points or computed distances.
135  if (pts_.empty() || distances_.empty()) {
136  line_pt->set_x(0);
137  line_pt->set_y(0);
138  return 0.0;
139  }
140  int median_index = distances_.choose_nth_item(distances_.size() / 2);
141  *line_pt = distances_[median_index].data;
142  if (debug) {
143  tprintf("Constrained fit to dir %g, %g = %d, %d :%d distances:\n",
144  direction.x(), direction.y(),
145  line_pt->x(), line_pt->y(), distances_.size());
146  for (int i = 0; i < distances_.size(); ++i) {
147  tprintf("%d: %d, %d -> %g\n", i, distances_[i].data.x(),
148  distances_[i].data.y(), distances_[i].key);
149  }
150  tprintf("Result = %d\n", median_index);
151  }
152  // Center distances on the fitted point.
153  double dist_origin = direction * *line_pt;
154  for (int i = 0; i < distances_.size(); ++i) {
155  distances_[i].key -= dist_origin;
156  }
157  return sqrt(EvaluateLineFit());
158 }
159 
160 // Returns true if there were enough points at the last call to Fit or
161 // ConstrainedFit for the fitted points to be used on a badly fitted line.
163  return distances_.size() >= kMinPointsForErrorCount;
164 }
165 
166 // Backwards compatible fit returning a gradient and constant.
167 // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
168 // function in preference to the LMS class.
169 double DetLineFit::Fit(float* m, float* c) {
170  ICOORD start, end;
171  double error = Fit(&start, &end);
172  if (end.x() != start.x()) {
173  *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
174  *c = start.y() - *m * start.x();
175  } else {
176  *m = 0.0f;
177  *c = 0.0f;
178  }
179  return error;
180 }
181 
182 // Backwards compatible constrained fit with a supplied gradient.
183 // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
184 // to avoid potential difficulties with infinite gradients.
185 double DetLineFit::ConstrainedFit(double m, float* c) {
186  // Do something sensible with no points.
187  if (pts_.empty()) {
188  *c = 0.0f;
189  return 0.0;
190  }
191  double cos = 1.0 / sqrt(1.0 + m * m);
192  FCOORD direction(cos, m * cos);
193  ICOORD line_pt;
194  double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);
195  *c = line_pt.y() - line_pt.x() * m;
196  return error;
197 }
198 
199 // Computes and returns the squared evaluation metric for a line fit.
200 double DetLineFit::EvaluateLineFit() {
201  // Compute the upper quartile error from the line.
202  double dist = ComputeUpperQuartileError();
203  if (distances_.size() >= kMinPointsForErrorCount &&
205  // Use the number of mis-fitted points as the error metric, as this
206  // gives a better measure of fit for badly fitted lines where more
207  // than a quarter are badly fitted.
208  double threshold = kMaxRealDistance * sqrt(square_length_);
209  dist = NumberOfMisfittedPoints(threshold);
210  }
211  return dist;
212 }
213 
214 // Computes the absolute error distances of the points from the line,
215 // and returns the squared upper-quartile error distance.
216 double DetLineFit::ComputeUpperQuartileError() {
217  int num_errors = distances_.size();
218  if (num_errors == 0) return 0.0;
219  // Get the absolute values of the errors.
220  for (int i = 0; i < num_errors; ++i) {
221  if (distances_[i].key < 0) distances_[i].key = -distances_[i].key;
222  }
223  // Now get the upper quartile distance.
224  int index = distances_.choose_nth_item(3 * num_errors / 4);
225  double dist = distances_[index].key;
226  // The true distance is the square root of the dist squared / square_length.
227  // Don't bother with the square root. Just return the square distance.
228  return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
229 }
230 
231 // Returns the number of sample points that have an error more than threshold.
232 int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
233  int num_misfits = 0;
234  int num_dists = distances_.size();
235  // Get the absolute values of the errors.
236  for (int i = 0; i < num_dists; ++i) {
237  if (distances_[i].key > threshold)
238  ++num_misfits;
239  }
240  return num_misfits;
241 }
242 
243 // Computes all the cross product distances of the points from the line,
244 // storing the actual (signed) cross products in distances.
245 // Ignores distances of points that are further away than the previous point,
246 // and overlaps the previous point by at least half.
247 void DetLineFit::ComputeDistances(const ICOORD& start, const ICOORD& end) {
248  distances_.truncate(0);
249  ICOORD line_vector = end;
250  line_vector -= start;
251  square_length_ = line_vector.sqlength();
252  int line_length = IntCastRounded(sqrt(square_length_));
253  // Compute the distance of each point from the line.
254  int prev_abs_dist = 0;
255  int prev_dot = 0;
256  for (int i = 0; i < pts_.size(); ++i) {
257  ICOORD pt_vector = pts_[i].pt;
258  pt_vector -= start;
259  int dot = line_vector % pt_vector;
260  // Compute |line_vector||pt_vector|sin(angle between)
261  int dist = line_vector * pt_vector;
262  int abs_dist = dist < 0 ? -dist : dist;
263  if (abs_dist > prev_abs_dist && i > 0) {
264  // Ignore this point if it overlaps the previous one.
265  int separation = abs(dot - prev_dot);
266  if (separation < line_length * pts_[i].halfwidth ||
267  separation < line_length * pts_[i - 1].halfwidth)
268  continue;
269  }
270  distances_.push_back(DistPointPair(dist, pts_[i].pt));
271  prev_abs_dist = abs_dist;
272  prev_dot = dot;
273  }
274 }
275 
276 // Computes all the cross product distances of the points perpendicular to
277 // the given direction, ignoring distances outside of the give distance range,
278 // storing the actual (signed) cross products in distances_.
279 void DetLineFit::ComputeConstrainedDistances(const FCOORD& direction,
280  double min_dist, double max_dist) {
281  distances_.truncate(0);
282  square_length_ = direction.sqlength();
283  // Compute the distance of each point from the line.
284  for (int i = 0; i < pts_.size(); ++i) {
285  FCOORD pt_vector = pts_[i].pt;
286  // Compute |line_vector||pt_vector|sin(angle between)
287  double dist = direction * pt_vector;
288  if (min_dist <= dist && dist <= max_dist)
289  distances_.push_back(DistPointPair(dist, pts_[i].pt));
290  }
291 }
292 
293 } // namespace tesseract.
ICOORD::set_x
void set_x(int16_t xin)
rewrite function
Definition: points.h:60
FCOORD::y
float y() const
Definition: points.h:209
ICOORD::sqlength
float sqlength() const
find sq length
Definition: points.h:72
ICOORD
integer coordinate
Definition: points.h:30
FCOORD::x
float x() const
Definition: points.h:206
detlinefit.h
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
ICOORD::x
int16_t x() const
access function
Definition: points.h:51
FCOORD
Definition: points.h:187
ICOORD::set_y
void set_y(int16_t yin)
rewrite function
Definition: points.h:64
tesseract::DetLineFit::DetLineFit
DetLineFit()
Definition: detlinefit.cpp:41
tesseract::DetLineFit::Clear
void Clear()
Definition: detlinefit.cpp:45
tesseract::kMinPointsForErrorCount
const int kMinPointsForErrorCount
Definition: detlinefit.cpp:36
statistc.h
tesseract::DetLineFit::SufficientPointsForIndependentFit
bool SufficientPointsForIndependentFit() const
Definition: detlinefit.cpp:162
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::kNumEndPoints
const int kNumEndPoints
Definition: detlinefit.cpp:30
tesseract::DetLineFit::ConstrainedFit
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
Definition: detlinefit.cpp:130
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract
Definition: baseapi.h:65
tprintf.h
tesseract::kMaxRealDistance
const int kMaxRealDistance
Definition: detlinefit.cpp:39
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::DetLineFit::Fit
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
tesseract::DetLineFit::Add
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
FCOORD::sqlength
float sqlength() const
find sq length
Definition: points.h:222
GenericVector::size
int size() const
Definition: genericvector.h:71
ICOORD::y
int16_t y() const
access_function
Definition: points.h:55