All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
errorcounter.h
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
17 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
18 
19 #include "genericvector.h"
20 #include "matrix.h"
21 #include "statistc.h"
22 
23 struct Pix;
24 template <typename T> class UnicityTable;
25 
26 namespace tesseract {
27 
28 struct FontInfo;
29 class FontInfoTable;
30 class SampleIterator;
31 class ShapeClassifier;
32 class TrainingSample;
33 struct UnicharRating;
34 
35 // Enumeration of the different types of error count.
36 // Error counts work as follows:
37 //
38 // Ground truth is a valid unichar-id / font-id pair:
39 //        Number of classifier answers?
40 //          0                       >0
41 //     CT_REJECT          unichar-id matches top shape?
42 //     __________             yes!                      no
43 //                   CT_UNICHAR_TOP_OK           CT_UNICHAR_TOP1_ERR
44 //    Top shape-id has multiple unichars?        2nd shape unichar id matches?
45 //       yes!              no                       yes!             no
46 //   CT_OK_MULTI_UNICHAR   |                     _____    CT_UNICHAR_TOP2_ERR
47 //             Font attributes match?                 Any unichar-id matches?
48 //              yes!             no                  yes!        no
49 //      CT_FONT_ATTR_OK   CT_FONT_ATTR_ERR          ______  CT_UNICHAR_TOPN_ERR
50 //                |       __________________                 _________________
51 //      Top shape-id has multiple font attrs?
52 //            yes!         no
53 //      CT_OK_MULTI_FONT
54 //      _____________________________
55 //
56 // Note that multiple counts may be activated for a single sample!
57 //
58 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
59 // This is called junk and is expected to be rejected:
60 // Number of classifier answers?
61 // 0 >0
62 // CT_REJECTED_JUNK CT_ACCEPTED_JUNK
63 //
64 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
65 // the mean rank of the correct result, counting from 0, and with an error
66 // receiving the number of answers as the correct rank.
67 //
68 // Keep in sync with the ReportString function.
enum CountTypes {
  // Top shape contains correct unichar id.
  CT_UNICHAR_TOP_OK = 0,

  // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
  // kRatingEpsilon from the first result in each group. The real top choice
  // is measured using TOPTOP.

  // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP1_ERR,
  // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,
  // No output shape contains correct unichar id.
  CT_UNICHAR_TOPN_ERR,
  // Very top choice not correct.
  CT_UNICHAR_TOPTOP_ERR,
  // Top shape id has correct unichar id, and others.
  CT_OK_MULTI_UNICHAR,
  // Top shape id is correct but marked joined.
  CT_OK_JOINED,
  // Top shape id is correct but marked broken.
  CT_OK_BROKEN,
  // Classifier hates this.
  CT_REJECT,
  // Top unichar OK, but font attributes incorrect.
  CT_FONT_ATTR_ERR,
  // CT_FONT_ATTR_OK but there are multiple font attrs.
  CT_OK_MULTI_FONT,
  // Number of answers produced.
  CT_NUM_RESULTS,
  // Rank of correct answer.
  CT_RANK,
  // Junk that was correctly rejected.
  CT_REJECTED_JUNK,
  // Junk that was incorrectly classified otherwise.
  CT_ACCEPTED_JUNK,

  // Number of types for array sizing. Must remain the last enumerator.
  CT_SIZE
};
91 
92 // Class to encapsulate all the functionality and sub-structures required
93 // to count errors for an isolated character classifier (ShapeClassifier).
94 class ErrorCounter {
95  public:
96  // Computes and returns the unweighted boosting_mode error rate of the given
97  // classifier. Can be used for testing, or inside an iterative training
98  // system, including one that uses boosting.
99  // report_levels:
100  // 0 = no output.
101  // 1 = bottom-line error rate.
102  // 2 = bottom-line error rate + time.
103  // 3 = font-level error rate + time.
104  // 4 = list of all errors + short classifier debug output on 16 errors.
105  // 5 = list of all errors + short classifier debug output on 25 errors.
106  // * The boosting_mode determines which error type is used for computing the
107  // scaled_error output, and setting the is_error flag in the samples.
108  // * The fontinfo_table is used to get string font names for the debug
109  // output, and also to count font attributes errors.
110  // * The page_images vector may contain a Pix* (which may be NULL) for each
111  // page index assigned to the samples.
112  // * The it provides encapsulated iteration over some sample set.
113  // * The outputs unichar_error, scaled_error and totals_report are all
114  // optional.
115  // * If not NULL, unichar error gets the top1 unichar error rate.
116  // * Scaled_error gets the error chosen by boosting_mode weighted by the
117  // weights on the samples.
118  // * Fonts_report gets a string summarizing the error rates for each font in
119  // both human-readable form and as a tab-separated list of error counts.
120  // The human-readable form is all before the first tab.
121  // * The return value is the un-weighted version of the scaled_error.
122  static double ComputeErrorRate(ShapeClassifier* classifier,
123  int report_level, CountTypes boosting_mode,
124  const FontInfoTable& fontinfo_table,
125  const GenericVector<Pix*>& page_images,
126  SampleIterator* it,
127  double* unichar_error,
128  double* scaled_error,
129  STRING* fonts_report);
130  // Tests a pair of classifiers, debugging errors of the new against the old.
131  // See errorcounter.h for description of arguments.
132  // Iterates over the samples, calling the classifiers in normal/silent mode.
133  // If the new_classifier makes a boosting_mode error that the old_classifier
134  // does not, and the appropriate, it will then call the new_classifier again
135  // with a debug flag and a keep_this argument to find out what is going on.
136  static void DebugNewErrors(ShapeClassifier* new_classifier,
137  ShapeClassifier* old_classifier,
138  CountTypes boosting_mode,
139  const FontInfoTable& fontinfo_table,
140  const GenericVector<Pix*>& page_images,
141  SampleIterator* it);
142 
143  private:
144  // Simple struct to hold an array of counts.
145  struct Counts {
146  Counts();
147  // Adds other into this for computing totals.
148  void operator+=(const Counts& other);
149 
150  int n[CT_SIZE];
151  };
152 
153  // Constructor is private. Only anticipated use of ErrorCounter is via
154  // the static ComputeErrorRate.
155  ErrorCounter(const UNICHARSET& unicharset, int fontsize);
156  ~ErrorCounter();
157 
158  // Accumulates the errors from the classifier results on a single sample.
159  // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
160  // boosting_mode selects the type of error to be used for boosting and the
161  // is_error_ member of sample is set according to whether the required type
162  // of error occurred. The font_table provides access to font properties
163  // for error counting and shape_table is used to understand the relationship
164  // between unichar_ids and shape_ids in the results
165  bool AccumulateErrors(bool debug, CountTypes boosting_mode,
166  const FontInfoTable& font_table,
167  const GenericVector<UnicharRating>& results,
169 
170  // Accumulates counts for junk. Counts only whether the junk was correctly
171  // rejected or not.
172  bool AccumulateJunk(bool debug, const GenericVector<UnicharRating>& results,
173  TrainingSample* sample);
174 
175  // Creates a report of the error rate. The report_level controls the detail
176  // that is reported to stderr via tprintf:
177  // 0 -> no output.
178  // >=1 -> bottom-line error rate.
179  // >=3 -> font-level error rate.
180  // boosting_mode determines the return value. It selects which (un-weighted)
181  // error rate to return.
182  // The fontinfo_table from MasterTrainer provides the names of fonts.
183  // The it determines the current subset of the training samples.
184  // If not NULL, the top-choice unichar error rate is saved in unichar_error.
185  // If not NULL, the report string is saved in fonts_report.
186  // (Ignoring report_level).
187  double ReportErrors(int report_level, CountTypes boosting_mode,
188  const FontInfoTable& fontinfo_table,
189  const SampleIterator& it,
190  double* unichar_error,
191  STRING* fonts_report);
192 
193  // Sets the report string to a combined human and machine-readable report
194  // string of the error rates.
195  // Returns false if there is no data, leaving report unchanged, unless
196  // even_if_empty is true.
197  static bool ReportString(bool even_if_empty, const Counts& counts,
198  STRING* report);
199 
200  // Computes the error rates and returns in rates which is an array of size
201  // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
202  static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);
203 
204 
205  // Total scaled error used by boosting algorithms.
206  double scaled_error_;
207  // Difference in result rating to be thought of as an "equal" choice.
208  double rating_epsilon_;
209  // Vector indexed by font_id from the samples of error accumulators.
210  GenericVector<Counts> font_counts_;
211  // Counts of the results that map each unichar_id (from samples) to an
212  // incorrect shape_id.
213  GENERIC_2D_ARRAY<int> unichar_counts_;
214  // Count of the number of times each shape_id occurs, is correct, and multi-
215  // unichar.
216  GenericVector<int> multi_unichar_counts_;
217  // Histogram of scores (as percent) for correct answers.
218  STATS ok_score_hist_;
219  // Histogram of scores (as percent) for incorrect answers.
220  STATS bad_score_hist_;
221  // Unicharset for printing character ids in results.
222  const UNICHARSET& unicharset_;
223 };
224 
225 } // namespace tesseract.
226 
227 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */
Definition: statistc.h:33
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
Definition: cluster.h:32
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)
Definition: strngs.h:44
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
Definition: ipoints.h:86