tesseract  5.0.0-alpha-619-ge9db
blamer.h
Go to the documentation of this file.
1 // File: blamer.h
3 // Description: Module allowing precise error causes to be allocated.
4 // Author: Rike Antonova
5 // Refactored: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
21 #define TESSERACT_CCSTRUCT_BLAMER_H_
22 
23 #include <cstdint> // for int16_t
24 #include <cstring> // for memcpy
25 #include "boxword.h" // for BoxWord
26 #include <tesseract/genericvector.h> // for GenericVector
27 #ifndef DISABLED_LEGACY_ENGINE
28 #include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
29 #endif // ndef DISABLED_LEGACY_ENGINE
30 #include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only)
31 #include "rect.h" // for TBOX
32 #include <tesseract/strngs.h> // for STRING
33 #include "tprintf.h" // for tprintf
34 #include <tesseract/unichar.h> // for UNICHAR_ID
35 
36 class DENORM;
37 class MATRIX;
38 class UNICHARSET;
39 class WERD_RES;
40 
41 struct MATRIX_COORD;
42 struct TWERD;
43 
44 namespace tesseract {
45  class LMPainPoints;
46 }
47 
48 static const int16_t kBlamerBoxTolerance = 5;
49 
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
// The enumerators are plain (unscoped) and rely on implicit numbering
// starting at 0, so IRR_NUM_REASONS equals the number of real reasons.
enum IncorrectResultReason {
  // The text recorded in best choice == truth text
  IRR_CORRECT,

  // Either: Top choice is incorrect and is a dictionary word (language model
  // is unlikely to help correct such errors, so blame the classifier).
  // Or: the correct unichar was not included in shortlist produced by the
  // classifier at all.
  IRR_CLASSIFIER,

  // Chopper has not found one or more splits that correspond to the correct
  // character bounding boxes recorded in BlamerBundle::truth_word.
  IRR_CHOPPER,

  // Classifier did include correct unichars for each blob in the correct
  // segmentation, however its rating could have been too bad to allow the
  // language model to pull out the correct choice. On the other hand the
  // strength of the language model might have been too weak to favor the
  // correct answer, thus we call this case a classifier-language model
  // tradeoff error.
  IRR_CLASS_LM_TRADEOFF,

  // Page layout failed to produce the correct bounding box. Blame page layout
  // if the truth was not found for the word, which implies that the bounding
  // box of the word was incorrect (no truth word had a similar bounding box).
  IRR_PAGE_LAYOUT,

  // SegSearch heuristic prevented one or more blobs from the correct
  // segmentation state to be classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,

  // The correct segmentation state was not explored because of poor SegSearch
  // pain point prioritization. We blame SegSearch pain point prioritization
  // if the best rating of a choice constructed from correct segmentation is
  // better than that of the best choice (i.e. if we got to explore the correct
  // segmentation state, language model would have picked the correct choice).
  IRR_SEGSEARCH_PP,

  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
  // and thus use the old language model (permuters).
  // TODO(antonova): integrate the new language model with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,

  // If there is an incorrect adaptive template match with a better score than
  // a correct one (either pre-trained or adapted), mark this as adaption error.
  IRR_ADAPTION,

  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,

  // Truth is not available for this word (e.g. when words in corrected content
  // file are turned into ~~~~ because an appropriate alignment was not found).
  IRR_NO_TRUTH,

  // The text recorded in best choice != truth text, but none of the above
  // reasons are set.
  IRR_UNKNOWN,

  // Number of reasons above; not a reason itself. Must stay last.
  IRR_NUM_REASONS
};
101 
102 // Blamer-related information to determine the source of errors.
103 struct BlamerBundle {
104  static const char *IncorrectReasonName(IncorrectResultReason irr);
105  BlamerBundle() : truth_has_char_boxes_(false),
106  incorrect_result_reason_(IRR_CORRECT),
107  lattice_data_(nullptr) { ClearResults(); }
108  BlamerBundle(const BlamerBundle &other) {
109  this->CopyTruth(other);
110  this->CopyResults(other);
111  }
112  ~BlamerBundle() { delete[] lattice_data_; }
113 
114  // Accessors.
115  STRING TruthString() const {
116  STRING truth_str;
117  for (int i = 0; i < truth_text_.size(); ++i)
118  truth_str += truth_text_[i];
119  return truth_str;
120  }
122  return incorrect_result_reason_;
123  }
124  bool NoTruth() const {
125  return incorrect_result_reason_ == IRR_NO_TRUTH ||
126  incorrect_result_reason_ == IRR_PAGE_LAYOUT;
127  }
128  bool HasDebugInfo() const {
129  return debug_.length() > 0 || misadaption_debug_.length() > 0;
130  }
131  const STRING& debug() const {
132  return debug_;
133  }
134  const STRING& misadaption_debug() const {
135  return misadaption_debug_;
136  }
137  void UpdateBestRating(float rating) {
138  if (rating < best_correctly_segmented_rating_)
139  best_correctly_segmented_rating_ = rating;
140  }
142  return correct_segmentation_cols_.size();
143  }
144  // Returns true if the given ratings matrix col,row position is included
145  // in the correct segmentation path at the given index.
146  bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
147  return correct_segmentation_cols_[index] == coord.col &&
148  correct_segmentation_rows_[index] == coord.row;
149  }
151  best_choice_is_dict_and_top_choice_ = value;
152  }
153  const char* lattice_data() const {
154  return lattice_data_;
155  }
156  int lattice_size() const {
157  return lattice_size_; // size of lattice_data in bytes
158  }
159  void set_lattice_data(const char* data, int size) {
160  lattice_size_ = size;
161  delete [] lattice_data_;
162  lattice_data_ = new char[lattice_size_];
163  memcpy(lattice_data_, data, lattice_size_);
164  }
165 #ifndef DISABLED_LEGACY_ENGINE
167  return params_training_bundle_;
168  }
169  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
171  params_training_bundle_.AddHypothesis(hypo);
172  }
173 #endif // ndef DISABLED_LEGACY_ENGINE
174 
175  // Functions to setup the blamer.
176  // Whole word string, whole word bounding box.
177  void SetWordTruth(const UNICHARSET& unicharset,
178  const char* truth_str, const TBOX& word_box);
179  // Single "character" string, "character" bounding box.
180  // May be called multiple times to indicate the characters in a word.
181  void SetSymbolTruth(const UNICHARSET& unicharset,
182  const char* char_str, const TBOX& char_box);
183  // Marks that there is something wrong with the truth text, like it contains
184  // reject characters.
185  void SetRejectedTruth();
186 
187  // Returns true if the provided word_choice is correct.
188  bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
189 
190  void ClearResults() {
191  norm_truth_word_.DeleteAllBoxes();
192  norm_box_tolerance_ = 0;
193  if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
194  debug_ = "";
195  segsearch_is_looking_for_blame_ = false;
196  best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
197  correct_segmentation_cols_.clear();
198  correct_segmentation_rows_.clear();
199  best_choice_is_dict_and_top_choice_ = false;
200  delete[] lattice_data_;
201  lattice_data_ = nullptr;
202  lattice_size_ = 0;
203  }
204  void CopyTruth(const BlamerBundle &other) {
205  truth_has_char_boxes_ = other.truth_has_char_boxes_;
206  truth_word_ = other.truth_word_;
207  truth_text_ = other.truth_text_;
208  incorrect_result_reason_ =
209  (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
210  }
211  void CopyResults(const BlamerBundle &other) {
212  norm_truth_word_ = other.norm_truth_word_;
213  norm_box_tolerance_ = other.norm_box_tolerance_;
214  incorrect_result_reason_ = other.incorrect_result_reason_;
215  segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
216  best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
217  correct_segmentation_cols_ = other.correct_segmentation_cols_;
218  correct_segmentation_rows_ = other.correct_segmentation_rows_;
219  best_choice_is_dict_and_top_choice_ =
220  other.best_choice_is_dict_and_top_choice_;
221  if (other.lattice_data_ != nullptr) {
222  lattice_data_ = new char[other.lattice_size_];
223  memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
224  lattice_size_ = other.lattice_size_;
225  } else {
226  lattice_data_ = nullptr;
227  }
228  }
229  const char *IncorrectReason() const;
230 
231  // Appends choice and truth details to the given debug string.
232  void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
233  STRING *debug);
234 
235  // Sets up the norm_truth_word from truth_word using the given DENORM.
236  void SetupNormTruthWord(const DENORM& denorm);
237 
238  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
239  // bundles) where the right edge/ of the left-hand word is word1_right,
240  // and the left edge of the right-hand word is word2_left.
241  void SplitBundle(int word1_right, int word2_left, bool debug,
242  BlamerBundle* bundle1, BlamerBundle* bundle2) const;
243  // "Joins" the blames from bundle1 and bundle2 into *this.
244  void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
245  bool debug);
246 
247  // If a blob with the same bounding box as one of the truth character
248  // bounding boxes is not classified as the corresponding truth character
249  // blames character classifier for incorrect answer.
250  void BlameClassifier(const UNICHARSET& unicharset,
251  const TBOX& blob_box,
252  const BLOB_CHOICE_LIST& choices,
253  bool debug);
254 
255 
256  // Checks whether chops were made at all the character bounding box
257  // boundaries in word->truth_word. If not - blames the chopper for an
258  // incorrect answer.
259  void SetChopperBlame(const WERD_RES* word, bool debug);
260  // Blames the classifier or the language model if, after running only the
261  // chopper, best_choice is incorrect and no blame has been yet set.
262  // Blames the classifier if best_choice is classifier's top choice and is a
263  // dictionary word (i.e. language model could not have helped).
264  // Otherwise, blames the language model (formerly permuter word adjustment).
266  const WERD_RES* word,
267  const UNICHARSET& unicharset, bool valid_permuter, bool debug);
268  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
269  void SetupCorrectSegmentation(const TWERD* word, bool debug);
270 
271  // Returns true if a guided segmentation search is needed.
272  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
273  // Setup ready to guide the segmentation search to the correct segmentation.
274  void InitForSegSearch(const WERD_CHOICE* best_choice,
275  MATRIX* ratings, UNICHAR_ID wildcard_id,
276  bool debug, STRING* debug_str,
277  tesseract::LMPainPoints* pain_points,
278  double max_char_wh_ratio, WERD_RES* word_res);
279  // Returns true if the guided segsearch is in progress.
280  bool GuidedSegsearchStillGoing() const;
281  // The segmentation search has ended. Sets the blame appropriately.
282  void FinishSegSearch(const WERD_CHOICE *best_choice,
283  bool debug, STRING *debug_str);
284 
285  // If the bundle is null or still does not indicate the correct result,
286  // fix it and use some backup reason for the blame.
287  static void LastChanceBlame(bool debug, WERD_RES* word);
288 
289  // Sets the misadaption debug if this word is incorrect, as this word is
290  // being adapted to.
291  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
292 
293  private:
294  // Copy assignment operator (currently unused, therefore private).
295  BlamerBundle& operator=(const BlamerBundle& other);
296  void SetBlame(IncorrectResultReason irr, const STRING &msg,
297  const WERD_CHOICE *choice, bool debug) {
298  incorrect_result_reason_ = irr;
299  debug_ = IncorrectReason();
300  debug_ += " to blame: ";
301  FillDebugString(msg, choice, &debug_);
302  if (debug) tprintf("SetBlame(): %s", debug_.c_str());
303  }
304 
305  private:
306  // Set to true when bounding boxes for individual unichars are recorded.
307  bool truth_has_char_boxes_;
308  // The true_word (in the original image coordinate space) contains ground
309  // truth bounding boxes for this WERD_RES.
310  tesseract::BoxWord truth_word_;
311  // Same as above, but in normalized coordinates
312  // (filled in by WERD_RES::SetupForRecognition()).
313  tesseract::BoxWord norm_truth_word_;
314  // Tolerance for bounding box comparisons in normalized space.
315  int norm_box_tolerance_;
316  // Contains ground truth unichar for each of the bounding boxes in truth_word.
317  GenericVector<STRING> truth_text_;
318  // The reason for incorrect OCR result.
319  IncorrectResultReason incorrect_result_reason_;
320  // Debug text associated with the blame.
321  STRING debug_;
322  // Misadaption debug information (filled in if this word was misadapted to).
323  STRING misadaption_debug_;
324  // Variables used by the segmentation search when looking for the blame.
325  // Set to true while segmentation search is continued after the usual
326  // termination condition in order to look for the blame.
327  bool segsearch_is_looking_for_blame_;
328  // Best rating for correctly segmented path
329  // (set and used by SegSearch when looking for blame).
330  float best_correctly_segmented_rating_;
331  // Vectors populated by SegSearch to indicate column and row indices that
332  // correspond to blobs with correct bounding boxes.
333  GenericVector<int> correct_segmentation_cols_;
334  GenericVector<int> correct_segmentation_rows_;
335  // Set to true if best choice is a dictionary word and
336  // classifier's top choice.
337  bool best_choice_is_dict_and_top_choice_;
338  // Serialized segmentation search lattice.
339  char *lattice_data_;
340  int lattice_size_; // size of lattice_data in bytes
341  // Information about hypotheses (paths) explored by the segmentation search.
342 #ifndef DISABLED_LEGACY_ENGINE
343  tesseract::ParamsTrainingBundle params_training_bundle_;
344 #endif // ndef DISABLED_LEGACY_ENGINE
345 };
346 
347 
348 #endif // TESSERACT_CCSTRUCT_BLAMER_H_
BlamerBundle::BlamerBundle
BlamerBundle()
Definition: blamer.h:105
BlamerBundle::~BlamerBundle
~BlamerBundle()
Definition: blamer.h:112
IRR_SEGSEARCH_PP
Definition: blamer.h:82
strngs.h
BlamerBundle::BlameClassifierOrLangModel
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:375
IRR_NO_TRUTH_SPLIT
Definition: blamer.h:91
BlamerBundle::ChoiceIsCorrect
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:117
BlamerBundle::TruthString
STRING TruthString() const
Definition: blamer.h:115
BlamerBundle::correct_segmentation_length
int correct_segmentation_length() const
Definition: blamer.h:141
BlamerBundle::SetSymbolTruth
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
Definition: blamer.cpp:92
BlamerBundle::BlameClassifier
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:263
WERD_CHOICE
Definition: ratngs.h:261
TWERD
Definition: blobs.h:416
IRR_UNKNOWN
Definition: blamer.h:97
BlamerBundle::SetupNormTruthWord
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151
BlamerBundle::lattice_data
const char * lattice_data() const
Definition: blamer.h:153
params_training_featdef.h
MATRIX
Definition: matrix.h:574
tesseract::ParamsTrainingBundle
Definition: params_training_featdef.h:132
STRING
Definition: strngs.h:45
BlamerBundle::misadaption_debug
const STRING & misadaption_debug() const
Definition: blamer.h:134
WERD_RES
Definition: pageres.h:160
IncorrectResultReason
IncorrectResultReason
Definition: blamer.h:52
BlamerBundle::SetChopperBlame
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:316
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
rect.h
BlamerBundle::BlamerBundle
BlamerBundle(const BlamerBundle &other)
Definition: blamer.h:108
BlamerBundle::JoinBlames
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:231
BlamerBundle::MatrixPositionCorrect
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:146
BlamerBundle::FillDebugString
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
Definition: blamer.cpp:129
IRR_CHOPPER
Definition: blamer.h:62
tesseract::ParamsTrainingBundle::AddHypothesis
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
Definition: params_training_featdef.h:142
ratngs.h
BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:204
BlamerBundle::SetWordTruth
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
Definition: blamer.cpp:72
BlamerBundle::IncorrectReasonName
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:62
genericvector.h
BlamerBundle::UpdateBestRating
void UpdateBestRating(float rating)
Definition: blamer.h:137
IRR_CLASS_LM_TRADEOFF
Definition: blamer.h:69
IRR_ADAPTION
Definition: blamer.h:89
IRR_CLASSIFIER
Definition: blamer.h:59
BlamerBundle::HasDebugInfo
bool HasDebugInfo() const
Definition: blamer.h:128
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
BlamerBundle::set_lattice_data
void set_lattice_data(const char *data, int size)
Definition: blamer.h:159
BlamerBundle::GuidedSegsearchStillGoing
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:512
tesseract::ParamsTrainingHypothesis
Definition: params_training_featdef.h:106
BlamerBundle::InitForSegSearch
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, tesseract::LMPainPoints *pain_points, double max_char_wh_ratio, WERD_RES *word_res)
Definition: blamer.cpp:478
UNICHARSET
Definition: unicharset.h:145
BlamerBundle::ClearResults
void ClearResults()
Definition: blamer.h:190
tesseract::LMPainPoints
Definition: lm_pain_points.h:56
IRR_CORRECT
Definition: blamer.h:54
tesseract
Definition: baseapi.h:65
IRR_NO_TRUTH
Definition: blamer.h:94
BlamerBundle::set_best_choice_is_dict_and_top_choice
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:150
tprintf.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector< STRING >
IRR_NUM_REASONS
Definition: blamer.h:99
IRR_PAGE_LAYOUT
Definition: blamer.h:73
BlamerBundle::SetMisAdaptionDebug
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:585
STRING::length
int32_t length() const
Definition: strngs.cpp:187
BlamerBundle::debug
const STRING & debug() const
Definition: blamer.h:131
MATRIX_COORD
Definition: matrix.h:604
unichar.h
MATRIX_COORD::col
int col
Definition: matrix.h:632
BlamerBundle::IncorrectReason
const char * IncorrectReason() const
Definition: blamer.cpp:66
GenericVector::clear
void clear()
Definition: genericvector.h:857
BlamerBundle::lattice_size
int lattice_size() const
Definition: blamer.h:156
BlamerBundle::params_training_bundle
const tesseract::ParamsTrainingBundle & params_training_bundle() const
Definition: blamer.h:166
BlamerBundle::CopyResults
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:211
tesseract::BoxWord::DeleteAllBoxes
void DeleteAllBoxes()
Definition: boxword.cpp:174
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
BlamerBundle::SetRejectedTruth
void SetRejectedTruth()
Definition: blamer.cpp:111
BlamerBundle::NoTruth
bool NoTruth() const
Definition: blamer.h:124
MATRIX_COORD::row
int row
Definition: matrix.h:633
BlamerBundle::AddHypothesis
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:170
IRR_CLASS_OLD_LM_TRADEOFF
Definition: blamer.h:86
BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:121
BlamerBundle::SplitBundle
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:175
BlamerBundle
Definition: blamer.h:103
BlamerBundle::GuidedSegsearchNeeded
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:469
GenericVector::size
int size() const
Definition: genericvector.h:71
boxword.h
BlamerBundle::FinishSegSearch
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:517
IRR_SEGSEARCH_HEUR
Definition: blamer.h:76
BlamerBundle::SetupCorrectSegmentation
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:413
TBOX
Definition: rect.h:33
DENORM
Definition: normalis.h:49
tesseract::BoxWord
Definition: boxword.h:36
BlamerBundle::LastChanceBlame
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:558