tesseract  4.0.0-1-g2a2b
segsearch.cpp
Go to the documentation of this file.
1 // File: segsearch.cpp
3 // Description: Segmentation search functions.
4 // Author: Daria Antonova
5 // Created: Mon Jun 23 11:26:43 PDT 2008
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include <cstdint> // for INT32_MAX
21 #include "blamer.h" // for BlamerBundle
22 #include "errcode.h" // for ASSERT_HOST
23 #include "genericvector.h" // for GenericVector
24 #include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi...
25 #include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry
26 #include "matrix.h" // for MATRIX_COORD, MATRIX
27 #include "pageres.h" // for WERD_RES
28 #include "params.h" // for BoolParam, IntParam, DoubleParam
29 #include "ratngs.h" // for BLOB_CHOICE_LIST, BLOB_CHOICE_IT
30 #include "strngs.h" // for STRING
31 #include "tesscallback.h" // for TessResultCallback2
32 #include "tprintf.h" // for tprintf
33 #include "wordrec.h" // for Wordrec, SegSearchPending (ptr only)
34 
35 namespace tesseract {
36 
37 void Wordrec::DoSegSearch(WERD_RES* word_res) {
38  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
39  // Run Segmentation Search.
40  SegSearch(word_res, &best_choice_bundle, nullptr);
41 }
42 
  // Wordrec::SegSearch: runs the initial segmentation search, optionally
  // improves the result by chopping, and then keeps classifying queued
  // "pain points" until SegSearchDone() reports completion or the queue is
  // exhausted. When blamer_bundle is non-null, blamer bookkeeping is
  // performed along the way.
  // NOTE(review): the opening line of this definition (listing line 43) is
  // absent from this extract. The declaration elsewhere in this file reads
  // SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
  // BlamerBundle *blamer_bundle) - restore it from upstream.
44  BestChoiceBundle* best_choice_bundle,
45  BlamerBundle* blamer_bundle) {
  // NOTE(review): listing lines 46-49 are absent here. `pain_points` is used
  // below but never declared in the visible text, so its declaration
  // presumably lived on those lines - confirm against upstream.
50  // Compute scaling factor that will help us recover blob outline length
51  // from classifier rating and certainty for the blob.
52  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  // NOTE(review): listing line 53 is absent. `pending` is used below without
  // a visible declaration; it presumably was declared here - confirm.
54  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
55  blamer_bundle);
56 
57  if (!SegSearchDone(0)) { // find a better choice
58  if (chop_enable && word_res->chopped_word != nullptr) {
59  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
60  blamer_bundle, &pain_points, &pending);
61  }
62  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
63 
  // If the best choice is still wrong at this point, record chopper blame.
64  if (blamer_bundle != nullptr &&
65  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
66  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
67  }
68  }
69  // Keep trying to find a better path by fixing the "pain points".
70 
71  MATRIX_COORD pain_point;
72  float pain_point_priority;
73  int num_futile_classifications = 0;
74  STRING blamer_debug;
  // The loop runs only when association is enabled, and continues while
  // either the search is not done or a guided blamer search is in progress.
75  while (wordrec_enable_assoc &&
76  (!SegSearchDone(num_futile_classifications) ||
77  (blamer_bundle != nullptr &&
78  blamer_bundle->GuidedSegsearchStillGoing()))) {
79  // Get the next valid "pain point".
80  bool found_nothing = true;
81  LMPainPointsType pp_type;
  // Dequeue until an entry is found that is valid for the matrix (after
  // widening its band if necessary) and is not yet classified.
82  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
83  LM_PPTYPE_NUM) {
84  if (!pain_point.Valid(*word_res->ratings)) {
85  word_res->ratings->IncreaseBandSize(
86  pain_point.row - pain_point.col + 1);
87  }
88  if (pain_point.Valid(*word_res->ratings) &&
89  !word_res->ratings->Classified(pain_point.col, pain_point.row,
90  getDict().WildcardID())) {
91  found_nothing = false;
92  break;
93  }
94  }
95  if (found_nothing) {
96  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
97  break;
98  }
99  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
  // NOTE(review): listing line 100 is absent. ProcessSegSearchPainPoint()
  // takes a `pain_point_type` string between `pain_point` and `pending`, so
  // that argument presumably sat on the missing line - confirm.
101  &pending, word_res, &pain_points, blamer_bundle);
102 
103  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
104  word_res, &pain_points, best_choice_bundle,
105  blamer_bundle);
  // A classification that did not update the best choice counts as futile.
106  if (!best_choice_bundle->updated) ++num_futile_classifications;
107 
108  if (segsearch_debug_level > 0) {
109  tprintf("num_futile_classifications %d\n", num_futile_classifications);
110  }
111 
112  best_choice_bundle->updated = false; // reset updated
113 
114  // See if it's time to terminate SegSearch or time for starting a guided
115  // search for the true path to find the blame for the incorrect best_choice.
116  if (SegSearchDone(num_futile_classifications) &&
117  blamer_bundle != nullptr &&
118  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
119  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
120  &blamer_debug);
121  }
122  } // end while loop exploring alternative paths
123  if (blamer_bundle != nullptr) {
124  blamer_bundle->FinishSegSearch(word_res->best_choice,
125  wordrec_debug_blamer, &blamer_debug);
126  }
127 
128  if (segsearch_debug_level > 0) {
129  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
130  language_model_->AcceptableChoiceFound());
131  }
132 }
133 
134 // Setup and run just the initial segsearch on an established matrix,
135 // without doing any additional chopping or joining.
136 // (Internal factored version that can be used as part of the main SegSearch.)
// Steps visible in this block: generate the initial pain points, compute the
// rating/certainty scale, set up the blamer's correct segmentation (when a
// blamer bundle is supplied), size `pending` to the matrix dimension, mark
// column 0 as classified and run UpdateSegSearchNodes() from column 0.
137 void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
  // NOTE(review): listing line 138 is absent from this extract. `pending` is
  // used below and the declaration elsewhere in this file lists
  // GenericVector<SegSearchPending> *pending as the third parameter.
139  BestChoiceBundle* best_choice_bundle,
140  BlamerBundle* blamer_bundle) {
141  if (segsearch_debug_level > 0) {
142  tprintf("Starting SegSearch on ratings matrix%s:\n",
143  wordrec_enable_assoc ? " (with assoc)" : "");
144  word_res->ratings->print(getDict().getUnicharset());
145  }
146 
147  pain_points->GenerateInitial(word_res);
148 
149  // Compute scaling factor that will help us recover blob outline length
150  // from classifier rating and certainty for the blob.
151  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
152 
  // NOTE(review): listing lines 153-154 are absent. The next visible line is
  // the tail of a call's argument list; the callee and its leading arguments
  // cannot be determined from this extract - restore from upstream.
155  segsearch_max_char_wh_ratio, rating_cert_scale);
156 
157  // Initialize blamer-related information: map character boxes recorded in
158  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
159  // ratings matrix. We expect this step to succeed, since when running the
160  // chopper we checked that the correct chops are present.
161  if (blamer_bundle != nullptr) {
162  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
  // NOTE(review): listing line 163 is absent. SetupCorrectSegmentation()
  // takes (const TWERD *word, bool debug), so the debug flag and the closing
  // of this call presumably sat on the missing line - confirm.
164  }
165 
166  // pending[col] tells whether there is update work to do to combine
167  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
168  // As the language model state is updated, pending entries are modified to
169  // minimize duplication of work. It is important that during the update the
170  // children are considered in the non-decreasing order of their column, since
171  // this guarantees that all the parents would be up to date before an update
172  // of a child is done.
173  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
174 
175  // Search the ratings matrix for the initial best path.
176  (*pending)[0].SetColumnClassified();
177  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
178  pain_points, best_choice_bundle, blamer_bundle);
179 }
180 
182  float rating_cert_scale,
183  int starting_col,
185  WERD_RES *word_res,
186  LMPainPoints *pain_points,
187  BestChoiceBundle *best_choice_bundle,
188  BlamerBundle *blamer_bundle) {
189  MATRIX *ratings = word_res->ratings;
190  ASSERT_HOST(ratings->dimension() == pending->size());
191  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
192  for (int col = starting_col; col < ratings->dimension(); ++col) {
193  if (!(*pending)[col].WorkToDo()) continue;
194  int first_row = col;
195  int last_row = std::min(ratings->dimension() - 1,
196  col + ratings->bandwidth() - 1);
197  if ((*pending)[col].SingleRow() >= 0) {
198  first_row = last_row = (*pending)[col].SingleRow();
199  }
200  if (segsearch_debug_level > 0) {
201  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
202  col, first_row, last_row,
203  (*pending)[col].IsRowJustClassified(INT32_MAX));
204  }
205  // Iterate over the pending list for this column.
206  for (int row = first_row; row <= last_row; ++row) {
207  // Update language model state of this child+parent pair.
208  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
209  LanguageModelState *parent_node =
210  col == 0 ? nullptr : best_choice_bundle->beam[col - 1];
211  if (current_node != nullptr &&
212  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
213  col, row, current_node, parent_node,
214  pain_points, word_res,
215  best_choice_bundle, blamer_bundle) &&
216  row + 1 < ratings->dimension()) {
217  // Since the language model state of this entry changed, process all
218  // the child column.
219  (*pending)[row + 1].RevisitWholeColumn();
220  if (segsearch_debug_level > 0) {
221  tprintf("Added child col=%d to pending\n", row + 1);
222  }
223  } // end if UpdateState.
224  } // end for row.
225  } // end for col.
226  if (best_choice_bundle->best_vse != nullptr) {
227  ASSERT_HOST(word_res->StatesAllValid());
228  if (best_choice_bundle->best_vse->updated) {
229  pain_points->GenerateFromPath(rating_cert_scale,
230  best_choice_bundle->best_vse, word_res);
231  if (!best_choice_bundle->fixpt.empty()) {
232  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
233  best_choice_bundle->best_vse, word_res);
234  }
235  }
236  }
237  // The segsearch is completed. Reset all updated flags on all VSEs and reset
238  // all pendings.
239  for (int col = 0; col < pending->size(); ++col) {
240  (*pending)[col].Clear();
241  ViterbiStateEntry_IT
242  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
243  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
244  vse_it.data()->updated = false;
245  }
246  }
247 }
248 
250  float pain_point_priority,
251  const MATRIX_COORD &pain_point, const char* pain_point_type,
252  GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
253  LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
254  if (segsearch_debug_level > 0) {
255  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
256  pain_point_type, pain_point_priority,
257  pain_point.col, pain_point.row);
258  }
259  ASSERT_HOST(pain_points != nullptr);
260  MATRIX *ratings = word_res->ratings;
261  // Classify blob [pain_point.col pain_point.row]
262  if (!pain_point.Valid(*ratings)) {
263  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
264  }
265  ASSERT_HOST(pain_point.Valid(*ratings));
266  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
267  pain_point.col, pain_point.row,
268  pain_point_type,
269  word_res->chopped_word,
270  blamer_bundle);
271  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
272  if (lst == nullptr) {
273  ratings->put(pain_point.col, pain_point.row, classified);
274  } else {
275  // We can not delete old BLOB_CHOICEs, since they might contain
276  // ViterbiStateEntries that are parents of other "active" entries.
277  // Thus if the matrix cell already contains classifications we add
278  // the new ones to the beginning of the list.
279  BLOB_CHOICE_IT it(lst);
280  it.add_list_before(classified);
281  delete classified; // safe to delete, since empty after add_list_before()
282  classified = nullptr;
283  }
284 
285  if (segsearch_debug_level > 0) {
286  print_ratings_list("Updated ratings matrix with a new entry:",
287  ratings->get(pain_point.col, pain_point.row),
288  getDict().getUnicharset());
289  ratings->print(getDict().getUnicharset());
290  }
291 
292  // Insert initial "pain points" to join the newly classified blob
293  // with its left and right neighbors.
294  if (classified != nullptr && !classified->empty()) {
295  if (pain_point.col > 0) {
296  pain_points->GeneratePainPoint(
297  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
298  true, segsearch_max_char_wh_ratio, word_res);
299  }
300  if (pain_point.row + 1 < ratings->dimension()) {
301  pain_points->GeneratePainPoint(
302  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
303  true, segsearch_max_char_wh_ratio, word_res);
304  }
305  }
306  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
307 }
308 
309 // Resets enough of the results so that the Viterbi search is re-run.
310 // Needed when the n-gram model is enabled, as the multi-length comparison
311 // implementation will re-value existing paths to worse values.
313  BestChoiceBundle* best_choice_bundle,
315  // TODO(rays) More refactoring required here.
316  // Delete existing viterbi states.
317  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
318  best_choice_bundle->beam[col]->Clear();
319  }
320  // Reset best_choice_bundle.
321  word_res->ClearWordChoices();
322  best_choice_bundle->best_vse = nullptr;
323  // Clear out all existing pendings and add a new one for the first column.
324  (*pending)[0].SetColumnClassified();
325  for (int i = 1; i < pending->size(); ++i)
326  (*pending)[i].Clear();
327 }
328 
330  LMPainPoints *pain_points,
331  BlamerBundle *blamer_bundle,
332  STRING *blamer_debug) {
333  pain_points->Clear(); // Clear pain points heap.
335  pain_points, &LMPainPoints::GenerateForBlamer,
336  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
337  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
338  getDict().WildcardID(), wordrec_debug_blamer,
339  blamer_debug, pp_cb);
340  delete pp_cb;
341 }
342 
343 } // namespace tesseract
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:117
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:506
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
int size() const
Definition: genericvector.h:71
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:249
double certainty_scale
Definition: dict.h:611
int segsearch_max_pain_points
Definition: wordrec.h:240
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:315
Struct to store information maintained by various language model components.
Definition: lm_state.h:195
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool Valid(const MATRIX &m) const
Definition: matrix.h:615
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:312
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:478
int bandwidth() const
Definition: matrix.h:535
static const char * PainPointDescription(LMPainPointsType type)
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:466
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54
PointerVector< LanguageModelState > beam
Definition: lm_state.h:233
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
void init_to_size(int size, const T &t)
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:329
void GenerateInitial(WERD_RES *word_res)
int dimension() const
Definition: matrix.h:533
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:229
void DoSegSearch(WERD_RES *word_res)
Definition: segsearch.cpp:37
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:181
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:217
Definition: strngs.h:45
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:412
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:41
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:235
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:137
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:227
MATRIX * ratings
Definition: pageres.h:231
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:491
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void ClearWordChoices()
Definition: pageres.cpp:1178
int segsearch_debug_level
Definition: wordrec.h:238
bool wordrec_enable_assoc
Definition: wordrec.h:199
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:462
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
virtual Dict & getDict()
Definition: classify.h:107
static void PrintSeams(const char *label, const GenericVector< SEAM *> &seams)
Definition: seam.cpp:173
Definition: matrix.h:575
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:511
bool wordrec_debug_blamer
Definition: wordrec.h:236
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
TWERD * chopped_word
Definition: pageres.h:215
bool StatesAllValid()
Definition: pageres.cpp:464
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:43
WERD_CHOICE * best_choice
Definition: pageres.h:235
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
T get(ICOORD pos) const
Definition: matrix.h:228
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)