tesseract  4.0.0-1-g2a2b
chopper.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: chopper.cpp (Formerly chopper.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  **************************************************************************/
25 
26 /*----------------------------------------------------------------------
27  I n c l u d e s
28 ----------------------------------------------------------------------*/
29 
30 #include "chopper.h"
31 #include "blamer.h" // for BlamerBundle, IRR_CORRECT
32 #include "blobs.h" // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob
33 #include "callcpp.h" // for Red
34 #include "dict.h" // for Dict
35 #include "host.h" // for FALSE, TRUE
36 #include "lm_pain_points.h" // for LMPainPoints
37 #include "lm_state.h" // for BestChoiceBundle
38 #include "matrix.h" // for MATRIX
39 #include "normalis.h" // for DENORM
40 #include "pageres.h" // for WERD_RES
41 #include "params.h" // for IntParam, BoolParam
42 #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ...
43 #include "rect.h" // for TBOX
44 #include "render.h" // for display_blob
45 #include "seam.h" // for SEAM
46 #include "split.h" // for remove_edgept
47 #include "stopper.h" // for DANGERR
48 #include "tprintf.h" // for tprintf
49 #include "wordrec.h" // for Wordrec, SegSearchPending (ptr only)
50 
51 class CHAR_FRAGMENT;
52 
53 template <typename T> class GenericVector;
54 
55 // Include automatically generated configuration file if running autoconf.
56 #ifdef HAVE_CONFIG_H
57 #include "config_auto.h"
58 #endif
59 
60 // Even though the limit on the number of chunks may now be removed, keep
61 // the same limit for repeatable behavior, and it may be a speed advantage.
62 static const int kMaxNumChunks = 64;
63 
64 /*----------------------------------------------------------------------
65  F u n c t i o n s
66 ----------------------------------------------------------------------*/
72 void preserve_outline(EDGEPT *start) {
73  EDGEPT *srcpt;
74 
75  if (start == nullptr)
76  return;
77  srcpt = start;
78  do {
79  srcpt->flags[1] = 1;
80  srcpt = srcpt->next;
81  }
82  while (srcpt != start);
83  srcpt->flags[1] = 2;
84 }
85 
86 
87 /**************************************************************************/
89  TESSLINE *outline;
90 
91  for (outline = srcline; outline != nullptr; outline = outline->next) {
92  preserve_outline (outline->loop);
93  }
94 }
95 
96 
103  EDGEPT *srcpt;
104  EDGEPT *real_start;
105 
106  if (start == nullptr)
107  return nullptr;
108  srcpt = start;
109  do {
110  if (srcpt->flags[1] == 2)
111  break;
112  srcpt = srcpt->next;
113  }
114  while (srcpt != start);
115  real_start = srcpt;
116  do {
117  srcpt = srcpt->next;
118  if (srcpt->prev->flags[1] == 0) {
119  remove_edgept(srcpt->prev);
120  }
121  }
122  while (srcpt != real_start);
123  return real_start;
124 }
125 
126 
127 /******************************************************************************/
129  TESSLINE *outline;
130 
131  for (outline = srcline; outline != nullptr; outline = outline->next) {
132  outline->loop = restore_outline (outline->loop);
133  outline->start = outline->loop->pos;
134  }
135 }
136 
137 // Helper runs all the checks on a seam to make sure it is valid.
138 // Returns the seam if OK, otherwise deletes the seam and returns nullptr.
139 static SEAM* CheckSeam(int debug_level, int32_t blob_number, TWERD* word,
140  TBLOB* blob, TBLOB* other_blob,
141  const GenericVector<SEAM*>& seams, SEAM* seam) {
142  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
143  total_containment(blob, other_blob) || check_blob(other_blob) ||
144  !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
145  any_shared_split_points(seams, seam) ||
146  !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
147  word->blobs.remove(blob_number + 1);
148  if (seam) {
149  seam->UndoSeam(blob, other_blob);
150  delete seam;
151  seam = nullptr;
152 #ifndef GRAPHICS_DISABLED
153  if (debug_level) {
154  if (debug_level >2)
155  display_blob(blob, Red);
156  tprintf("\n** seam being removed ** \n");
157  }
158 #endif
159  } else {
160  delete other_blob;
161  }
162  return nullptr;
163  }
164  return seam;
165 }
166 
167 
174 namespace tesseract {
175 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
176  bool italic_blob,
177  const GenericVector<SEAM*>& seams) {
180  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
181  // Insert it into the word.
182  word->blobs.insert(other_blob, blob_number + 1);
183 
184  SEAM *seam = nullptr;
185  if (prioritize_division) {
186  TPOINT location;
187  if (divisible_blob(blob, italic_blob, &location)) {
188  seam = new SEAM(0.0f, location);
189  }
190  }
191  if (seam == nullptr)
192  seam = pick_good_seam(blob);
193  if (chop_debug) {
194  if (seam != nullptr)
195  seam->Print("Good seam picked=");
196  else
197  tprintf("\n** no seam picked *** \n");
198  }
199  if (seam) {
200  seam->ApplySeam(italic_blob, blob, other_blob);
201  }
202 
203  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
204  seams, seam);
205  if (seam == nullptr) {
209  // If the blob can simply be divided into outlines, then do that.
210  TPOINT location;
211  if (divisible_blob(blob, italic_blob, &location)) {
212  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
213  word->blobs.insert(other_blob, blob_number + 1);
214  seam = new SEAM(0.0f, location);
215  seam->ApplySeam(italic_blob, blob, other_blob);
216  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
217  seams, seam);
218  }
219  }
220  }
221  if (seam != nullptr) {
222  // Make sure this seam doesn't get chopped again.
223  seam->Finalize();
224  }
225  return seam;
226 }
227 
228 
229 SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number,
230  bool italic_blob,
231  const GenericVector<SEAM*>& seams) {
232  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
233  italic_blob, seams);
234 }
235 
236 
238  bool italic_blob, WERD_RES *word_res,
239  int *blob_number) {
240  TWERD *word = word_res->chopped_word;
241  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
242  TBLOB *blob = word->blobs[*blob_number];
243  TPOINT topleft, botright;
244  topleft.x = blob->bounding_box().left();
245  topleft.y = blob->bounding_box().top();
246  botright.x = blob->bounding_box().right();
247  botright.y = blob->bounding_box().bottom();
248 
249  TPOINT original_topleft, original_botright;
250  word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
251  word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
252 
253  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
254  original_botright.x, original_topleft.y);
255 
256  bool almost_equal_box = false;
257  int num_overlap = 0;
258  for (int i = 0; i < boxes.size(); i++) {
259  if (original_box.overlap_fraction(boxes[i]) > 0.125)
260  num_overlap++;
261  if (original_box.almost_equal(boxes[i], 3))
262  almost_equal_box = true;
263  }
264 
265  TPOINT location;
266  if (divisible_blob(blob, italic_blob, &location) ||
267  (!almost_equal_box && num_overlap > 1)) {
268  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
269  italic_blob, word_res->seam_array);
270  if (seam != nullptr)
271  return seam;
272  }
273  }
274 
275  *blob_number = -1;
276  return nullptr;
277 }
278 
279 } // namespace tesseract
280 
281 
288  int length;
289  int index;
290 
291  length = seams.size();
292  for (index = 0; index < length; index++)
293  if (seam->SharesPosition(*seams[index])) return TRUE;
294  return FALSE;
295 }
296 
297 
303 int check_blob(TBLOB *blob) {
304  TESSLINE *outline;
305  EDGEPT *edgept;
306 
307  for (outline = blob->outlines; outline != nullptr; outline = outline->next) {
308  edgept = outline->loop;
309  do {
310  if (edgept == nullptr)
311  break;
312  edgept = edgept->next;
313  }
314  while (edgept != outline->loop);
315  if (edgept == nullptr)
316  return 1;
317  }
318  return 0;
319 }
320 
321 
322 namespace tesseract {
336  DANGERR *fixpt,
337  bool split_next_to_fragment,
338  bool italic_blob,
339  WERD_RES* word,
340  int* blob_number) {
341  float rating_ceiling = FLT_MAX;
342  SEAM *seam = nullptr;
343  do {
344  *blob_number = select_blob_to_split_from_fixpt(fixpt);
345  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
346  bool split_point_from_dict = (*blob_number != -1);
347  if (split_point_from_dict) {
348  fixpt->clear();
349  } else {
350  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
351  split_next_to_fragment);
352  }
353  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
354  if (*blob_number == -1)
355  return nullptr;
356 
357  // TODO(rays) it may eventually help to allow italic_blob to be true,
358  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
359  word->seam_array);
360  if (seam != nullptr)
361  return seam; // Success!
362  if (blob_choices[*blob_number] == nullptr)
363  return nullptr;
364  if (!split_point_from_dict) {
365  // We chopped the worst rated blob, try something else next time.
366  rating_ceiling = blob_choices[*blob_number]->rating();
367  }
368  } while (true);
369  return seam;
370 }
371 
380  const GenericVector<BLOB_CHOICE*>& blob_choices,
381  WERD_RES* word_res,
382  int* blob_number) {
383  if (prioritize_division) {
384  return chop_overlapping_blob(boxes, true, word_res, blob_number);
385  } else {
386  return improve_one_blob(blob_choices, nullptr, false, true, word_res,
387  blob_number);
388  }
389 }
390 
400  int num_blobs = word->chopped_word->NumBlobs();
401  if (word->ratings == nullptr) {
402  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
403  }
404  if (word->ratings->get(0, 0) == nullptr) {
405  // Run initial classification.
406  for (int b = 0; b < num_blobs; ++b) {
407  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
408  "Initial:", word->chopped_word,
409  word->blamer_bundle);
410  word->ratings->put(b, b, choices);
411  }
412  } else {
413  // Blobs have been pre-classified. Set matrix cell for all blob choices
414  for (int col = 0; col < word->ratings->dimension(); ++col) {
415  for (int row = col; row < word->ratings->dimension() &&
416  row < col + word->ratings->bandwidth(); ++row) {
417  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
418  if (choices != nullptr) {
419  BLOB_CHOICE_IT bc_it(choices);
420  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
421  bc_it.data()->set_matrix_cell(col, row);
422  }
423  }
424  }
425  }
426  }
427 
428  // Run Segmentation Search.
429  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
430  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
431 
432  if (word->best_choice == nullptr) {
433  // SegSearch found no valid paths, so just use the leading diagonal.
435  }
436  word->RebuildBestState();
437  // If we finished without a hyphen at the end of the word, let the next word
438  // be found in the dictionary.
439  if (word->word->flag(W_EOL) &&
440  !getDict().has_hyphen_end(*word->best_choice)) {
441  getDict().reset_hyphen_vars(true);
442  }
443 
444  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
445  CallFillLattice(*word->ratings, word->best_choices,
446  *word->uch_set, word->blamer_bundle);
447  }
448  if (wordrec_debug_level > 0) {
449  tprintf("Final Ratings Matrix:\n");
450  word->ratings->print(getDict().getUnicharset());
451  }
452  word->FilterWordChoices(getDict().stopper_debug_level);
453 }
454 
462 void Wordrec::improve_by_chopping(float rating_cert_scale,
463  WERD_RES* word,
464  BestChoiceBundle* best_choice_bundle,
465  BlamerBundle* blamer_bundle,
466  LMPainPoints* pain_points,
468  int blob_number;
469  do { // improvement loop.
470  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
471  // one to chop.
472  GenericVector<BLOB_CHOICE*> blob_choices;
473  int num_blobs = word->ratings->dimension();
474  for (int i = 0; i < num_blobs; ++i) {
475  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
476  if (choices == nullptr || choices->empty()) {
477  blob_choices.push_back(nullptr);
478  } else {
479  BLOB_CHOICE_IT bc_it(choices);
480  blob_choices.push_back(bc_it.data());
481  }
482  }
483  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
484  false, false, word, &blob_number);
485  if (seam == nullptr) break;
486  // A chop has been made. We have to correct all the data structures to
487  // take into account the extra bottom-level blob.
488  // Put the seam into the seam_array and correct everything else on the
489  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
490  // states in WERD_CHOICEs, and blob widths.
491  word->InsertSeam(blob_number, seam);
492  // Insert a new entry in the beam array.
493  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
494  // Fixpts are outdated, but will get recalculated.
495  best_choice_bundle->fixpt.clear();
496  // Remap existing pain points.
497  pain_points->RemapForSplit(blob_number);
498  // Insert a new pending at the chop point.
499  pending->insert(SegSearchPending(), blob_number);
500 
501  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
502  // as that updates the pending correctly and adds new pain points.
503  MATRIX_COORD pain_point(blob_number, blob_number);
504  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
505  pain_points, blamer_bundle);
506  pain_point.col = blob_number + 1;
507  pain_point.row = blob_number + 1;
508  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
509  pain_points, blamer_bundle);
510  if (language_model_->language_model_ngram_on) {
511  // N-gram evaluation depends on the number of blobs in a chunk, so we
512  // have to re-evaluate everything in the word.
513  ResetNGramSearch(word, best_choice_bundle, pending);
514  blob_number = 0;
515  }
516  // Run language model incrementally. (Except with the n-gram model on.)
517  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
518  word, pain_points, best_choice_bundle, blamer_bundle);
519  } while (!language_model_->AcceptableChoiceFound() &&
520  word->ratings->dimension() < kMaxNumChunks);
521 
522  // If after running only the chopper best_choice is incorrect and no blame
523  // has been yet set, blame the classifier if best_choice is classifier's
524  // top choice and is a dictionary word (i.e. language model could not have
525  // helped). Otherwise blame the tradeoff between the classifier and
526  // the old language model (permuters).
527  if (word->blamer_bundle != nullptr &&
529  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
530  bool valid_permuter = word->best_choice != nullptr &&
533  getDict().getUnicharset(),
534  valid_permuter,
536  }
537 }
538 
539 
540 /**********************************************************************
541  * select_blob_to_split
542  *
543  * These are the results of the last classification. Find a likely
544  * place to apply splits. If none, return -1.
545  **********************************************************************/
547  const GenericVector<BLOB_CHOICE*>& blob_choices,
548  float rating_ceiling, bool split_next_to_fragment) {
549  BLOB_CHOICE *blob_choice;
550  int x;
551  float worst = -FLT_MAX;
552  int worst_index = -1;
553  float worst_near_fragment = -FLT_MAX;
554  int worst_index_near_fragment = -1;
555  const CHAR_FRAGMENT **fragments = nullptr;
556 
557  if (chop_debug) {
558  if (rating_ceiling < FLT_MAX)
559  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
560  else
561  tprintf("rating_ceiling = No Limit\n");
562  }
563 
564  if (split_next_to_fragment && blob_choices.size() > 0) {
565  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
566  if (blob_choices[0] != nullptr) {
567  fragments[0] = getDict().getUnicharset().get_fragment(
568  blob_choices[0]->unichar_id());
569  } else {
570  fragments[0] = nullptr;
571  }
572  }
573 
574  for (x = 0; x < blob_choices.size(); ++x) {
575  if (blob_choices[x] == nullptr) {
576  delete[] fragments;
577  return x;
578  } else {
579  blob_choice = blob_choices[x];
580  // Populate fragments for the following position.
581  if (split_next_to_fragment && x+1 < blob_choices.size()) {
582  if (blob_choices[x + 1] != nullptr) {
583  fragments[x + 1] = getDict().getUnicharset().get_fragment(
584  blob_choices[x + 1]->unichar_id());
585  } else {
586  fragments[x + 1] = nullptr;
587  }
588  }
589  if (blob_choice->rating() < rating_ceiling &&
590  blob_choice->certainty() < tessedit_certainty_threshold) {
591  // Update worst and worst_index.
592  if (blob_choice->rating() > worst) {
593  worst_index = x;
594  worst = blob_choice->rating();
595  }
596  if (split_next_to_fragment) {
597  // Update worst_near_fragment and worst_index_near_fragment.
598  bool expand_following_fragment =
599  (x + 1 < blob_choices.size() &&
600  fragments[x+1] != nullptr && !fragments[x+1]->is_beginning());
601  bool expand_preceding_fragment =
602  (x > 0 && fragments[x-1] != nullptr && !fragments[x-1]->is_ending());
603  if ((expand_following_fragment || expand_preceding_fragment) &&
604  blob_choice->rating() > worst_near_fragment) {
605  worst_index_near_fragment = x;
606  worst_near_fragment = blob_choice->rating();
607  if (chop_debug) {
608  tprintf("worst_index_near_fragment=%d"
609  " expand_following_fragment=%d"
610  " expand_preceding_fragment=%d\n",
611  worst_index_near_fragment,
612  expand_following_fragment,
613  expand_preceding_fragment);
614  }
615  }
616  }
617  }
618  }
619  }
620  delete[] fragments;
621  // TODO(daria): maybe a threshold of badness for
622  // worst_near_fragment would be useful.
623  return worst_index_near_fragment != -1 ?
624  worst_index_near_fragment : worst_index;
625 }
626 
627 /**********************************************************************
628  * select_blob_to_split_from_fixpt
629  *
630  * Given the fix point from a dictionary search, if there is a single
631  * dangerous blob that maps to multiple characters, return that blob
632  * index as a place we need to split. If none, return -1.
633  **********************************************************************/
635  if (!fixpt)
636  return -1;
637  for (int i = 0; i < fixpt->size(); i++) {
638  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
639  (*fixpt)[i].dangerous &&
640  (*fixpt)[i].correct_is_ngram) {
641  return (*fixpt)[i].begin;
642  }
643  }
644  return -1;
645 }
646 
647 
648 } // namespace tesseract
649 
650 
651 /**********************************************************************
652  * total_containment
653  *
654  * Check to see if one of these outlines is totally contained within
655  * the bounding box of the other.
656  **********************************************************************/
657 int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
658  TBOX box1 = blob1->bounding_box();
659  TBOX box2 = blob2->bounding_box();
660  return box1.contains(box2) || box2.contains(box1);
661 }
int repair_unchopped_blobs
Definition: wordrec.h:206
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:117
bool is_beginning() const
Definition: unicharset.h:106
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
bool allow_blob_division
Definition: classify.h:423
TESSLINE * next
Definition: blobs.h:265
float certainty() const
Definition: ratngs.h:83
int size() const
Definition: genericvector.h:71
#define TRUE
Definition: capi.h:51
Definition: blobs.h:402
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:904
TPOINT pos
Definition: blobs.h:170
int check_blob(TBLOB *blob)
Definition: chopper.cpp:303
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:249
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
int wordrec_max_join_chunks
Definition: wordrec.h:233
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
uint8_t permuter() const
Definition: ratngs.h:346
Definition: seam.h:44
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:424
void remove(int index)
int wordrec_debug_level
Definition: wordrec.h:231
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:224
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
Definition: werd.h:35
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:264
Struct to store information maintained by various language model components.
Definition: lm_state.h:195
void RemapForSplit(int index)
void remove_edgept(EDGEPT *point)
Definition: split.cpp:206
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:335
int any_shared_split_points(const GenericVector< SEAM *> &seams, SEAM *seam)
Definition: chopper.cpp:287
void preserve_outline(EDGEPT *start)
Definition: chopper.cpp:72
TPOINT start
Definition: blobs.h:262
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:923
bool prioritize_division
Definition: classify.h:428
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:312
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
int bandwidth() const
Definition: matrix.h:535
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
bool PrepareToInsertSeam(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int insert_index, bool modify)
Definition: seam.cpp:82
PointerVector< LanguageModelState > beam
Definition: lm_state.h:233
void Print(const char *label) const
Definition: seam.cpp:160
Definition: callcpp.h:32
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:519
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:374
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void insert(const T &t, int index)
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:390
int16_t top() const
Definition: rect.h:58
DENORM denorm
Definition: pageres.h:204
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:634
void Finalize()
Definition: seam.h:116
EDGEPT * loop
Definition: blobs.h:264
#define FALSE
Definition: capi.h:52
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:399
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
bool SharesPosition(const SEAM &other) const
Definition: seam.h:95
int length() const
Definition: genericvector.h:85
int dimension() const
Definition: matrix.h:533
Definition: blobs.h:83
EDGEPT * prev
Definition: blobs.h:177
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:61
TBOX bounding_box() const
Definition: blobs.cpp:478
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:88
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:229
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:181
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:217
double tessedit_certainty_threshold
Definition: wordrec.h:207
bool is_ending() const
Definition: unicharset.h:109
int push_back(T object)
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
int16_t x
Definition: blobs.h:78
EDGEPT * restore_outline(EDGEPT *start)
Definition: chopper.cpp:102
float rating() const
Definition: ratngs.h:80
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:345
bool contains(const FCOORD pt) const
Definition: rect.h:333
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET * uch_set
Definition: pageres.h:206
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:237
BlamerBundle * blamer_bundle
Definition: pageres.h:246
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:128
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
bool ContainedByBlob(const TBLOB &blob) const
Definition: seam.h:79
char flags[EDGEPTFLAGS]
Definition: blobs.h:175
int16_t right() const
Definition: rect.h:79
int select_blob_to_split(const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:546
void UndoSeam(TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:140
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:462
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
void RebuildBestState()
Definition: pageres.cpp:814
virtual Dict & getDict()
Definition: classify.h:107
Definition: matrix.h:575
Definition: blobs.h:268
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:175
bool wordrec_debug_blamer
Definition: wordrec.h:236
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
TWERD * chopped_word
Definition: pageres.h:215
Definition: blobs.h:57
int16_t y
Definition: blobs.h:79
int16_t bottom() const
Definition: rect.h:65
SEAM * chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:229
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:379
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
int16_t total_containment(TBLOB *blob1, TBLOB *blob2)
Definition: chopper.cpp:657
TESSLINE * outlines
Definition: blobs.h:384
EDGEPT * next
Definition: blobs.h:176
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:43
WERD_CHOICE * best_choice
Definition: pageres.h:235
T get(ICOORD pos) const
Definition: matrix.h:228
WERD * word
Definition: pageres.h:189