tesseract  5.0.0-alpha-619-ge9db
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4  * and an iterator class to iterate over the words.
5  * Main purposes:
6  * Easy way to iterate over the words without a 3-nested loop.
7  * Holds data used during word recognition.
8  * Holds information about alternative spacing paths.
9  * Author: Phil Cheatle
10  *
11  * (C) Copyright 1992, Hewlett-Packard Ltd.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21  *
22  **********************************************************************/
23 
24 #include "pageres.h"
25 #include <cassert> // for assert
26 #include <cstdint> // for INT32_MAX
27 #include <cstring> // for strlen
28 #include "blamer.h" // for BlamerBundle
29 #include "blobs.h" // for TWERD, TBLOB
30 #include "boxword.h" // for BoxWord
31 #include "errcode.h" // for ASSERT_HOST
32 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
33 #include "ocrrow.h" // for ROW, ROW_IT
34 #include "pdblock.h" // for PDBLK
35 #include "polyblk.h" // for POLY_BLOCK
36 #include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY
37 #include "seam.h" // for SEAM, start_seam_list
38 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
39 #include "tprintf.h" // for tprintf
40 
41 struct Pix;
42 
45 
46 // Gain factor for computing thresholds that determine the ambiguity of a word.
47 static const double kStopperAmbiguityThresholdGain = 8.0;
48 // Constant offset for computing thresholds that determine the ambiguity of a
49 // word.
50 static const double kStopperAmbiguityThresholdOffset = 1.5;
51 // Max number of broken pieces to associate.
53 // Max ratio of word box height to line size to allow it to be processed as
54 // a line with other words.
55 const double kMaxWordSizeRatio = 1.25;
56 // Max ratio of line box height to line size to allow a new word to be added.
57 const double kMaxLineSizeRatio = 1.25;
58 // Max ratio of word gap to line size to allow a new word to be added.
59 const double kMaxWordGapRatio = 2.0;
60 
61 // Computes and returns a threshold of certainty difference used to determine
62 // which words to keep, based on the adjustment factors of the two words.
63 // TODO(rays) This is horrible. Replace with an enhance params training model.
64 static double StopperAmbigThreshold(double f1, double f2) {
65  return (f2 - f1) * kStopperAmbiguityThresholdGain -
66  kStopperAmbiguityThresholdOffset;
67 }
68 
69 /*************************************************************************
70  * PAGE_RES::PAGE_RES
71  *
72  * Constructor for page results
73  *************************************************************************/
75  bool merge_similar_words,
76  BLOCK_LIST *the_block_list,
77  WERD_CHOICE **prev_word_best_choice_ptr) {
78  Init();
79  BLOCK_IT block_it(the_block_list);
80  BLOCK_RES_IT block_res_it(&block_res_list);
81  for (block_it.mark_cycle_pt();
82  !block_it.cycled_list(); block_it.forward()) {
83  block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
84  block_it.data()));
85  }
86  prev_word_best_choice = prev_word_best_choice_ptr;
87 }
88 
89 /*************************************************************************
90  * BLOCK_RES::BLOCK_RES
91  *
92  * Constructor for BLOCK results
93  *************************************************************************/
94 
95 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
96  ROW_IT row_it (the_block->row_list ());
97  ROW_RES_IT row_res_it(&row_res_list);
98 
99  char_count = 0;
100  rej_count = 0;
101  font_class = -1; //not assigned
102  x_height = -1.0;
103  font_assigned = false;
104  row_count = 0;
105 
106  block = the_block;
107 
108  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
109  row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
110  }
111 }
112 
113 /*************************************************************************
114  * ROW_RES::ROW_RES
115  *
116  * Constructor for ROW results
117  *************************************************************************/
118 
119 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
120  WERD_IT word_it(the_row->word_list());
121  WERD_RES_IT word_res_it(&word_res_list);
122  WERD_RES *combo = nullptr; // current combination of fuzzies
123  WERD *copy_word;
124 
125  char_count = 0;
126  rej_count = 0;
128 
129  row = the_row;
130  bool add_next_word = false;
131  TBOX union_box;
132  float line_height = the_row->x_height() + the_row->ascenders() -
133  the_row->descenders();
134  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
135  auto* word_res = new WERD_RES(word_it.data());
136  word_res->x_height = the_row->x_height();
137  if (add_next_word) {
138  ASSERT_HOST(combo != nullptr);
139  // We are adding this word to the combination.
140  word_res->part_of_combo = true;
141  combo->copy_on(word_res);
142  } else if (merge_similar_words) {
143  union_box = word_res->word->bounding_box();
144  add_next_word = !word_res->word->flag(W_REP_CHAR) &&
145  union_box.height() <= line_height * kMaxWordSizeRatio;
146  word_res->odd_size = !add_next_word;
147  }
148  WERD* next_word = word_it.data_relative(1);
149  if (merge_similar_words) {
150  if (add_next_word && !next_word->flag(W_REP_CHAR)) {
151  // Next word will be added on if all of the following are true:
152  // Not a rep char.
153  // Box height small enough.
154  // Union box height small enough.
155  // Horizontal gap small enough.
156  TBOX next_box = next_word->bounding_box();
157  int prev_right = union_box.right();
158  union_box += next_box;
159  if (next_box.height() > line_height * kMaxWordSizeRatio ||
160  union_box.height() > line_height * kMaxLineSizeRatio ||
161  next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
162  add_next_word = false;
163  }
164  }
165  next_word->set_flag(W_FUZZY_NON, add_next_word);
166  } else {
167  add_next_word = next_word->flag(W_FUZZY_NON);
168  }
169  if (add_next_word) {
170  if (combo == nullptr) {
171  copy_word = new WERD;
172  *copy_word = *(word_it.data()); // deep copy
173  combo = new WERD_RES(copy_word);
174  combo->x_height = the_row->x_height();
175  combo->combination = true;
176  word_res_it.add_to_end(combo);
177  }
178  word_res->part_of_combo = true;
179  } else {
180  combo = nullptr;
181  }
182  word_res_it.add_to_end(word_res);
183  }
184 }
185 
186 
187 WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
188  this->ELIST_LINK::operator=(source);
189  Clear();
190  if (source.combination) {
191  word = new WERD;
192  *word = *(source.word); // deep copy
193  } else {
194  word = source.word; // pt to same word
195  }
196  if (source.bln_boxes != nullptr)
197  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
198  if (source.chopped_word != nullptr)
199  chopped_word = new TWERD(*source.chopped_word);
200  if (source.rebuild_word != nullptr)
201  rebuild_word = new TWERD(*source.rebuild_word);
202  // TODO(rays) Do we ever need to copy the seam_array?
203  blob_row = source.blob_row;
204  denorm = source.denorm;
205  if (source.box_word != nullptr)
206  box_word = new tesseract::BoxWord(*source.box_word);
207  best_state = source.best_state;
208  correct_text = source.correct_text;
209  blob_widths = source.blob_widths;
210  blob_gaps = source.blob_gaps;
211  // None of the uses of operator= require the ratings matrix to be copied,
212  // so don't as it would be really slow.
213 
214  // Copy the cooked choices.
215  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
216  WERD_CHOICE_IT wc_dest_it(&best_choices);
217  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
218  const WERD_CHOICE *choice = wc_it.data();
219  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
220  }
221  if (!wc_dest_it.empty()) {
222  wc_dest_it.move_to_first();
223  best_choice = wc_dest_it.data();
224  } else {
225  best_choice = nullptr;
226  }
227 
228  if (source.raw_choice != nullptr) {
229  raw_choice = new WERD_CHOICE(*source.raw_choice);
230  } else {
231  raw_choice = nullptr;
232  }
233  if (source.ep_choice != nullptr) {
234  ep_choice = new WERD_CHOICE(*source.ep_choice);
235  } else {
236  ep_choice = nullptr;
237  }
238  reject_map = source.reject_map;
239  combination = source.combination;
240  part_of_combo = source.part_of_combo;
241  CopySimpleFields(source);
242  if (source.blamer_bundle != nullptr) {
243  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
244  }
245  return *this;
246 }
247 
248 // Copies basic fields that don't involve pointers that might be useful
249 // to copy when making one WERD_RES from another.
250 void WERD_RES::CopySimpleFields(const WERD_RES& source) {
251  tess_failed = source.tess_failed;
252  tess_accepted = source.tess_accepted;
254  done = source.done;
256  small_caps = source.small_caps;
257  odd_size = source.odd_size;
258  fontinfo = source.fontinfo;
259  fontinfo2 = source.fontinfo2;
262  x_height = source.x_height;
263  caps_height = source.caps_height;
265  guessed_x_ht = source.guessed_x_ht;
267  reject_spaces = source.reject_spaces;
268  uch_set = source.uch_set;
269  tesseract = source.tesseract;
270 }
271 
272 // Initializes a blank (default constructed) WERD_RES from one that has
273 // already been recognized.
274 // Use SetupFor*Recognition afterwards to complete the setup and make
275 // it ready for a retry recognition.
276 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
277  word = source.word;
278  CopySimpleFields(source);
279  if (source.blamer_bundle != nullptr) {
280  blamer_bundle = new BlamerBundle();
282  }
283 }
284 
285 // Sets up the members used in recognition: bln_boxes, chopped_word,
286 // seam_array, denorm. Returns false if
287 // the word is empty and sets up fake results. If use_body_size is
288 // true and row->body_size is set, then body_size will be used for
289 // blob normalization instead of xheight + ascrise. This flag is for
290 // those languages that are using CJK pitch model and thus it has to
291 // be true if and only if tesseract->textord_use_cjk_fp_model is
292 // true.
293 // If allow_detailed_fx is true, the feature extractor will receive fine
294 // precision outline information, allowing smoother features and better
295 // features on low resolution images.
296 // The norm_mode_hint sets the default mode for normalization in absence
297 // of any of the above flags.
298 // norm_box is used to override the word bounding box to determine the
299 // normalization scale and offset.
300 // Returns false if the word is empty and sets up fake results.
301 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
302  tesseract::Tesseract* tess, Pix* pix,
303  int norm_mode,
304  const TBOX* norm_box,
305  bool numeric_mode,
306  bool use_body_size,
307  bool allow_detailed_fx,
308  ROW *row, const BLOCK* block) {
309  auto norm_mode_hint =
310  static_cast<tesseract::OcrEngineMode>(norm_mode);
311  tesseract = tess;
312  POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
313  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
314  word->cblob_list()->empty()) ||
315  (pb != nullptr && !pb->IsText())) {
316  // Empty words occur when all the blobs have been moved to the rej_blobs
317  // list, which seems to occur frequently in junk.
318  SetupFake(unicharset_in);
319  word->set_flag(W_REP_CHAR, false);
320  return false;
321  }
322  ClearResults();
323  SetupWordScript(unicharset_in);
324  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
325  float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
326  ? row->body_size() : x_height;
327  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
328  word_xheight, baseline_shift, numeric_mode,
329  norm_mode_hint, norm_box, &denorm);
330  blob_row = row;
331  SetupBasicsFromChoppedWord(unicharset_in);
333  int num_blobs = chopped_word->NumBlobs();
334  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
335  tess_failed = false;
336  return true;
337 }
338 
339 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
340 // accumulators from a made chopped word. We presume the fields are already
341 // empty.
342 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
347 }
348 
349 // Sets up the members used in recognition for an empty recognition result:
350 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
351 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
352  ClearResults();
353  SetupWordScript(unicharset_in);
354  chopped_word = new TWERD;
355  rebuild_word = new TWERD;
358  int blob_count = word->cblob_list()->length();
359  if (blob_count > 0) {
360  auto** fake_choices = new BLOB_CHOICE*[blob_count];
361  // For non-text blocks, just pass any blobs through to the box_word
362  // and call the word failed with a fake classification.
363  C_BLOB_IT b_it(word->cblob_list());
364  int blob_id = 0;
365  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
366  TBOX box = b_it.data()->bounding_box();
367  box_word->InsertBox(box_word->length(), box);
368  fake_choices[blob_id++] = new BLOB_CHOICE;
369  }
370  FakeClassifyWord(blob_count, fake_choices);
371  delete [] fake_choices;
372  } else {
373  auto* word = new WERD_CHOICE(&unicharset_in);
374  word->make_bad();
376  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
377  LogNewCookedChoice(1, false, word);
378  }
379  tess_failed = true;
380  done = true;
381 }
382 
383 void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
384  uch_set = &uch;
385  int script = uch.default_sid();
386  word->set_script_id(script);
388  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
389 }
390 
391 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
393  if (blamer_bundle != nullptr) {
395  }
396 }
397 
398 // Computes the blob_widths and blob_gaps from the chopped_word.
401  blob_gaps.truncate(0);
402  int num_blobs = chopped_word->NumBlobs();
403  for (int b = 0; b < num_blobs; ++b) {
404  TBLOB *blob = chopped_word->blobs[b];
405  TBOX box = blob->bounding_box();
406  blob_widths.push_back(box.width());
407  if (b + 1 < num_blobs) {
409  chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
410  }
411  }
412 }
413 
414 // Updates internal data to account for a new SEAM (chop) at the given
415 // blob_number. Fixes the ratings matrix and states in the choices, as well
416 // as the blob widths and gaps.
417 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
418  // Insert the seam into the SEAMS array.
419  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
420  seam_array.insert(seam, blob_number);
421  if (ratings != nullptr) {
422  // Expand the ratings matrix.
423  ratings = ratings->ConsumeAndMakeBigger(blob_number);
424  // Fix all the segmentation states.
425  if (raw_choice != nullptr)
426  raw_choice->UpdateStateForSplit(blob_number);
427  WERD_CHOICE_IT wc_it(&best_choices);
428  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
429  WERD_CHOICE* choice = wc_it.data();
430  choice->UpdateStateForSplit(blob_number);
431  }
433  }
434 }
435 
436 // Returns true if all the word choices except the first have adjust_factors
437 // worse than the given threshold.
438 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
439  // The choices are not changed by this iteration.
440  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
441  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
442  WERD_CHOICE* choice = wc_it.data();
443  if (choice->adjust_factor() <= threshold)
444  return false;
445  }
446  return true;
447 }
448 
449 // Returns true if the current word is ambiguous (by number of answers or
450 // by dangerous ambigs.)
451 bool WERD_RES::IsAmbiguous() {
452  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
453 }
454 
455 // Returns true if the ratings matrix size matches the sum of each of the
456 // segmentation states.
458  int ratings_dim = ratings->dimension();
459  if (raw_choice->TotalOfStates() != ratings_dim) {
460  tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
461  raw_choice->TotalOfStates(), ratings_dim);
462  return false;
463  }
464  WERD_CHOICE_IT it(&best_choices);
465  int index = 0;
466  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
467  WERD_CHOICE* choice = it.data();
468  if (choice->TotalOfStates() != ratings_dim) {
469  tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
470  index, choice->TotalOfStates(), ratings_dim);
471  return false;
472  }
473  }
474  return true;
475 }
476 
477 // Prints a list of words found if debug is true or the word result matches
478 // the word_to_debug.
479 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
480  if (debug ||
481  (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
482  best_choice->unichar_string() == STRING(word_to_debug))) {
483  if (raw_choice != nullptr)
484  raw_choice->print("\nBest Raw Choice");
485 
486  WERD_CHOICE_IT it(&best_choices);
487  int index = 0;
488  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
489  WERD_CHOICE* choice = it.data();
490  STRING label;
491  label.add_str_int("\nCooked Choice #", index);
492  choice->print(label.c_str());
493  }
494  }
495 }
496 
497 // Prints the top choice along with the accepted/done flags.
498 void WERD_RES::DebugTopChoice(const char* msg) const {
499  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
501  if (best_choice == nullptr)
502  tprintf("<Null choice>\n");
503  else
504  best_choice->print(msg);
505 }
506 
507 // Removes from best_choices all choices which are not within a reasonable
508 // range of the best choice.
509 // TODO(rays) incorporate the information used here into the params training
510 // re-ranker, in place of this heuristic that is based on the previous
511 // adjustment factor.
512 void WERD_RES::FilterWordChoices(int debug_level) {
513  if (best_choice == nullptr || best_choices.singleton())
514  return;
515 
516  if (debug_level >= 2)
517  best_choice->print("\nFiltering against best choice");
518  WERD_CHOICE_IT it(&best_choices);
519  int index = 0;
520  for (it.forward(); !it.at_first(); it.forward(), ++index) {
521  WERD_CHOICE* choice = it.data();
522  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
523  choice->adjust_factor());
524  // i, j index the blob choice in choice, best_choice.
525  // chunk is an index into the chopped_word blobs (AKA chunks).
526  // Since the two words may use different segmentations of the chunks, we
527  // iterate over the chunks to find out whether a comparable blob
528  // classification is much worse than the best result.
529  int i = 0, j = 0, chunk = 0;
530  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
531  // and best_chunk are the indices of the first chunk in the NEXT blob,
532  // i.e. we don't have to increment i, j while chunk < choice_chunk and
533  // best_chunk respectively.
534  int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
535  while (i < choice->length() && j < best_choice->length()) {
536  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
537  choice->certainty(i) - best_choice->certainty(j) < threshold) {
538  if (debug_level >= 2) {
539  choice->print("WorstCertaintyDiffWorseThan");
540  tprintf(
541  "i %d j %d Choice->Blob[i].Certainty %.4g"
542  " WorstOtherChoiceCertainty %g Threshold %g\n",
543  i, j, choice->certainty(i), best_choice->certainty(j), threshold);
544  tprintf("Discarding bad choice #%d\n", index);
545  }
546  delete it.extract();
547  break;
548  }
549  ++chunk;
550  // If needed, advance choice_chunk to keep up with chunk.
551  while (choice_chunk < chunk && ++i < choice->length())
552  choice_chunk += choice->state(i);
553  // If needed, advance best_chunk to keep up with chunk.
554  while (best_chunk < chunk && ++j < best_choice->length())
555  best_chunk += best_choice->state(j);
556  }
557  }
558 }
559 
560 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
561  float min_rating,
562  float max_rating,
563  float rating_margin,
564  float* thresholds) {
565  int chunk = 0;
566  int end_chunk = best_choice->state(0);
567  int end_raw_chunk = raw_choice->state(0);
568  int raw_blob = 0;
569  for (int i = 0; i < best_choice->length(); i++, thresholds++) {
570  float avg_rating = 0.0f;
571  int num_error_chunks = 0;
572 
573  // For each chunk in best choice blob i, count non-matching raw results.
574  while (chunk < end_chunk) {
575  if (chunk >= end_raw_chunk) {
576  ++raw_blob;
577  end_raw_chunk += raw_choice->state(raw_blob);
578  }
579  if (best_choice->unichar_id(i) !=
580  raw_choice->unichar_id(raw_blob)) {
581  avg_rating += raw_choice->certainty(raw_blob);
582  ++num_error_chunks;
583  }
584  ++chunk;
585  }
586 
587  if (num_error_chunks > 0) {
588  avg_rating /= num_error_chunks;
589  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
590  } else {
591  *thresholds = max_rating;
592  }
593 
594  if (*thresholds > max_rating)
595  *thresholds = max_rating;
596  if (*thresholds < min_rating)
597  *thresholds = min_rating;
598  }
599 }
600 
601 // Saves a copy of the word_choice if it has the best unadjusted rating.
602 // Returns true if the word_choice was the new best.
603 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) {
604  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
605  delete raw_choice;
606  raw_choice = new WERD_CHOICE(*word_choice);
608  return true;
609  }
610  return false;
611 }
612 
613 // Consumes word_choice by adding it to best_choices (taking ownership) if
614 // the certainty for word_choice is within some distance of the best choice in
615 // best_choices, or by deleting the word_choice and returning false.
616 // The best_choices list is kept in sorted order by rating. Duplicates are
617 // removed, and the list is kept no longer than max_num_choices in length.
618 // Returns true if the word_choice is still a valid pointer.
619 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
620  WERD_CHOICE* word_choice) {
621  if (best_choice != nullptr) {
622  // Throw out obviously bad choices to save some work.
623  // TODO(rays) Get rid of this! This piece of code produces different
624  // results according to the order in which words are found, which is an
625  // undesirable behavior. It would be better to keep all the choices and
626  // prune them later when more information is available.
627  float max_certainty_delta =
628  StopperAmbigThreshold(best_choice->adjust_factor(),
629  word_choice->adjust_factor());
630  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
631  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
632  if (word_choice->certainty() - best_choice->certainty() <
633  max_certainty_delta) {
634  if (debug) {
635  STRING bad_string;
636  word_choice->string_and_lengths(&bad_string, nullptr);
637  tprintf("Discarding choice \"%s\" with an overly low certainty"
638  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
639  bad_string.c_str(), word_choice->certainty(),
641  max_certainty_delta + best_choice->certainty());
642  }
643  delete word_choice;
644  return false;
645  }
646  }
647 
648  // Insert in the list in order of increasing rating, but knock out worse
649  // string duplicates.
650  WERD_CHOICE_IT it(&best_choices);
651  const STRING& new_str = word_choice->unichar_string();
652  bool inserted = false;
653  int num_choices = 0;
654  if (!it.empty()) {
655  do {
656  WERD_CHOICE* choice = it.data();
657  if (choice->rating() > word_choice->rating() && !inserted) {
658  // Time to insert.
659  it.add_before_stay_put(word_choice);
660  inserted = true;
661  if (num_choices == 0)
662  best_choice = word_choice; // This is the new best.
663  ++num_choices;
664  }
665  if (choice->unichar_string() == new_str) {
666  if (inserted) {
667  // New is better.
668  delete it.extract();
669  } else {
670  // Old is better.
671  if (debug) {
672  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
673  new_str.c_str(), word_choice->rating(), choice->rating());
674  }
675  delete word_choice;
676  return false;
677  }
678  } else {
679  ++num_choices;
680  if (num_choices > max_num_choices)
681  delete it.extract();
682  }
683  it.forward();
684  } while (!it.at_first());
685  }
686  if (!inserted && num_choices < max_num_choices) {
687  it.add_to_end(word_choice);
688  inserted = true;
689  if (num_choices == 0)
690  best_choice = word_choice; // This is the new best.
691  }
692  if (debug) {
693  if (inserted)
694  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
695  else
696  tprintf("Poor");
697  word_choice->print(" Word Choice");
698  }
699  if (!inserted) {
700  delete word_choice;
701  return false;
702  }
703  return true;
704 }
705 
706 
// Simple helper that transfers ownership of the pointee from *src to *dest:
// whatever *dest pointed to is deleted first, *dest then takes over *src's
// pointer, and *src is left null.
template <class T>
static void MovePointerData(T** dest, T** src) {
  T*& receiver = *dest;
  T*& giver = *src;
  delete receiver;
  receiver = giver;
  giver = nullptr;
}
714 
715 // Prints a brief list of all the best choices.
716 void WERD_RES::PrintBestChoices() const {
717  STRING alternates_str;
718  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
719  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
720  if (!it.at_first()) alternates_str += "\", \"";
721  alternates_str += it.data()->unichar_string();
722  }
723  tprintf("Alternates for \"%s\": {\"%s\"}\n",
724  best_choice->unichar_string().c_str(), alternates_str.c_str());
725 }
726 
727 // Returns the sum of the widths of the blob between start_blob and last_blob
728 // inclusive.
729 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
730  int result = 0;
731  for (int b = start_blob; b <= last_blob; ++b) {
732  result += blob_widths[b];
733  if (b < last_blob)
734  result += blob_gaps[b];
735  }
736  return result;
737 }
738 // Returns the width of a gap between the specified blob and the next one.
739 int WERD_RES::GetBlobsGap(int blob_index) {
740  if (blob_index < 0 || blob_index >= blob_gaps.size())
741  return 0;
742  return blob_gaps[blob_index];
743 }
744 
745 // Returns the BLOB_CHOICE corresponding to the given index in the
746 // best choice word taken from the appropriate cell in the ratings MATRIX.
747 // Borrowed pointer, so do not delete. May return nullptr if there is no
748 // BLOB_CHOICE matching the unichar_id at the given index.
749 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const {
750  if (index < 0 || index >= best_choice->length()) return nullptr;
751  BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
752  return FindMatchingChoice(best_choice->unichar_id(index), choices);
753 }
754 
755 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
756 // best choice word taken from the appropriate cell in the ratings MATRIX.
757 // Borrowed pointer, so do not delete.
758 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
759  return best_choice->blob_choices(index, ratings);
760 }
761 
762 // Moves the results fields from word to this. This takes ownership of all
763 // the data, so src can be destructed.
765  denorm = word->denorm;
766  blob_row = word->blob_row;
767  MovePointerData(&chopped_word, &word->chopped_word);
768  MovePointerData(&rebuild_word, &word->rebuild_word);
769  MovePointerData(&box_word, &word->box_word);
771  seam_array = word->seam_array;
772  word->seam_array.clear();
773  best_state.move(&word->best_state);
774  correct_text.move(&word->correct_text);
775  blob_widths.move(&word->blob_widths);
776  blob_gaps.move(&word->blob_gaps);
777  if (ratings != nullptr) ratings->delete_matrix_pointers();
778  MovePointerData(&ratings, &word->ratings);
779  best_choice = word->best_choice;
780  MovePointerData(&raw_choice, &word->raw_choice);
781  best_choices.clear();
782  WERD_CHOICE_IT wc_it(&best_choices);
783  wc_it.add_list_after(&word->best_choices);
784  reject_map = word->reject_map;
785  if (word->blamer_bundle != nullptr) {
786  assert(blamer_bundle != nullptr);
787  blamer_bundle->CopyResults(*(word->blamer_bundle));
788  }
790 }
791 
792 // Replace the best choice and rebuild box word.
793 // choice must be from the current best_choices list.
795  best_choice = choice;
797  SetupBoxWord();
798  // Make up a fake reject map of the right length to keep the
799  // rejection pass happy.
803 }
804 
805 // Builds the rebuild_word and sets the best_state from the chopped_word and
806 // the best_choice->state.
808  ASSERT_HOST(best_choice != nullptr);
809  delete rebuild_word;
810  rebuild_word = new TWERD;
811  if (seam_array.empty())
813  best_state.truncate(0);
814  int start = 0;
815  for (int i = 0; i < best_choice->length(); ++i) {
816  int length = best_choice->state(i);
817  best_state.push_back(length);
818  if (length > 1) {
820  start + length - 1);
821  }
822  TBLOB* blob = chopped_word->blobs[start];
823  rebuild_word->blobs.push_back(new TBLOB(*blob));
824  if (length > 1) {
826  start + length - 1);
827  }
828  start += length;
829  }
830 }
831 
832 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
833 // Also sets up the output box_word.
835  delete rebuild_word;
837  SetupBoxWord();
838  int word_len = box_word->length();
839  best_state.reserve(word_len);
840  correct_text.reserve(word_len);
841  for (int i = 0; i < word_len; ++i) {
844  }
845 }
846 
847 // Sets/replaces the box_word with one made from the rebuild_word.
848 void WERD_RES::SetupBoxWord() {
849  delete box_word;
853 }
854 
855 // Sets up the script positions in the output best_choice using the best_choice
856 // to get the unichars, and the unicharset to get the target positions.
859 }
860 // Sets all the blobs in all the words (raw choice and best choices) to be
861 // the given position. (When a sub/superscript is recognized as a separate
862 // word, it falls victim to the rule that a whole word cannot be sub or
863 // superscript, so this function overrides that problem.)
866  WERD_CHOICE_IT wc_it(&best_choices);
867  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
868  wc_it.data()->SetAllScriptPositions(position);
869 }
870 
871 // Classifies the word with some already-calculated BLOB_CHOICEs.
872 // The choices are an array of blob_count pointers to BLOB_CHOICE,
873 // providing a single classifier result for each blob.
874 // The BLOB_CHOICEs are consumed and the word takes ownership.
875 // The number of blobs in the box_word must match blob_count.
// NOTE(review): listing lines 880 and 889 are absent from this extract, so
// two statements of this function are not visible here.
876 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
877  // Setup the WERD_RES.
// Preconditions: a box_word exists and has exactly blob_count entries.
878  ASSERT_HOST(box_word != nullptr);
879  ASSERT_HOST(blob_count == box_word->length());
// Discard any existing ratings before allocating the replacement matrix.
881  ClearRatings();
882  ratings = new MATRIX(blob_count, 1);
// Each supplied choice is wrapped in its own one-entry BLOB_CHOICE_LIST and
// stored on the matrix diagonal at (c, c).
883  for (int c = 0; c < blob_count; ++c) {
884  auto* choice_list = new BLOB_CHOICE_LIST;
885  BLOB_CHOICE_IT choice_it(choice_list);
886  choice_it.add_after_then_move(choices[c]);
887  ratings->put(c, c, choice_list);
888  }
// One reject_map entry and one best_state entry (value 1) per blob; the word
// is then flagged as done.
890  reject_map.initialise(blob_count);
891  best_state.init_to_size(blob_count, 1);
892  done = true;
893 }
894 
895 // Creates a WERD_CHOICE for the word using the top choices from the leading
896 // diagonal of the ratings matrix.
// NOTE(review): the signature (listing line 897) is absent from this extract;
// `permuter` used below is presumably its parameter.
// One output character is produced per diagonal cell of the ratings matrix.
898  int num_blobs = ratings->dimension();
899  auto* word_choice = new WERD_CHOICE(uch_set, num_blobs);
900  word_choice->set_permuter(permuter);
901  for (int b = 0; b < num_blobs; ++b) {
// Defaults used when cell (b, b) has no choices: a space with bad scores.
902  UNICHAR_ID unichar_id = UNICHAR_SPACE;
903  // Initialize rating and certainty like in WERD_CHOICE::make_bad().
904  float rating = WERD_CHOICE::kBadRating;
905  float certainty = -FLT_MAX;
906  BLOB_CHOICE_LIST* choices = ratings->get(b, b);
907  if (choices != nullptr && !choices->empty()) {
// A freshly constructed iterator is read directly, so this takes the entry at
// the head of the list.
908  BLOB_CHOICE_IT bc_it(choices);
909  BLOB_CHOICE* choice = bc_it.data();
910  unichar_id = choice->unichar_id();
911  rating = choice->rating();
912  certainty = choice->certainty();
913  }
914  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
915  certainty);
916  }
// The same word_choice object is logged as both the raw and the cooked choice.
917  LogNewRawChoice(word_choice);
918  // Ownership of word_choice taken by word here.
919  LogNewCookedChoice(1, false, word_choice);
920 }
921 
922 // Copies the best_choice strings to the correct_text for adaption/training.
// NOTE(review): the signature and first statement (listing lines 923-924) are
// absent from this extract.
// Requires a best_choice; appends one STRING per unichar of best_choice to
// correct_text, using the unicharset's string for each id.
925  ASSERT_HOST(best_choice != nullptr);
926  for (int i = 0; i < best_choice->length(); ++i) {
927  UNICHAR_ID choice_id = best_choice->unichar_id(i);
928  const char* blob_choice = uch_set->id_to_unichar(choice_id);
929  correct_text.push_back(STRING(blob_choice));
930  }
931 }
932 
933 // Merges 2 adjacent blobs in the result if the permanent callback
934 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
935 // callback box_cb is nullptr or returns true, setting the merged blob
936 // result to the class returned from class_cb.
937 // Returns true if anything was merged.
// NOTE(review): the first signature line (listing line 938) and listing line
// 952 are absent from this extract; judging by the surrounding comments the
// missing statement performs the actual merge of blobs i and i + 1.
939  std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> class_cb,
940  std::function<bool(const TBOX&, const TBOX&)> box_cb) {
// A non-empty best_choice requires a ratings matrix to update below.
941  ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
942  bool modified = false;
// best_choice->length() is re-read every iteration, so the bound follows any
// change in the word's length made inside the loop.
943  for (int i = 0; i + 1 < best_choice->length(); ++i) {
944  UNICHAR_ID new_id = class_cb(best_choice->unichar_id(i),
945  best_choice->unichar_id(i+1));
// box_cb is only consulted when class_cb proposed a replacement id.
946  if (new_id != INVALID_UNICHAR_ID &&
947  (box_cb == nullptr || box_cb(box_word->BlobBox(i),
948  box_word->BlobBox(i + 1)))) {
949  // Raw choice should not be fixed.
950  best_choice->set_unichar_id(new_id, i);
951  modified = true;
// Widen the ratings band if the coordinate for position i is not valid in it.
953  const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
954  if (!coord.Valid(*ratings)) {
955  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
956  }
// Make sure the choice list for position i contains an entry for new_id.
957  BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
958  if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
959  // Insert a fake result.
960  auto* blob_choice = new BLOB_CHOICE;
961  blob_choice->set_unichar_id(new_id);
962  BLOB_CHOICE_IT bc_it(blob_choices);
963  bc_it.add_before_then_move(blob_choice);
964  }
965  }
966  }
967  return modified;
968 }
969 
970 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
971 // all the data to account for the change.
972 void WERD_RES::MergeAdjacentBlobs(int index) {
973  if (reject_map.length() == best_choice->length())
974  reject_map.remove_pos(index);
975  best_choice->remove_unichar_id(index + 1);
976  rebuild_word->MergeBlobs(index, index + 2);
977  box_word->MergeBoxes(index, index + 2);
978  if (index + 1 < best_state.size()) {
979  best_state[index] += best_state[index + 1];
980  best_state.remove(index + 1);
981  }
982 }
983 
984 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
985 // training data.
986 
// Utility function for fix_quotes.
// Returns non-zero if the character starting at signed_str, whose UTF-8
// encoding is `length` bytes long, is a simple quote:
//   - length 1: ASCII apostrophe (') or backtick (`);
//   - length 3: the byte sequence E2 80 98 or E2 80 99 (the UTF-8 encodings
//     of U+2018 and U+2019, the curved single quotes).
// Any other length, or a null signed_str, yields 0.
// Only the first `length` bytes are ever read.
static int is_simple_quote(const char* signed_str, int length) {
  if (signed_str == nullptr) return 0;
  // Compare as unsigned bytes so values >= 0x80 match on platforms where
  // plain char is signed.
  const auto* str = reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return str[0] == '\'' || str[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the E2 80 prefix.
    return str[0] == 0xe2 && str[1] == 0x80 &&
           (str[2] == 0x98 || str[2] == 0x99);
  }
  return 0;
}
1004 // Callback helper for fix_quotes returns a double quote if both
1005 // arguments are quote, otherwise INVALID_UNICHAR_ID.
// NOTE(review): the signature (listing line 1006) and the closing brace
// (listing line 1013) are absent from this extract; id1 and id2 are presumably
// the two UNICHAR_ID parameters.
// Looks up the unichar strings of both ids; if each one passes
// is_simple_quote, the id of "\"" is returned, otherwise INVALID_UNICHAR_ID.
1007  const char *ch = uch_set->id_to_unichar(id1);
1008  const char *next_ch = uch_set->id_to_unichar(id2);
1009  if (is_simple_quote(ch, strlen(ch)) &&
1010  is_simple_quote(next_ch, strlen(next_ch)))
1011  return uch_set->unichar_to_id("\"");
1012  return INVALID_UNICHAR_ID;
1014 
1015 // Change pairs of quotes to double quotes.
// NOTE(review): listing line 1018 (the second half of the guard condition) is
// absent from this extract.
// Runs ConditionalBlobMerge with BothQuotes as the class callback and no box
// callback (nullptr), so the decision rests on the class callback alone.
1016 void WERD_RES::fix_quotes() {
1017  if (!uch_set->contains_unichar("\"") ||
1019  return; // Don't create it if it is disallowed.
1020 
1021  using namespace std::placeholders; // for _1, _2
1022  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2),
1023  nullptr);
1024 }
1026 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1027 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
// NOTE(review): the signature (listing line 1028) is absent from this extract;
// id1 and id2 are presumably the two UNICHAR_ID parameters.
// Both unichars must be exactly one byte long and each be '-' or '~' for the
// pair to map to the id of "-"; otherwise INVALID_UNICHAR_ID is returned.
1029  const char *ch = uch_set->id_to_unichar(id1);
1030  const char *next_ch = uch_set->id_to_unichar(id2);
1031  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1032  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
1033  return uch_set->unichar_to_id("-");
1034  return INVALID_UNICHAR_ID;
1035 }
1037 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1038 // (assuming both on the same textline, are in order and a chopped em dash.)
1039 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
1040  return box1.right() >= box2.left();
1041 }
1043 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1044 // Typically a long dash which has been segmented.
// NOTE(review): listing line 1047 (the second half of the guard condition) is
// absent from this extract.
// Runs ConditionalBlobMerge with BothHyphens as the class callback and
// HyphenBoxesOverlap as the box callback, so both must agree before a pair
// is merged.
1045 void WERD_RES::fix_hyphens() {
1046  if (!uch_set->contains_unichar("-") ||
1048  return; // Don't create it if it is disallowed.
1049 
1050  using namespace std::placeholders; // for _1, _2
1051  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
1052  std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
1053 }
1055 // Callback helper for merge_tess_fails returns a space if both
1056 // arguments are space, otherwise INVALID_UNICHAR_ID.
// NOTE(review): the signature (listing line 1057) and the closing brace
// (listing line 1062) are absent from this extract.
// Returns id1 only when both ids are equal and are the id of " ";
// otherwise INVALID_UNICHAR_ID.
1058  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
1059  return id1;
1060  else
1061  return INVALID_UNICHAR_ID;
1063 
1064 // Change pairs of tess failures to a single one
// NOTE(review): the signature (listing line 1065) and the start of the `if`
// condition (listing line 1067) are absent from this extract; the visible tail
// `this, _1, _2), nullptr)` shows a bound member callback with no box callback.
// When the condition holds, reject_map and box_word are asserted to have the
// same length as best_choice.
1066  using namespace std::placeholders; // for _1, _2
1068  this, _1, _2), nullptr)) {
1069  int len = best_choice->length();
1070  ASSERT_HOST(reject_map.length() == len);
1071  ASSERT_HOST(box_word->length() == len);
1072  }
1073 }
1075 // Returns true if the collection of count pieces, starting at start, are all
1076 // natural connected components, ie there are no real chops involved.
// NOTE(review): the closing brace (listing line 1086) is absent from this
// extract.
// `count` pieces are separated by count - 1 seams, so seams start through
// start + count - 2 are examined. Indices outside seam_array and null seams
// are ignored rather than treated as failures.
1077 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1078  // all seams must have no splits.
1079  for (int index = start; index < start + count - 1; ++index) {
1080  if (index >= 0 && index < seam_array.size()) {
1081  SEAM* seam = seam_array[index];
1082  if (seam != nullptr && seam->HasAnySplits()) return false;
1083  }
1084  }
1085  return true;
1087 
1088 
1091 }
1092 
// NOTE(review): listing line 1100 is absent from this extract, so the final
// statement of this function is not visible here.
// The word is deleted only when `combination` is set; in either case the
// pointer is reset. The blamer_bundle is always deleted and reset.
1093 void WERD_RES::Clear() {
1094  if (combination) {
1095  delete word;
1096  }
1097  word = nullptr;
1098  delete blamer_bundle;
1099  blamer_bundle = nullptr;
1101 }
1102 
// NOTE(review): listing lines 1120 and 1125 are absent from this extract, so
// two statements of this function are not visible here.
// Resets the recognition state visible below: flags and font info are
// zeroed, the owned bln_boxes / chopped_word / rebuild_word / box_word are
// deleted and nulled, the per-blob vectors are emptied, and the ratings are
// released via ClearRatings(). blob_row is only reset, not deleted.
1103 void WERD_RES::ClearResults() {
1104  done = false;
1105  fontinfo = nullptr;
1106  fontinfo2 = nullptr;
1107  fontinfo_id_count = 0;
1108  fontinfo_id2_count = 0;
1109  delete bln_boxes;
1110  bln_boxes = nullptr;
1111  blob_row = nullptr;
1112  delete chopped_word;
1113  chopped_word = nullptr;
1114  delete rebuild_word;
1115  rebuild_word = nullptr;
1116  delete box_word;
1117  box_word = nullptr;
1118  best_state.clear();
1119  correct_text.clear();
1121  seam_array.clear();
1122  blob_widths.clear();
1123  blob_gaps.clear();
1124  ClearRatings();
// The blamer_bundle itself is kept; only its results are cleared.
1126  if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
1127 }
// NOTE(review): the signature (listing line 1128) is absent from this extract.
// best_choice is reset without being deleted here, whereas raw_choice and
// ep_choice are deleted; presumably best_choice points into best_choices,
// which is cleared below - verify against the class declaration.
1129  best_choice = nullptr;
1130  delete raw_choice;
1131  raw_choice = nullptr;
1132  best_choices.clear();
1133  delete ep_choice;
1134  ep_choice = nullptr;
1135 }
// NOTE(review): listing line 1138 is absent from this extract, so one
// statement executed before the matrix is deleted is not visible here.
// Does nothing when there is no ratings matrix; otherwise deletes it and
// resets the pointer.
1136 void WERD_RES::ClearRatings() {
1137  if (ratings != nullptr) {
1139  delete ratings;
1140  ratings = nullptr;
1141  }
1142 }
1143 
1144 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1145  ASSERT_HOST(page_res == other.page_res);
1146  if (other.block_res == nullptr) {
1147  // other points to the end of the page.
1148  if (block_res == nullptr)
1149  return 0;
1150  return -1;
1151  }
1152  if (block_res == nullptr) {
1153  return 1; // we point to the end of the page.
1154  }
1155  if (block_res == other.block_res) {
1156  if (other.row_res == nullptr || row_res == nullptr) {
1157  // this should only happen if we hit an image block.
1158  return 0;
1159  }
1160  if (row_res == other.row_res) {
1161  // we point to the same block and row.
1162  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1163  if (word_res == other.word_res) {
1164  // we point to the same word!
1165  return 0;
1166  }
1167 
1168  WERD_RES_IT word_res_it(&row_res->word_res_list);
1169  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1170  word_res_it.forward()) {
1171  if (word_res_it.data() == word_res) {
1172  return -1;
1173  } else if (word_res_it.data() == other.word_res) {
1174  return 1;
1175  }
1176  }
1177  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1178  }
1179 
1180  // we both point to the same block, but different rows.
1181  ROW_RES_IT row_res_it(&block_res->row_res_list);
1182  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1183  row_res_it.forward()) {
1184  if (row_res_it.data() == row_res) {
1185  return -1;
1186  } else if (row_res_it.data() == other.row_res) {
1187  return 1;
1188  }
1189  }
1190  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1191  }
1192 
1193  // We point to different blocks.
1194  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1195  for (block_res_it.mark_cycle_pt();
1196  !block_res_it.cycled_list(); block_res_it.forward()) {
1197  if (block_res_it.data() == block_res) {
1198  return -1;
1199  } else if (block_res_it.data() == other.block_res) {
1200  return 1;
1201  }
1202  }
1203  // Shouldn't happen...
1204  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1205  return 0;
1206 }
1207 
1208 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1209 // before the current position. The simple fields of the WERD_RES are copied
1210 // from clone_res and the resulting WERD_RES is returned for further setup
1211 // with best_choice etc.
// NOTE(review): the first signature line (listing line 1212) is absent from
// this extract; clone_res used below is presumably declared on it.
1213  WERD* new_word) {
1214  // Make a WERD_RES for the new_word.
1215  auto* new_res = new WERD_RES(new_word);
1216  new_res->CopySimpleFields(clone_res);
1217  new_res->combination = true;
1218  // Insert into the appropriate place in the ROW_RES.
// Linear search of the current row's list for the current word_res; the
// assert below fires if it is not found.
1219  WERD_RES_IT wr_it(&row()->word_res_list);
1220  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1221  WERD_RES* word = wr_it.data();
1222  if (word == word_res)
1223  break;
1224  }
1225  ASSERT_HOST(!wr_it.cycled_list());
1226  wr_it.add_before_then_move(new_res);
1227  if (wr_it.at_first()) {
1228  // This is the new first word, so reset the member iterator so it
1229  // detects the cycled_list state correctly.
1230  ResetWordIterator();
1231  }
1232  return new_res;
1233 }
1234 
1235 // Helper computes the boundaries between blobs in the word. The blob bounds
1236 // are likely very poor, if they come from LSTM, where it only outputs the
1237 // character at one pixel within it, so we find the midpoints between them.
1238 static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
1239  C_BLOB_LIST* next_word_blobs,
1240  GenericVector<int>* blob_ends) {
1241  C_BLOB_IT blob_it(word.word->cblob_list());
1242  for (int i = 0; i < word.best_state.size(); ++i) {
1243  int length = word.best_state[i];
1244  // Get the bounding box of the fake blobs
1245  TBOX blob_box = blob_it.data()->bounding_box();
1246  blob_it.forward();
1247  for (int b = 1; b < length; ++b) {
1248  blob_box += blob_it.data()->bounding_box();
1249  blob_it.forward();
1250  }
1251  // This blob_box is crap, so for now we are only looking for the
1252  // boundaries between them.
1253  int blob_end = INT32_MAX;
1254  if (!blob_it.at_first() || next_word_blobs != nullptr) {
1255  if (blob_it.at_first())
1256  blob_it.set_to_list(next_word_blobs);
1257  blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1258  }
1259  blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1260  blob_ends->push_back(blob_end);
1261  }
1262  blob_ends->back() = clip_box.right();
1263 }
1264 
1265 // Helper computes the bounds of a word by restricting it to existing words
1266 // that significantly overlap.
1267 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES>& words,
1268  int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1269  constexpr int kSignificantOverlapFraction = 4;
1270  TBOX clipped_box;
1271  TBOX current_box = words[w_index]->word->bounding_box();
1272  TBOX next_box;
1273  if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
1274  words[w_index + 1]->word != nullptr)
1275  next_box = words[w_index + 1]->word->bounding_box();
1276  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1277  w_it.forward()) {
1278  if (w_it.data() == nullptr || w_it.data()->word == nullptr) continue;
1279  TBOX w_box = w_it.data()->word->bounding_box();
1280  int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1281  int width_limit = w_box.width() / kSignificantOverlapFraction;
1282  int min_significant_overlap = std::max(height_limit, width_limit);
1283  int overlap = w_box.intersection(current_box).width();
1284  int prev_overlap = w_box.intersection(prev_box).width();
1285  int next_overlap = w_box.intersection(next_box).width();
1286  if (overlap > min_significant_overlap) {
1287  if (prev_overlap > min_significant_overlap) {
1288  // We have no choice but to use the LSTM word edge.
1289  clipped_box.set_left(current_box.left());
1290  } else if (next_overlap > min_significant_overlap) {
1291  // We have no choice but to use the LSTM word edge.
1292  clipped_box.set_right(current_box.right());
1293  } else {
1294  clipped_box += w_box;
1295  }
1296  }
1297  }
1298  if (clipped_box.height() <= 0) {
1299  clipped_box.set_top(current_box.top());
1300  clipped_box.set_bottom(current_box.bottom());
1301  }
1302  if (clipped_box.width() <= 0) clipped_box = current_box;
1303  return clipped_box;
1304 }
1305 
1306 // Helper moves the blob from src to dest. If it isn't contained by clip_box,
1307 // the blob is replaced by a fake that is contained.
1308 static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1309  const TBOX& clip_box) {
1310  C_BLOB* src_blob = src_it->extract();
1311  TBOX box = src_blob->bounding_box();
1312  if (!clip_box.contains(box)) {
1313  int left =
1314  ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1315  int right =
1316  ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1317  int top =
1318  ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1319  int bottom =
1320  ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1321  box = TBOX(left, bottom, right, top);
1322  delete src_blob;
1323  src_blob = C_BLOB::FakeBlob(box);
1324  }
1325  dest_it->add_after_then_move(src_blob);
1326  return box;
1327 }
1328 
1329 // Replaces the current WERD/WERD_RES with the given words. The given words
1330 // contain fake blobs that indicate the position of the characters. These are
1331 // replaced with real blobs from the current word as much as possible.
// NOTE(review): the signature (listing lines 1332-1333) and the closing brace
// (listing line 1436) are absent from this extract; `words` is used below as a
// pointer to a vector of WERD_RES pointers.
// An empty replacement set simply deletes the current word.
1334  if (words->empty()) {
1335  DeleteCurrentWord();
1336  return;
1337  }
1338  WERD_RES* input_word = word();
1339  // Set the BOL/EOL flags on the words from the input word.
// When the input word is not at BOL, its leading space count is carried over
// to the first replacement instead.
1340  if (input_word->word->flag(W_BOL)) {
1341  (*words)[0]->word->set_flag(W_BOL, true);
1342  } else {
1343  (*words)[0]->word->set_blanks(input_word->word->space());
1344  }
1345  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1346 
1347  // Move the blobs from the input word to the new set of words.
1348  // If the input word_res is a combination, then the replacements will also be
1349  // combinations, and will own their own words. If the input word_res is not a
1350  // combination, then the final replacements will not be either, (although it
1351  // is allowed for the input words to be combinations) and their words
1352  // will get put on the row list. This maintains the ownership rules.
1353  WERD_IT w_it(row()->row->word_list());
1354  if (!input_word->combination) {
1355  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1356  WERD* word = w_it.data();
1357  if (word == input_word->word)
1358  break;
1359  }
1360  // w_it is now set to the input_word's word.
1361  ASSERT_HOST(!w_it.cycled_list());
1362  }
1363  // Insert into the appropriate place in the ROW_RES.
1364  WERD_RES_IT wr_it(&row()->word_res_list);
1365  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1366  WERD_RES* word = wr_it.data();
1367  if (word == input_word)
1368  break;
1369  }
1370  ASSERT_HOST(!wr_it.cycled_list());
1371  // Since we only have an estimate of the bounds between blobs, use the blob
1372  // x-middle as the determiner of where to put the blobs
// Both the accepted and the rejected blob lists of the input word are sorted
// by x-middle so they can be consumed left to right below.
1373  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1374  src_b_it.sort(&C_BLOB::SortByXMiddle);
1375  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1376  rej_b_it.sort(&C_BLOB::SortByXMiddle);
// clip_box carries over between iterations: the previous word's bounds are
// passed as prev_box when computing the next word's bounds.
1377  TBOX clip_box;
1378  for (int w = 0; w < words->size(); ++w) {
1379  WERD_RES* word_w = (*words)[w];
1380  clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1381  // Compute blob boundaries.
1382  GenericVector<int> blob_ends;
1383  C_BLOB_LIST* next_word_blobs =
1384  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1385  ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1386  // Remove the fake blobs on the current word, but keep safe for back-up if
1387  // no blob can be found.
1388  C_BLOB_LIST fake_blobs;
1389  C_BLOB_IT fake_b_it(&fake_blobs);
1390  fake_b_it.add_list_after(word_w->word->cblob_list());
1391  fake_b_it.move_to_first();
1392  word_w->word->cblob_list()->clear();
1393  C_BLOB_IT dest_it(word_w->word->cblob_list());
1394  // Build the box word as we move the blobs.
1395  auto* box_word = new tesseract::BoxWord;
// One box per boundary: fake_b_it advances in step with i so the matching
// fake blob is available as a fallback.
1396  for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1397  int end_x = blob_ends[i];
1398  TBOX blob_box;
1399  // Add the blobs up to end_x.
1400  while (!src_b_it.empty() &&
1401  src_b_it.data()->bounding_box().x_middle() < end_x) {
1402  blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1403  src_b_it.forward();
1404  }
// Rejected blobs whose x-middle lies left of end_x are moved across as well.
1405  while (!rej_b_it.empty() &&
1406  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1407  blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1408  rej_b_it.forward();
1409  }
1410  if (blob_box.null_box()) {
1411  // Use the original box as a back-up.
1412  blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1413  }
1414  box_word->InsertBox(i, blob_box);
1415  }
1416  delete word_w->box_word;
1417  word_w->box_word = box_word;
1418  if (!input_word->combination) {
1419  // Insert word_w->word into the ROW. It doesn't own its word, so the
1420  // ROW needs to own it.
1421  w_it.add_before_stay_put(word_w->word);
1422  word_w->combination = false;
1423  }
1424  (*words)[w] = nullptr; // We are taking ownership.
1425  wr_it.add_before_stay_put(word_w);
1426  }
1427  // We have taken ownership of the words.
1428  words->clear();
1429  // Delete the current word, which has been replaced. We could just call
1430  // DeleteCurrentWord, but that would iterate both lists again, and we know
1431  // we are already in the right place.
1432  if (!input_word->combination)
1433  delete w_it.extract();
1434  delete wr_it.extract();
1435  ResetWordIterator();
1438 // Deletes the current WERD_RES and its underlying WERD.
// NOTE(review): the signature (listing line 1439) is absent from this extract.
1440  // Check that this word is as we expect. part_of_combos are NEVER iterated
1441  // by the normal iterator, so we should never be trying to delete them.
1442  ASSERT_HOST(!word_res->part_of_combo);
1443  if (!word_res->combination) {
1444  // Combinations own their own word, so we won't find the word on the
1445  // row's word_list, but it is legitimate to try to delete them.
1446  // Delete word from the ROW when not a combination.
1447  WERD_IT w_it(row()->row->word_list());
1448  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1449  if (w_it.data() == word_res->word) {
1450  break;
1451  }
1452  }
1453  ASSERT_HOST(!w_it.cycled_list());
1454  delete w_it.extract();
1455  }
1456  // Remove the WERD_RES for the new_word.
1457  // Remove the WORD_RES from the ROW_RES.
// word_res is nulled as soon as its list entry is found; the entry itself is
// extracted and deleted after the loop.
1458  WERD_RES_IT wr_it(&row()->word_res_list);
1459  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1460  if (wr_it.data() == word_res) {
1461  word_res = nullptr;
1462  break;
1463  }
1464  }
1465  ASSERT_HOST(!wr_it.cycled_list());
1466  delete wr_it.extract();
// Re-synchronise the member iterator and current/previous word pointers now
// that the list has changed.
1467  ResetWordIterator();
1468 }
1470 // Makes the current word a fuzzy space if not already fuzzy. Updates
1471 // corresponding part of combo if required.
// NOTE(review): the signature (listing line 1472) is absent from this extract.
// Nothing is changed if the current word already has W_FUZZY_SP or
// W_FUZZY_NON set.
1473  WERD* real_word = word_res->word;
1474  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1475  real_word->set_flag(W_FUZZY_SP, true);
1476  if (word_res->combination) {
1477  // The next word should be the corresponding part of combo, but we have
1478  // already stepped past it, so find it by search.
// The empty-bodied loop positions wr_it on word_res; the forward() after it
// steps to the entry that follows.
1479  WERD_RES_IT wr_it(&row()->word_res_list);
1480  for (wr_it.mark_cycle_pt();
1481  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1482  }
1483  wr_it.forward();
1484  ASSERT_HOST(wr_it.data()->part_of_combo);
1485  real_word = wr_it.data()->word;
1486  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1487  !real_word->flag(W_FUZZY_NON));
1488  real_word->set_flag(W_FUZZY_SP, true);
1489  }
1490  }
1491 }
1492 
1493 /*************************************************************************
1494  * PAGE_RES_IT::restart_page
1495  *
1496  * Set things up at the start of the page
1497  *************************************************************************/
1498 
1499 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
1500  block_res_it.set_to_list(&page_res->block_res_list);
1501  block_res_it.mark_cycle_pt();
1502  prev_block_res = nullptr;
1503  prev_row_res = nullptr;
1504  prev_word_res = nullptr;
1505  block_res = nullptr;
1506  row_res = nullptr;
1507  word_res = nullptr;
1508  next_block_res = nullptr;
1509  next_row_res = nullptr;
1510  next_word_res = nullptr;
1511  internal_forward(true, empty_ok);
1512  return internal_forward(false, empty_ok);
1513 }
1514 
1515 // Recovers from operations on the current word, such as in InsertCloneWord
1516 // and DeleteCurrentWord.
1517 // Resets the word_res_it so that it is one past the next_word_res, as
1518 // it should be after internal_forward. If next_row_res != row_res,
1519 // then the next_word_res is in the next row, so there is no need to do
1520 // anything to word_res_it, but it is still a good idea to reset the pointers
1521 // word_res and prev_word_res, which are still in the current row.
// NOTE(review): the signature (listing line 1522) is absent from this extract.
1523  if (row_res == next_row_res) {
1524  // Reset the member iterator so it can move forward and detect the
1525  // cycled_list state correctly.
// Walk from the first word up to next_word_res, tracking the last
// non-part_of_combo word seen as word_res (and the one before it as
// prev_word_res when the previous word is in the same row).
1526  word_res_it.move_to_first();
1527  for (word_res_it.mark_cycle_pt();
1528  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1529  word_res_it.forward()) {
1530  if (!word_res_it.data()->part_of_combo) {
1531  if (prev_row_res == row_res) prev_word_res = word_res;
1532  word_res = word_res_it.data();
1533  }
1534  }
// next_word_res must have been found; remember its position, then step the
// member iterator one past it.
1535  ASSERT_HOST(!word_res_it.cycled_list());
1536  wr_it_of_next_word = word_res_it;
1537  word_res_it.forward();
1538  } else {
1539  // word_res_it is OK, but reset word_res and prev_word_res if needed.
// The whole row is scanned with a local iterator, so word_res ends up as the
// last non-part_of_combo word of the row.
1540  WERD_RES_IT wr_it(&row_res->word_res_list);
1541  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1542  if (!wr_it.data()->part_of_combo) {
1543  if (prev_row_res == row_res) prev_word_res = word_res;
1544  word_res = wr_it.data();
1545  }
1546  }
1547  }
1548 }
1549 
1550 /*************************************************************************
1551  * PAGE_RES_IT::internal_forward
1552  *
1553  * Find the next word on the page. If empty_ok is true, then non-text blocks
1554  * and text blocks with no text are visited as if they contain a single
1555  * imaginary word in a single imaginary row. (word() and row() both return nullptr
1556  * in such a block and the return value is nullptr.)
1557  * If empty_ok is false, the old behaviour is maintained. Each real word
1558  * is visited and empty and non-text blocks and rows are skipped.
1559  * new_block is used to initialize the iterators for a new block.
1560  * The iterator maintains pointers to block, row and word for the previous,
1561  * current and next words. These are correct, regardless of block/row
1562  * boundaries. nullptr values denote start and end of the page.
1563  *************************************************************************/
1564 
1565 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1566  bool new_row = false;
1567 
1568  prev_block_res = block_res;
1569  prev_row_res = row_res;
1570  prev_word_res = word_res;
1571  block_res = next_block_res;
1572  row_res = next_row_res;
1573  word_res = next_word_res;
1574  wr_it_of_current_word = wr_it_of_next_word;
1575  next_block_res = nullptr;
1576  next_row_res = nullptr;
1577  next_word_res = nullptr;
1578 
1579  while (!block_res_it.cycled_list()) {
1580  if (new_block) {
1581  new_block = false;
1582  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1583  row_res_it.mark_cycle_pt();
1584  if (row_res_it.empty() && empty_ok) {
1585  next_block_res = block_res_it.data();
1586  break;
1587  }
1588  new_row = true;
1589  }
1590  while (!row_res_it.cycled_list()) {
1591  if (new_row) {
1592  new_row = false;
1593  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1594  word_res_it.mark_cycle_pt();
1595  }
1596  // Skip any part_of_combo words.
1597  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1598  word_res_it.forward();
1599  if (!word_res_it.cycled_list()) {
1600  next_block_res = block_res_it.data();
1601  next_row_res = row_res_it.data();
1602  next_word_res = word_res_it.data();
1603  wr_it_of_next_word = word_res_it;
1604  word_res_it.forward();
1605  goto foundword;
1606  }
1607  // end of row reached
1608  row_res_it.forward();
1609  new_row = true;
1610  }
1611  // end of block reached
1612  block_res_it.forward();
1613  new_block = true;
1614  }
1615  foundword:
1616  // Update prev_word_best_choice pointer.
1617  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
1618  *page_res->prev_word_best_choice =
1619  (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
1620  }
1621  return word_res;
1622 }
1624 /*************************************************************************
1625  * PAGE_RES_IT::restart_row()
1626  *
1627  * Move to the beginning (leftmost word) of the current row.
1628  *************************************************************************/
// NOTE(review): the signature (listing line 1629) is absent from this extract.
// Remembers the current row, restarts from the beginning of the page and
// steps forward until the iterator is back in that row, so the cost grows
// with the number of words that precede the row on the page.
// Returns nullptr without moving when there is no current row.
1630  ROW_RES *row = this->row();
1631  if (!row) return nullptr;
1632  for (restart_page(); this->row() != row; forward()) {
1633  // pass
1634  }
1635  return word();
1636 }
1638 /*************************************************************************
1639  * PAGE_RES_IT::forward_paragraph
1640  *
1641  * Move to the beginning of the next paragraph, allowing empty blocks.
1642  *************************************************************************/
1643 
// NOTE(review): the signature (listing line 1644) and the closing brace
// (listing line 1651) are absent from this extract.
// Keeps stepping while the next word is in the same block and its row belongs
// to the same paragraph as the current row; the final step then lands on the
// first word beyond that run.
// NOTE(review): row_res->row is dereferenced without a null check, unlike
// next_row_res - confirm row_res cannot be null when this is called.
1645  while (block_res == next_block_res &&
1646  (next_row_res != nullptr && next_row_res->row != nullptr &&
1647  row_res->row->para() == next_row_res->row->para())) {
1648  internal_forward(false, true);
1649  }
1650  return internal_forward(false, true);
1652 
1653 /*************************************************************************
1654  * PAGE_RES_IT::forward_block
1655  *
1656  * Move to the beginning of the next block, allowing empty blocks.
1657  *************************************************************************/
// NOTE(review): the signature (listing lines 1658-1659) is absent from this
// extract.
// Keeps stepping while the next word is still in the current block; the final
// step then moves onto the first position of the following block.
1660  while (block_res == next_block_res) {
1661  internal_forward(false, true);
1662  }
1663  return internal_forward(false, true);
1664 }
1665 
// NOTE(review): the signature (listing line 1666) is absent from this extract.
// Adds the current word's character count (the reject_map length) and its
// reject count to the running totals of the page, block and row.
1667  int16_t chars_in_word;
1668  int16_t rejects_in_word = 0;
1669 
1670  chars_in_word = word_res->reject_map.length ();
1671  page_res->char_count += chars_in_word;
1672  block_res->char_count += chars_in_word;
1673  row_res->char_count += chars_in_word;
1674 
1675  rejects_in_word = word_res->reject_map.reject_count ();
1676 
1677  page_res->rej_count += rejects_in_word;
1678  block_res->rej_count += rejects_in_word;
1679  row_res->rej_count += rejects_in_word;
// A word whose every character is rejected also counts towards the row's
// whole-word reject total.
1680  if (chars_in_word == rejects_in_word)
1681  row_res->whole_word_rej_count += rejects_in_word;
1682 }
WERD_RES::done
bool done
Definition: pageres.h:299
TBOX
Definition: cleanapi_test.cc:19
W_SCRIPT_IS_LATIN
Special case latin for y. splitting.
Definition: werd.h:50
WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:845
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
GenericVector::remove
void remove(int index)
Definition: genericvector.h:765
WERD_RES::FakeWordFromRatings
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:894
WERD_RES::ComputeAdaptionThresholds
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:557
TWERD::MergeBlobs
void MergeBlobs(int start, int end)
Definition: blobs.cpp:870
C_BLOB::FakeBlob
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:236
MATRIX::ConsumeAndMakeBigger
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:56
C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:247
WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1042
pdblock.h
BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:115
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
pageres.h
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:114
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
ROW::descenders
float descenders() const
Definition: ocrrow.h:84
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:138
WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:920
WERD::set_script_id
void set_script_id(int id)
Definition: werd.h:103
WERD_RES::PiecesAllNatural
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1074
W_REP_CHAR
repeated character
Definition: werd.h:52
BLOCK_RES::font_class
int16_t font_class
Definition: pageres.h:116
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
WERD_RES::AlternativeChoiceAdjustmentsWorseThan
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:435
WERD_RES::DebugTopChoice
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:495
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
tesseract::BoxWord::CopyFromNormalized
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56
TWERD
Definition: blobs.h:416
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:600
WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:189
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
GenericVector::insert
void insert(const T &t, int index)
Definition: genericvector.h:750
BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:115
PAGE_RES_IT::forward_paragraph
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1637
WERD_RES::odd_size
bool odd_size
Definition: pageres.h:301
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:195
PAGE_RES::PAGE_RES
PAGE_RES()
Definition: pageres.h:99
WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:755
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
WERD_RES::BothSpaces
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1054
WERD_CHOICE::TotalOfStates
int TotalOfStates() const
Definition: ratngs.cpp:713
PermuterType
PermuterType
Definition: ratngs.h:230
WERD_RES::ConditionalBlobMerge
bool ConditionalBlobMerge(std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> class_cb, std::function< bool(const TBOX &, const TBOX &)> box_cb)
Definition: pageres.cpp:935
BlamerBundle::SetupNormTruthWord
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151
FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:182
WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
tesseract::Tesseract
Definition: tesseractclass.h:172
W_SCRIPT_HAS_XHEIGHT
x-height concept makes sense.
Definition: werd.h:49
MATRIX_COORD::Valid
bool Valid(const MATRIX &m) const
Definition: matrix.h:614
WERD_RES::BothQuotes
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1003
MATRIX
Definition: matrix.h:574
tesseract::PointerVector< WERD_RES >
kMaxWordGapRatio
const double kMaxWordGapRatio
Definition: pageres.cpp:59
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
WERD_RES::combination
bool combination
Definition: pageres.h:333
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
STRING
Definition: strngs.h:45
WERD_RES::x_height
float x_height
Definition: pageres.h:310
polyblk.h
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:303
WERD_RES
Definition: pageres.h:160
tesseract::PointerVector::clear
void clear()
Definition: genericvector.h:490
tesseract::OEM_LSTM_ONLY
Definition: publictypes.h:267
MATRIX::IncreaseBandSize
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:47
WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:761
WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1062
C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:307
WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:861
WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:746
tesseract::BoxWord::ClipToOriginalWord
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92
blobs.h
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
TWERD::ComputeBoundingBoxes
void ComputeBoundingBoxes()
Definition: blobs.cpp:853
WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:339
BLOCK_RES
Definition: pageres.h:110
PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:78
C_BLOB
Definition: stepblob.h:36
WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:831
GenericVector::back
T & back() const
Definition: genericvector.h:728
WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:305
TBOX::height
int16_t height() const
Definition: rect.h:107
SEAM
Definition: seam.h:36
GenericVector::move
void move(GenericVector< T > *from)
Definition: genericvector.h:1087
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1209
WERD_RES::ep_choice
WERD_CHOICE * ep_choice
Definition: pageres.h:287
BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:204
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
GENERIC_2D_ARRAY::delete_matrix_pointers
void delete_matrix_pointers()
Definition: matrix.h:454
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:273
WERD_RES::GetBlobsWidth
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:726
PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1469
ROW_RES::char_count
int32_t char_count
Definition: pageres.h:137
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:552
DENORM::block
const BLOCK * block() const
Definition: normalis.h:272
REJMAP::remove_pos
void remove_pos(int16_t pos)
Definition: rejctmap.cpp:308
TBOX::set_right
void set_right(int x)
Definition: rect.h:81
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
kWordrecMaxNumJoinChunks
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:52
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1623
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:713
PAGE_RES_IT::forward_block
WERD_RES * forward_block()
Definition: pageres.cpp:1651
WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:476
SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:59
BLOCK_RES::row_count
int16_t row_count
Definition: pageres.h:117
tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:83
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:298
WERD::space
uint8_t space()
Definition: werd.h:98
WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:616
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868
WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:414
SEAM::PrepareToInsertSeam
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:74
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
TBOX::null_box
bool null_box() const
Definition: rect.h:49
ROW_RES::row
ROW * row
Definition: pageres.h:136
WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:304
WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:304
WERD_RES::Clear
void Clear()
Definition: pageres.cpp:1090
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1100
WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:312
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
ROW::ascenders
float ascenders() const
Definition: ocrrow.h:81
UNICHAR_SPACE
Definition: unicharset.h:34
W_EOL
end of line
Definition: werd.h:47
BLOCK_RES::font_assigned
bool font_assigned
Definition: pageres.h:119
ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:139
publictypes.h
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
stepblob.h
PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1436
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
WERD_CHOICE::adjust_factor
float adjust_factor() const
Definition: ratngs.h:294
WERD_CHOICE::UpdateStateForSplit
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:701
TBOX::width
int16_t width() const
Definition: rect.h:114
UNICHARSET
Definition: unicharset.h:145
BlamerBundle::ClearResults
void ClearResults()
Definition: blamer.h:190
UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:894
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:876
TWERD::PolygonalCopy
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:774
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
start_seam_list
void start_seam_list(TWERD *word, GenericVector< SEAM * > *seam_array)
Definition: seam.cpp:261
WERD_RES::FilterWordChoices
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:509
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
SEAM::BreakPieces
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:186
CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
TWERD::BLNormalize
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:788
tesseract
Definition: baseapi.h:65
PAGE_RES::prev_word_best_choice
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:82
WERD_RES::CopySimpleFields
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:247
ELIST_LINK::operator=
void operator=(const ELIST_LINK &)
Definition: elst.h:134
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:854
WERD_RES::SetupBlobWidthsAndGaps
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:396
tprintf.h
TOP_CHOICE_PERM
Definition: ratngs.h:233
WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:208
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:448
GenericVector< int >
kMaxWordSizeRatio
const double kMaxWordSizeRatio
Definition: pageres.cpp:55
BLOCK_RES::row_res_list
ROW_RES_LIST row_res_list
Definition: pageres.h:122
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
PAGE_RES_IT
Definition: pageres.h:668
WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:654
WERD_RES::caps_height
float caps_height
Definition: pageres.h:311
WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:454
WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:298
WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1013
tesseract::BoxWord::InsertBox
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:148
W_INVERSE
white on black
Definition: werd.h:55
WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:736
WERD_RES::SetupWordScript
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:380
WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:804
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:348
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
count
int count(LIST var_list)
Definition: oldlist.cpp:79
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
WERD_RES::operator=
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:184
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
MATRIX_COORD
Definition: matrix.h:604
TBLOB
Definition: blobs.h:282
ROW_RES
Definition: pageres.h:133
WERD_RES::blob_row
ROW * blob_row
Definition: pageres.h:191
WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:274
WERD_CHOICE::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:625
ocrrow.h
WERD_RES::SetupBlamerBundle
void SetupBlamerBundle()
Definition: pageres.cpp:389
WERD
Definition: werd.h:55
PAGE_RES_IT::cmp
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1141
BLOCK_RES::block
BLOCK * block
Definition: pageres.h:113
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
TBOX::left
int16_t left() const
Definition: rect.h:71
ROW
Definition: ocrrow.h:35
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
ocrblock.h
MATRIX_COORD::col
int col
Definition: matrix.h:632
WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:873
GenericVector::clear
void clear()
Definition: genericvector.h:857
ROW_RES::word_res_list
WERD_RES_LIST word_res_list
Definition: pageres.h:140
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:306
TBOX::right
int16_t right() const
Definition: rect.h:78
BlamerBundle::CopyResults
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:211
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1329
WERD_RES::correct_text
GenericVector< STRING > correct_text
Definition: pageres.h:283
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
WERD_RES::WERD_RES
WERD_RES()=default
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
ROW_RES::ROW_RES
ROW_RES()=default
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_RES::blob_gaps
GenericVector< int > blob_gaps
Definition: pageres.h:213
errcode.h
PAGE_RES::Init
void Init()
Definition: pageres.h:91
POLY_BLOCK
Definition: polyblk.h:26
WERD_RES::ClearWordChoices
void ClearWordChoices()
Definition: pageres.cpp:1125
seam.h
WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:335
MATRIX_COORD::row
int row
Definition: matrix.h:633
WERD_RES::word
WERD * word
Definition: pageres.h:180
WERD_RES::guessed_caps_ht
bool guessed_caps_ht
Definition: pageres.h:308
WERD_RES::BothHyphens
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1025
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
SEAM::JoinPieces
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:208
WERD_RES::MergeAdjacentBlobs
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:969
WERD_CHOICE::remove_unichar_id
void remove_unichar_id(int index)
Definition: ratngs.h:472
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::BoxWord::MergeBoxes
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:791
BLOCK_RES::BLOCK_RES
BLOCK_RES()=default
WERD_RES::HyphenBoxesOverlap
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1036
BlamerBundle
Definition: blamer.h:103
blamer.h
GenericVector::size
int size() const
Definition: genericvector.h:71
ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:54
WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:307
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
PAGE_RES_IT::start_page
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1495
BLOB_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:139
ROW::body_size
float body_size() const
Definition: ocrrow.h:72
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
kMaxLineSizeRatio
const double kMaxLineSizeRatio
Definition: pageres.cpp:57
boxword.h
WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:89
BLOCK_RES::x_height
float x_height
Definition: pageres.h:118
PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1518
WERD_CHOICE::blob_choices
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:292
ELISTIZE
#define ELISTIZE(CLASSNAME)
Definition: elst.h:919
WERD_RES::small_caps
bool small_caps
Definition: pageres.h:300
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:884
W_BOL
start of line
Definition: werd.h:46
TBOX::set_left
void set_left(int x)
Definition: rect.h:74
WERD_RES::~WERD_RES
~WERD_RES()
Definition: pageres.cpp:1086
WERD_RES::ClearRatings
void ClearRatings()
Definition: pageres.cpp:1133
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
TBOX
Definition: rect.h:33
tesseract::BoxWord
Definition: boxword.h:36