tesseract  4.0.0-1-g2a2b
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4  * and an iterator class to iterate over the words.
5  * Main purposes:
6  * Easy way to iterate over the words without a 3-nested loop.
7  * Holds data used during word recognition.
8  * Holds information about alternative spacing paths.
9  * Author: Phil Cheatle
10  * Created: Tue Sep 22 08:42:49 BST 1992
11  *
12  * (C) Copyright 1992, Hewlett-Packard Ltd.
13  ** Licensed under the Apache License, Version 2.0 (the "License");
14  ** you may not use this file except in compliance with the License.
15  ** You may obtain a copy of the License at
16  ** http://www.apache.org/licenses/LICENSE-2.0
17  ** Unless required by applicable law or agreed to in writing, software
18  ** distributed under the License is distributed on an "AS IS" BASIS,
19  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  ** See the License for the specific language governing permissions and
21  ** limitations under the License.
22  *
23  **********************************************************************/
24 
25 #include "pageres.h"
26 #include <cassert> // for assert
27 #include <cstdint> // for INT32_MAX
28 #include <cstring> // for strlen
29 #include "blamer.h" // for BlamerBundle
30 #include "blobs.h" // for TWERD, TBLOB
31 #include "boxword.h" // for BoxWord
32 #include "errcode.h" // for ASSERT_HOST
33 #include "host.h" // for TRUE, FALSE
34 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
35 #include "ocrrow.h" // for ROW, ROW_IT
36 #include "pdblock.h" // for PDBLK
37 #include "polyblk.h" // for POLY_BLOCK
38 #include "publictypes.h" // for OcrEngineMode, OEM_LSTM_ONLY
39 #include "seam.h" // for SEAM, start_seam_list
40 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
41 #include "tesscallback.h" // for NewPermanentTessCallback, TessResultCallback2
42 #include "tprintf.h" // for tprintf
43 
44 struct Pix;
45 
48 
49 // Gain factor for computing thresholds that determine the ambiguity of a word.
50 static const double kStopperAmbiguityThresholdGain = 8.0;
51 // Constant offset for computing thresholds that determine the ambiguity of a
52 // word.
53 static const double kStopperAmbiguityThresholdOffset = 1.5;
54 // Max number of broken pieces to associate.
56 // Max ratio of word box height to line size to allow it to be processed as
57 // a line with other words.
58 const double kMaxWordSizeRatio = 1.25;
59 // Max ratio of line box height to line size to allow a new word to be added.
60 const double kMaxLineSizeRatio = 1.25;
61 // Max ratio of word gap to line size to allow a new word to be added.
62 const double kMaxWordGapRatio = 2.0;
63 
64 // Computes and returns a threshold of certainty difference used to determine
65 // which words to keep, based on the adjustment factors of the two words.
66 // TODO(rays) This is horrible. Replace with an enhance params training model.
67 static double StopperAmbigThreshold(double f1, double f2) {
68  return (f2 - f1) * kStopperAmbiguityThresholdGain -
69  kStopperAmbiguityThresholdOffset;
70 }
71 
72 /*************************************************************************
73  * PAGE_RES::PAGE_RES
74  *
75  * Constructor for page results
76  *************************************************************************/
78  bool merge_similar_words,
79  BLOCK_LIST *the_block_list,
80  WERD_CHOICE **prev_word_best_choice_ptr) {
81  Init();
82  BLOCK_IT block_it(the_block_list);
83  BLOCK_RES_IT block_res_it(&block_res_list);
84  for (block_it.mark_cycle_pt();
85  !block_it.cycled_list(); block_it.forward()) {
86  block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
87  block_it.data()));
88  }
89  prev_word_best_choice = prev_word_best_choice_ptr;
90 }
91 
92 /*************************************************************************
93  * BLOCK_RES::BLOCK_RES
94  *
95  * Constructor for BLOCK results
96  *************************************************************************/
97 
98 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
99  ROW_IT row_it (the_block->row_list ());
100  ROW_RES_IT row_res_it(&row_res_list);
101 
102  char_count = 0;
103  rej_count = 0;
104  font_class = -1; //not assigned
105  x_height = -1.0;
106  font_assigned = false;
107  bold = false;
108  italic = false;
109  row_count = 0;
110 
111  block = the_block;
112 
113  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
114  row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
115  }
116 }
117 
118 /*************************************************************************
119  * ROW_RES::ROW_RES
120  *
121  * Constructor for ROW results
122  *************************************************************************/
123 
124 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
125  WERD_IT word_it(the_row->word_list());
126  WERD_RES_IT word_res_it(&word_res_list);
127  WERD_RES *combo = nullptr; // current combination of fuzzies
128  WERD *copy_word;
129 
130  char_count = 0;
131  rej_count = 0;
133 
134  row = the_row;
135  bool add_next_word = false;
136  TBOX union_box;
137  float line_height = the_row->x_height() + the_row->ascenders() -
138  the_row->descenders();
139  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
140  WERD_RES* word_res = new WERD_RES(word_it.data());
141  word_res->x_height = the_row->x_height();
142  if (add_next_word) {
143  ASSERT_HOST(combo != nullptr);
144  // We are adding this word to the combination.
145  word_res->part_of_combo = TRUE;
146  combo->copy_on(word_res);
147  } else if (merge_similar_words) {
148  union_box = word_res->word->bounding_box();
149  add_next_word = !word_res->word->flag(W_REP_CHAR) &&
150  union_box.height() <= line_height * kMaxWordSizeRatio;
151  word_res->odd_size = !add_next_word;
152  }
153  WERD* next_word = word_it.data_relative(1);
154  if (merge_similar_words) {
155  if (add_next_word && !next_word->flag(W_REP_CHAR)) {
156  // Next word will be added on if all of the following are true:
157  // Not a rep char.
158  // Box height small enough.
159  // Union box height small enough.
160  // Horizontal gap small enough.
161  TBOX next_box = next_word->bounding_box();
162  int prev_right = union_box.right();
163  union_box += next_box;
164  if (next_box.height() > line_height * kMaxWordSizeRatio ||
165  union_box.height() > line_height * kMaxLineSizeRatio ||
166  next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
167  add_next_word = false;
168  }
169  }
170  next_word->set_flag(W_FUZZY_NON, add_next_word);
171  } else {
172  add_next_word = next_word->flag(W_FUZZY_NON);
173  }
174  if (add_next_word) {
175  if (combo == nullptr) {
176  copy_word = new WERD;
177  *copy_word = *(word_it.data()); // deep copy
178  combo = new WERD_RES(copy_word);
179  combo->x_height = the_row->x_height();
180  combo->combination = TRUE;
181  word_res_it.add_to_end(combo);
182  }
183  word_res->part_of_combo = TRUE;
184  } else {
185  combo = nullptr;
186  }
187  word_res_it.add_to_end(word_res);
188  }
189 }
190 
191 
193  this->ELIST_LINK::operator=(source);
194  Clear();
195  if (source.combination) {
196  word = new WERD;
197  *word = *(source.word); // deep copy
198  } else {
199  word = source.word; // pt to same word
200  }
201  if (source.bln_boxes != nullptr)
202  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
203  if (source.chopped_word != nullptr)
204  chopped_word = new TWERD(*source.chopped_word);
205  if (source.rebuild_word != nullptr)
206  rebuild_word = new TWERD(*source.rebuild_word);
207  // TODO(rays) Do we ever need to copy the seam_array?
208  blob_row = source.blob_row;
209  denorm = source.denorm;
210  if (source.box_word != nullptr)
211  box_word = new tesseract::BoxWord(*source.box_word);
212  best_state = source.best_state;
213  correct_text = source.correct_text;
214  blob_widths = source.blob_widths;
215  blob_gaps = source.blob_gaps;
216  // None of the uses of operator= require the ratings matrix to be copied,
217  // so don't as it would be really slow.
218 
219  // Copy the cooked choices.
220  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
221  WERD_CHOICE_IT wc_dest_it(&best_choices);
222  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
223  const WERD_CHOICE *choice = wc_it.data();
224  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
225  }
226  if (!wc_dest_it.empty()) {
227  wc_dest_it.move_to_first();
228  best_choice = wc_dest_it.data();
229  } else {
230  best_choice = nullptr;
231  }
232 
233  if (source.raw_choice != nullptr) {
234  raw_choice = new WERD_CHOICE(*source.raw_choice);
235  } else {
236  raw_choice = nullptr;
237  }
238  if (source.ep_choice != nullptr) {
239  ep_choice = new WERD_CHOICE(*source.ep_choice);
240  } else {
241  ep_choice = nullptr;
242  }
243  reject_map = source.reject_map;
244  combination = source.combination;
245  part_of_combo = source.part_of_combo;
246  CopySimpleFields(source);
247  if (source.blamer_bundle != nullptr) {
248  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
249  }
250  return *this;
251 }
252 
253 // Copies basic fields that don't involve pointers that might be useful
254 // to copy when making one WERD_RES from another.
256  tess_failed = source.tess_failed;
257  tess_accepted = source.tess_accepted;
259  done = source.done;
261  small_caps = source.small_caps;
262  odd_size = source.odd_size;
263  italic = source.italic;
264  bold = source.bold;
265  fontinfo = source.fontinfo;
266  fontinfo2 = source.fontinfo2;
269  x_height = source.x_height;
270  caps_height = source.caps_height;
272  guessed_x_ht = source.guessed_x_ht;
274  reject_spaces = source.reject_spaces;
275  uch_set = source.uch_set;
276  tesseract = source.tesseract;
277 }
278 
279 // Initializes a blank (default constructed) WERD_RES from one that has
280 // already been recognized.
281 // Use SetupFor*Recognition afterwards to complete the setup and make
282 // it ready for a retry recognition.
284  word = source.word;
285  CopySimpleFields(source);
286  if (source.blamer_bundle != nullptr) {
287  blamer_bundle = new BlamerBundle();
289  }
290 }
291 
292 // Sets up the members used in recognition: bln_boxes, chopped_word,
293 // seam_array, denorm. Returns false if
294 // the word is empty and sets up fake results. If use_body_size is
295 // true and row->body_size is set, then body_size will be used for
296 // blob normalization instead of xheight + ascrise. This flag is for
297 // those languages that are using CJK pitch model and thus it has to
298 // be true if and only if tesseract->textord_use_cjk_fp_model is
299 // true.
300 // If allow_detailed_fx is true, the feature extractor will receive fine
301 // precision outline information, allowing smoother features and better
302 // features on low resolution images.
303 // The norm_mode_hint sets the default mode for normalization in absence
304 // of any of the above flags.
305 // norm_box is used to override the word bounding box to determine the
306 // normalization scale and offset.
307 // Returns false if the word is empty and sets up fake results.
308 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
309  tesseract::Tesseract* tess, Pix* pix,
310  int norm_mode,
311  const TBOX* norm_box,
312  bool numeric_mode,
313  bool use_body_size,
314  bool allow_detailed_fx,
315  ROW *row, const BLOCK* block) {
316  tesseract::OcrEngineMode norm_mode_hint =
317  static_cast<tesseract::OcrEngineMode>(norm_mode);
318  tesseract = tess;
319  POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
320  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
321  word->cblob_list()->empty()) ||
322  (pb != nullptr && !pb->IsText())) {
323  // Empty words occur when all the blobs have been moved to the rej_blobs
324  // list, which seems to occur frequently in junk.
325  SetupFake(unicharset_in);
326  word->set_flag(W_REP_CHAR, false);
327  return false;
328  }
329  ClearResults();
330  SetupWordScript(unicharset_in);
331  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
332  float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
333  ? row->body_size() : x_height;
334  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
335  word_xheight, baseline_shift, numeric_mode,
336  norm_mode_hint, norm_box, &denorm);
337  blob_row = row;
338  SetupBasicsFromChoppedWord(unicharset_in);
340  int num_blobs = chopped_word->NumBlobs();
341  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
342  tess_failed = false;
343  return true;
344 }
345 
346 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
347 // accumulators from a made chopped word. We presume the fields are already
348 // empty.
354 }
355 
356 // Sets up the members used in recognition for an empty recognition result:
357 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
358 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
359  ClearResults();
360  SetupWordScript(unicharset_in);
361  chopped_word = new TWERD;
362  rebuild_word = new TWERD;
365  int blob_count = word->cblob_list()->length();
366  if (blob_count > 0) {
367  BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
368  // For non-text blocks, just pass any blobs through to the box_word
369  // and call the word failed with a fake classification.
370  C_BLOB_IT b_it(word->cblob_list());
371  int blob_id = 0;
372  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
373  TBOX box = b_it.data()->bounding_box();
374  box_word->InsertBox(box_word->length(), box);
375  fake_choices[blob_id++] = new BLOB_CHOICE;
376  }
377  FakeClassifyWord(blob_count, fake_choices);
378  delete [] fake_choices;
379  } else {
380  WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in);
381  word->make_bad();
383  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
384  LogNewCookedChoice(1, false, word);
385  }
386  tess_failed = true;
387  done = true;
388 }
389 
391  uch_set = &uch;
392  int script = uch.default_sid();
393  word->set_script_id(script);
395  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
396 }
397 
398 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
400  if (blamer_bundle != nullptr) {
402  }
403 }
404 
405 // Computes the blob_widths and blob_gaps from the chopped_word.
408  blob_gaps.truncate(0);
409  int num_blobs = chopped_word->NumBlobs();
410  for (int b = 0; b < num_blobs; ++b) {
411  TBLOB *blob = chopped_word->blobs[b];
412  TBOX box = blob->bounding_box();
413  blob_widths.push_back(box.width());
414  if (b + 1 < num_blobs) {
416  chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
417  }
418  }
419 }
420 
421 // Updates internal data to account for a new SEAM (chop) at the given
422 // blob_number. Fixes the ratings matrix and states in the choices, as well
423 // as the blob widths and gaps.
424 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
425  // Insert the seam into the SEAMS array.
426  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
427  seam_array.insert(seam, blob_number);
428  if (ratings != nullptr) {
429  // Expand the ratings matrix.
430  ratings = ratings->ConsumeAndMakeBigger(blob_number);
431  // Fix all the segmentation states.
432  if (raw_choice != nullptr)
433  raw_choice->UpdateStateForSplit(blob_number);
434  WERD_CHOICE_IT wc_it(&best_choices);
435  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
436  WERD_CHOICE* choice = wc_it.data();
437  choice->UpdateStateForSplit(blob_number);
438  }
440  }
441 }
442 
443 // Returns true if all the word choices except the first have adjust_factors
444 // worse than the given threshold.
446  // The choices are not changed by this iteration.
447  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
448  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
449  WERD_CHOICE* choice = wc_it.data();
450  if (choice->adjust_factor() <= threshold)
451  return false;
452  }
453  return true;
454 }
455 
456 // Returns true if the current word is ambiguous (by number of answers or
457 // by dangerous ambigs.)
459  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
460 }
461 
462 // Returns true if the ratings matrix size matches the sum of each of the
463 // segmentation states.
465  int ratings_dim = ratings->dimension();
466  if (raw_choice->TotalOfStates() != ratings_dim) {
467  tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
468  raw_choice->TotalOfStates(), ratings_dim);
469  return false;
470  }
471  WERD_CHOICE_IT it(&best_choices);
472  int index = 0;
473  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
474  WERD_CHOICE* choice = it.data();
475  if (choice->TotalOfStates() != ratings_dim) {
476  tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
477  index, choice->TotalOfStates(), ratings_dim);
478  return false;
479  }
480  }
481  return true;
482 }
483 
484 // Prints a list of words found if debug is true or the word result matches
485 // the word_to_debug.
486 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
487  if (debug ||
488  (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
489  best_choice->unichar_string() == STRING(word_to_debug))) {
490  if (raw_choice != nullptr)
491  raw_choice->print("\nBest Raw Choice");
492 
493  WERD_CHOICE_IT it(&best_choices);
494  int index = 0;
495  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
496  WERD_CHOICE* choice = it.data();
497  STRING label;
498  label.add_str_int("\nCooked Choice #", index);
499  choice->print(label.string());
500  }
501  }
502 }
503 
504 // Prints the top choice along with the accepted/done flags.
505 void WERD_RES::DebugTopChoice(const char* msg) const {
506  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
508  if (best_choice == nullptr)
509  tprintf("<Null choice>\n");
510  else
511  best_choice->print(msg);
512 }
513 
514 // Removes from best_choices all choices which are not within a reasonable
515 // range of the best choice.
516 // TODO(rays) incorporate the information used here into the params training
517 // re-ranker, in place of this heuristic that is based on the previous
518 // adjustment factor.
519 void WERD_RES::FilterWordChoices(int debug_level) {
520  if (best_choice == nullptr || best_choices.singleton())
521  return;
522 
523  if (debug_level >= 2)
524  best_choice->print("\nFiltering against best choice");
525  WERD_CHOICE_IT it(&best_choices);
526  int index = 0;
527  for (it.forward(); !it.at_first(); it.forward(), ++index) {
528  WERD_CHOICE* choice = it.data();
529  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
530  choice->adjust_factor());
531  // i, j index the blob choice in choice, best_choice.
532  // chunk is an index into the chopped_word blobs (AKA chunks).
533  // Since the two words may use different segmentations of the chunks, we
534  // iterate over the chunks to find out whether a comparable blob
535  // classification is much worse than the best result.
536  int i = 0, j = 0, chunk = 0;
537  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
538  // and best_chunk are the indices of the first chunk in the NEXT blob,
539  // i.e. we don't have to increment i, j while chunk < choice_chunk and
540  // best_chunk respectively.
541  int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
542  while (i < choice->length() && j < best_choice->length()) {
543  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
544  choice->certainty(i) - best_choice->certainty(j) < threshold) {
545  if (debug_level >= 2) {
546  choice->print("WorstCertaintyDiffWorseThan");
547  tprintf(
548  "i %d j %d Choice->Blob[i].Certainty %.4g"
549  " WorstOtherChoiceCertainty %g Threshold %g\n",
550  i, j, choice->certainty(i), best_choice->certainty(j), threshold);
551  tprintf("Discarding bad choice #%d\n", index);
552  }
553  delete it.extract();
554  break;
555  }
556  ++chunk;
557  // If needed, advance choice_chunk to keep up with chunk.
558  while (choice_chunk < chunk && ++i < choice->length())
559  choice_chunk += choice->state(i);
560  // If needed, advance best_chunk to keep up with chunk.
561  while (best_chunk < chunk && ++j < best_choice->length())
562  best_chunk += best_choice->state(j);
563  }
564  }
565 }
566 
567 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
568  float min_rating,
569  float max_rating,
570  float rating_margin,
571  float* thresholds) {
572  int chunk = 0;
573  int end_chunk = best_choice->state(0);
574  int end_raw_chunk = raw_choice->state(0);
575  int raw_blob = 0;
576  for (int i = 0; i < best_choice->length(); i++, thresholds++) {
577  float avg_rating = 0.0f;
578  int num_error_chunks = 0;
579 
580  // For each chunk in best choice blob i, count non-matching raw results.
581  while (chunk < end_chunk) {
582  if (chunk >= end_raw_chunk) {
583  ++raw_blob;
584  end_raw_chunk += raw_choice->state(raw_blob);
585  }
586  if (best_choice->unichar_id(i) !=
587  raw_choice->unichar_id(raw_blob)) {
588  avg_rating += raw_choice->certainty(raw_blob);
589  ++num_error_chunks;
590  }
591  ++chunk;
592  }
593 
594  if (num_error_chunks > 0) {
595  avg_rating /= num_error_chunks;
596  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
597  } else {
598  *thresholds = max_rating;
599  }
600 
601  if (*thresholds > max_rating)
602  *thresholds = max_rating;
603  if (*thresholds < min_rating)
604  *thresholds = min_rating;
605  }
606 }
607 
608 // Saves a copy of the word_choice if it has the best unadjusted rating.
609 // Returns true if the word_choice was the new best.
611  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
612  delete raw_choice;
613  raw_choice = new WERD_CHOICE(*word_choice);
615  return true;
616  }
617  return false;
618 }
619 
620 // Consumes word_choice by adding it to best_choices, (taking ownership) if
621 // the certainty for word_choice is within some distance of the best choice in
622 // best_choices, or by deleting the word_choice and returning false.
623 // The best_choices list is kept in sorted order by rating. Duplicates are
624 // removed, and the list is kept no longer than max_num_choices in length.
625 // Returns true if the word_choice is still a valid pointer.
626 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
627  WERD_CHOICE* word_choice) {
628  if (best_choice != nullptr) {
629  // Throw out obviously bad choices to save some work.
630  // TODO(rays) Get rid of this! This piece of code produces different
631  // results according to the order in which words are found, which is an
632  // undesirable behavior. It would be better to keep all the choices and
633  // prune them later when more information is available.
634  float max_certainty_delta =
635  StopperAmbigThreshold(best_choice->adjust_factor(),
636  word_choice->adjust_factor());
637  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
638  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
639  if (word_choice->certainty() - best_choice->certainty() <
640  max_certainty_delta) {
641  if (debug) {
642  STRING bad_string;
643  word_choice->string_and_lengths(&bad_string, nullptr);
644  tprintf("Discarding choice \"%s\" with an overly low certainty"
645  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
646  bad_string.string(), word_choice->certainty(),
648  max_certainty_delta + best_choice->certainty());
649  }
650  delete word_choice;
651  return false;
652  }
653  }
654 
655  // Insert in the list in order of increasing rating, but knock out worse
656  // string duplicates.
657  WERD_CHOICE_IT it(&best_choices);
658  const STRING& new_str = word_choice->unichar_string();
659  bool inserted = false;
660  int num_choices = 0;
661  if (!it.empty()) {
662  do {
663  WERD_CHOICE* choice = it.data();
664  if (choice->rating() > word_choice->rating() && !inserted) {
665  // Time to insert.
666  it.add_before_stay_put(word_choice);
667  inserted = true;
668  if (num_choices == 0)
669  best_choice = word_choice; // This is the new best.
670  ++num_choices;
671  }
672  if (choice->unichar_string() == new_str) {
673  if (inserted) {
674  // New is better.
675  delete it.extract();
676  } else {
677  // Old is better.
678  if (debug) {
679  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
680  new_str.string(), word_choice->rating(), choice->rating());
681  }
682  delete word_choice;
683  return false;
684  }
685  } else {
686  ++num_choices;
687  if (num_choices > max_num_choices)
688  delete it.extract();
689  }
690  it.forward();
691  } while (!it.at_first());
692  }
693  if (!inserted && num_choices < max_num_choices) {
694  it.add_to_end(word_choice);
695  inserted = true;
696  if (num_choices == 0)
697  best_choice = word_choice; // This is the new best.
698  }
699  if (debug) {
700  if (inserted)
701  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
702  else
703  tprintf("Poor");
704  word_choice->print(" Word Choice");
705  }
706  if (!inserted) {
707  delete word_choice;
708  return false;
709  }
710  return true;
711 }
712 
713 
// Transfers ownership of the object pointed to by *src into *dest. Whatever
// *dest previously pointed to is deleted first, and *src is set to nullptr
// afterwards so that only *dest refers to the moved object.
template<class T> static void MovePointerData(T** dest, T**src) {
  T*& target = *dest;
  T*& origin = *src;
  delete target;
  target = origin;
  origin = nullptr;
}
721 
722 // Prints a brief list of all the best choices.
724  STRING alternates_str;
725  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
726  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
727  if (!it.at_first()) alternates_str += "\", \"";
728  alternates_str += it.data()->unichar_string();
729  }
730  tprintf("Alternates for \"%s\": {\"%s\"}\n",
731  best_choice->unichar_string().string(), alternates_str.string());
732 }
733 
734 // Returns the sum of the widths of the blob between start_blob and last_blob
735 // inclusive.
736 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
737  int result = 0;
738  for (int b = start_blob; b <= last_blob; ++b) {
739  result += blob_widths[b];
740  if (b < last_blob)
741  result += blob_gaps[b];
742  }
743  return result;
744 }
745 // Returns the width of a gap between the specified blob and the next one.
746 int WERD_RES::GetBlobsGap(int blob_index) {
747  if (blob_index < 0 || blob_index >= blob_gaps.size())
748  return 0;
749  return blob_gaps[blob_index];
750 }
751 
752 // Returns the BLOB_CHOICE corresponding to the given index in the
753 // best choice word taken from the appropriate cell in the ratings MATRIX.
754 // Borrowed pointer, so do not delete. May return nullptr if there is no
755 // BLOB_CHOICE matching the unichar_id at the given index.
757  if (index < 0 || index >= best_choice->length()) return nullptr;
758  BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
759  return FindMatchingChoice(best_choice->unichar_id(index), choices);
760 }
761 
762 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
763 // best choice word taken from the appropriate cell in the ratings MATRIX.
764 // Borrowed pointer, so do not delete.
765 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
766  return best_choice->blob_choices(index, ratings);
767 }
768 
769 // Moves the results fields from word to this. This takes ownership of all
770 // the data, so word can be destructed.
772  denorm = word->denorm;
773  blob_row = word->blob_row;
774  MovePointerData(&chopped_word, &word->chopped_word);
775  MovePointerData(&rebuild_word, &word->rebuild_word);
776  MovePointerData(&box_word, &word->box_word);
778  seam_array = word->seam_array;
779  word->seam_array.clear();
780  best_state.move(&word->best_state);
781  correct_text.move(&word->correct_text);
782  blob_widths.move(&word->blob_widths);
783  blob_gaps.move(&word->blob_gaps);
784  if (ratings != nullptr) ratings->delete_matrix_pointers();
785  MovePointerData(&ratings, &word->ratings);
786  best_choice = word->best_choice;
787  MovePointerData(&raw_choice, &word->raw_choice);
788  best_choices.clear();
789  WERD_CHOICE_IT wc_it(&best_choices);
790  wc_it.add_list_after(&word->best_choices);
791  reject_map = word->reject_map;
792  if (word->blamer_bundle != nullptr) {
793  assert(blamer_bundle != nullptr);
794  blamer_bundle->CopyResults(*(word->blamer_bundle));
795  }
797 }
798 
799 // Replace the best choice and rebuild box word.
800 // choice must be from the current best_choices list.
802  best_choice = choice;
804  SetupBoxWord();
805  // Make up a fake reject map of the right length to keep the
806  // rejection pass happy.
810 }
811 
812 // Builds the rebuild_word and sets the best_state from the chopped_word and
813 // the best_choice->state.
815  ASSERT_HOST(best_choice != nullptr);
816  delete rebuild_word;
817  rebuild_word = new TWERD;
818  if (seam_array.empty())
820  best_state.truncate(0);
821  int start = 0;
822  for (int i = 0; i < best_choice->length(); ++i) {
823  int length = best_choice->state(i);
824  best_state.push_back(length);
825  if (length > 1) {
827  start + length - 1);
828  }
829  TBLOB* blob = chopped_word->blobs[start];
830  rebuild_word->blobs.push_back(new TBLOB(*blob));
831  if (length > 1) {
833  start + length - 1);
834  }
835  start += length;
836  }
837 }
838 
839 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
840 // Also sets up the output box_word.
842  delete rebuild_word;
844  SetupBoxWord();
845  int word_len = box_word->length();
846  best_state.reserve(word_len);
847  correct_text.reserve(word_len);
848  for (int i = 0; i < word_len; ++i) {
851  }
852 }
853 
854 // Sets/replaces the box_word with one made from the rebuild_word.
856  delete box_word;
860 }
861 
862 // Sets up the script positions in the output best_choice using the best_choice
863 // to get the unichars, and the unicharset to get the target positions.
866 }
867 // Sets all the blobs in all the words (raw choice and best choices) to be
868 // the given position. (When a sub/superscript is recognized as a separate
869 // word, it falls victim to the rule that a whole word cannot be sub or
870 // superscript, so this function overrides that problem.)
873  WERD_CHOICE_IT wc_it(&best_choices);
874  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
875  wc_it.data()->SetAllScriptPositions(position);
876 }
877 
878 // Classifies the word with some already-calculated BLOB_CHOICEs.
879 // The choices are an array of blob_count pointers to BLOB_CHOICE,
880 // providing a single classifier result for each blob.
881 // The BLOB_CHOICEs are consumed and the word takes ownership.
882 // The number of blobs in the box_word must match blob_count.
883 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
884  // Setup the WERD_RES.
885  ASSERT_HOST(box_word != nullptr);
886  ASSERT_HOST(blob_count == box_word->length());
888  ClearRatings();
889  ratings = new MATRIX(blob_count, 1);
890  for (int c = 0; c < blob_count; ++c) {
891  BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
892  BLOB_CHOICE_IT choice_it(choice_list);
893  choice_it.add_after_then_move(choices[c]);
894  ratings->put(c, c, choice_list);
895  }
897  reject_map.initialise(blob_count);
898  best_state.init_to_size(blob_count, 1);
899  done = true;
900 }
901 
902 // Creates a WERD_CHOICE for the word using the top choices from the leading
903 // diagonal of the ratings matrix.
905  int num_blobs = ratings->dimension();
906  WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs);
907  word_choice->set_permuter(permuter);
908  for (int b = 0; b < num_blobs; ++b) {
909  UNICHAR_ID unichar_id = UNICHAR_SPACE;
910  float rating = INT32_MAX;
911  float certainty = -INT32_MAX;
912  BLOB_CHOICE_LIST* choices = ratings->get(b, b);
913  if (choices != nullptr && !choices->empty()) {
914  BLOB_CHOICE_IT bc_it(choices);
915  BLOB_CHOICE* choice = bc_it.data();
916  unichar_id = choice->unichar_id();
917  rating = choice->rating();
918  certainty = choice->certainty();
919  }
920  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
921  certainty);
922  }
923  LogNewRawChoice(word_choice);
924  // Ownership of word_choice taken by word here.
925  LogNewCookedChoice(1, false, word_choice);
926 }
927 
928 // Copies the best_choice strings to the correct_text for adaption/training.
931  ASSERT_HOST(best_choice != nullptr);
932  for (int i = 0; i < best_choice->length(); ++i) {
933  UNICHAR_ID choice_id = best_choice->unichar_id(i);
934  const char* blob_choice = uch_set->id_to_unichar(choice_id);
935  correct_text.push_back(STRING(blob_choice));
936  }
937 }
938 
939 // Merges 2 adjacent blobs in the result if the permanent callback
940 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
941 // callback box_cb is nullptr or returns true, setting the merged blob
942 // result to the class returned from class_cb.
943 // Returns true if anything was merged.
947  ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
948  bool modified = false;
949  for (int i = 0; i + 1 < best_choice->length(); ++i) {
950  UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
951  best_choice->unichar_id(i+1));
952  if (new_id != INVALID_UNICHAR_ID &&
953  (box_cb == nullptr || box_cb->Run(box_word->BlobBox(i),
954  box_word->BlobBox(i + 1)))) {
955  // Raw choice should not be fixed.
956  best_choice->set_unichar_id(new_id, i);
957  modified = true;
959  const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
960  if (!coord.Valid(*ratings)) {
961  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
962  }
963  BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
964  if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
965  // Insert a fake result.
966  BLOB_CHOICE* blob_choice = new BLOB_CHOICE;
967  blob_choice->set_unichar_id(new_id);
968  BLOB_CHOICE_IT bc_it(blob_choices);
969  bc_it.add_before_then_move(blob_choice);
970  }
971  }
972  }
973  delete class_cb;
974  delete box_cb;
975  return modified;
976 }
977 
978 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
979 // all the data to account for the change.
981  if (reject_map.length() == best_choice->length())
982  reject_map.remove_pos(index);
983  best_choice->remove_unichar_id(index + 1);
984  rebuild_word->MergeBlobs(index, index + 2);
985  box_word->MergeBoxes(index, index + 2);
986  if (index + 1 < best_state.length()) {
987  best_state[index] += best_state[index + 1];
988  best_state.remove(index + 1);
989  }
990 }
991 
992 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
993 // training data.
994 
// Helper for fix_quotes.
// Returns non-zero if the UTF-8 character that starts at signed_str and
// occupies length bytes is a single quote: the ASCII apostrophe or backtick,
// or one of the curved single quotes encoded as E2 80 98 / E2 80 99.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so that values >= 0x80 match even where plain
  // char is signed.
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Plain single-byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // Three-byte UTF-8 curved quotes; only the final byte differs.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
1011 
1012 // Callback helper for fix_quotes returns a double quote if both
1013 // arguments are quote, otherwise INVALID_UNICHAR_ID.
1015  const char *ch = uch_set->id_to_unichar(id1);
1016  const char *next_ch = uch_set->id_to_unichar(id2);
1017  if (is_simple_quote(ch, strlen(ch)) &&
1018  is_simple_quote(next_ch, strlen(next_ch)))
1019  return uch_set->unichar_to_id("\"");
1020  return INVALID_UNICHAR_ID;
1021 }
1022 
1023 // Change pairs of quotes to double quotes.
1025  if (!uch_set->contains_unichar("\"") ||
1027  return; // Don't create it if it is disallowed.
1028 
1031  nullptr);
1032 }
1033 
1034 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1035 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
1037  const char *ch = uch_set->id_to_unichar(id1);
1038  const char *next_ch = uch_set->id_to_unichar(id2);
1039  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1040  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
1041  return uch_set->unichar_to_id("-");
1042  return INVALID_UNICHAR_ID;
1043 }
1044 
1045 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1046 // (assuming both on the same textline, are in order and a chopped em dash.)
1047 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
1048  return box1.right() >= box2.left();
1049 }
1050 
1051 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1052 // Typically a long dash which has been segmented.
1054  if (!uch_set->contains_unichar("-") ||
1056  return; // Don't create it if it is disallowed.
1057 
1061 }
1062 
1063 // Callback helper for merge_tess_fails returns a space if both
1064 // arguments are space, otherwise INVALID_UNICHAR_ID.
1066  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
1067  return id1;
1068  else
1069  return INVALID_UNICHAR_ID;
1070 }
1071 
1072 // Change pairs of tess failures to a single one
1075  NewPermanentTessCallback(this, &WERD_RES::BothSpaces), nullptr)) {
1076  int len = best_choice->length();
1077  ASSERT_HOST(reject_map.length() == len);
1078  ASSERT_HOST(box_word->length() == len);
1079  }
1080 }
1081 
1082 // Returns true if the collection of count pieces, starting at start, are all
1083 // natural connected components, ie there are no real chops involved.
1084 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1085  // all seams must have no splits.
1086  for (int index = start; index < start + count - 1; ++index) {
1087  if (index >= 0 && index < seam_array.size()) {
1088  SEAM* seam = seam_array[index];
1089  if (seam != nullptr && seam->HasAnySplits()) return false;
1090  }
1091  }
1092  return true;
1093 }
1094 
1095 
1097  Clear();
1098 }
1099 
1101  tess_failed = false;
1102  tess_accepted = false;
1103  tess_would_adapt = false;
1104  done = false;
1106  small_caps = false;
1107  odd_size = false;
1108  italic = FALSE;
1109  bold = FALSE;
1110  // The fontinfos and tesseract count as non-pointers as they point to
1111  // data owned elsewhere.
1112  fontinfo = nullptr;
1113  fontinfo2 = nullptr;
1114  tesseract = nullptr;
1115  fontinfo_id_count = 0;
1116  fontinfo_id2_count = 0;
1117  x_height = 0.0;
1118  caps_height = 0.0;
1119  baseline_shift = 0.0f;
1120  space_certainty = 0.0f;
1121  guessed_x_ht = true;
1122  guessed_caps_ht = true;
1123  combination = false;
1124  part_of_combo = false;
1125  reject_spaces = false;
1126 }
1127 
1129  word = nullptr;
1130  bln_boxes = nullptr;
1131  blob_row = nullptr;
1132  uch_set = nullptr;
1133  chopped_word = nullptr;
1134  rebuild_word = nullptr;
1135  box_word = nullptr;
1136  ratings = nullptr;
1137  best_choice = nullptr;
1138  raw_choice = nullptr;
1139  ep_choice = nullptr;
1140  blamer_bundle = nullptr;
1141 }
1142 
1144  if (combination) {
1145  delete word;
1146  }
1147  word = nullptr;
1148  delete blamer_bundle;
1149  blamer_bundle = nullptr;
1150  ClearResults();
1151 }
1152 
1154  done = false;
1155  fontinfo = nullptr;
1156  fontinfo2 = nullptr;
1157  fontinfo_id_count = 0;
1158  fontinfo_id2_count = 0;
1159  delete bln_boxes;
1160  bln_boxes = nullptr;
1161  blob_row = nullptr;
1162  delete chopped_word;
1163  chopped_word = nullptr;
1164  delete rebuild_word;
1165  rebuild_word = nullptr;
1166  delete box_word;
1167  box_word = nullptr;
1168  best_state.clear();
1169  correct_text.clear();
1171  seam_array.clear();
1172  blob_widths.clear();
1173  blob_gaps.clear();
1174  ClearRatings();
1175  ClearWordChoices();
1176  if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
1177 }
1179  best_choice = nullptr;
1180  delete raw_choice;
1181  raw_choice = nullptr;
1182  best_choices.clear();
1183  delete ep_choice;
1184  ep_choice = nullptr;
1185 }
1187  if (ratings != nullptr) {
1189  delete ratings;
1190  ratings = nullptr;
1191  }
1192 }
1193 
1194 
1195 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
1196  return word_res == other.word_res &&
1197  row_res == other.row_res &&
1198  block_res == other.block_res;
1199 }
1200 
1201 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1202  ASSERT_HOST(page_res == other.page_res);
1203  if (other.block_res == nullptr) {
1204  // other points to the end of the page.
1205  if (block_res == nullptr)
1206  return 0;
1207  return -1;
1208  }
1209  if (block_res == nullptr) {
1210  return 1; // we point to the end of the page.
1211  }
1212  if (block_res == other.block_res) {
1213  if (other.row_res == nullptr || row_res == nullptr) {
1214  // this should only happen if we hit an image block.
1215  return 0;
1216  }
1217  if (row_res == other.row_res) {
1218  // we point to the same block and row.
1219  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1220  if (word_res == other.word_res) {
1221  // we point to the same word!
1222  return 0;
1223  }
1224 
1225  WERD_RES_IT word_res_it(&row_res->word_res_list);
1226  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1227  word_res_it.forward()) {
1228  if (word_res_it.data() == word_res) {
1229  return -1;
1230  } else if (word_res_it.data() == other.word_res) {
1231  return 1;
1232  }
1233  }
1234  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1235  }
1236 
1237  // we both point to the same block, but different rows.
1238  ROW_RES_IT row_res_it(&block_res->row_res_list);
1239  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1240  row_res_it.forward()) {
1241  if (row_res_it.data() == row_res) {
1242  return -1;
1243  } else if (row_res_it.data() == other.row_res) {
1244  return 1;
1245  }
1246  }
1247  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1248  }
1249 
1250  // We point to different blocks.
1251  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1252  for (block_res_it.mark_cycle_pt();
1253  !block_res_it.cycled_list(); block_res_it.forward()) {
1254  if (block_res_it.data() == block_res) {
1255  return -1;
1256  } else if (block_res_it.data() == other.block_res) {
1257  return 1;
1258  }
1259  }
1260  // Shouldn't happen...
1261  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1262  return 0;
1263 }
1264 
1265 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1266 // before the current position. The simple fields of the WERD_RES are copied
1267 // from clone_res and the resulting WERD_RES is returned for further setup
1268 // with best_choice etc.
1270  WERD* new_word) {
1271  // Make a WERD_RES for the new_word.
1272  WERD_RES* new_res = new WERD_RES(new_word);
1273  new_res->CopySimpleFields(clone_res);
1274  new_res->combination = true;
1275  // Insert into the appropriate place in the ROW_RES.
1276  WERD_RES_IT wr_it(&row()->word_res_list);
1277  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1278  WERD_RES* word = wr_it.data();
1279  if (word == word_res)
1280  break;
1281  }
1282  ASSERT_HOST(!wr_it.cycled_list());
1283  wr_it.add_before_then_move(new_res);
1284  if (wr_it.at_first()) {
1285  // This is the new first word, so reset the member iterator so it
1286  // detects the cycled_list state correctly.
1288  }
1289  return new_res;
1290 }
1291 
1292 // Helper computes the boundaries between blobs in the word. The blob bounds
1293 // are likely very poor, if they come from LSTM, where it only outputs the
1294 // character at one pixel within it, so we find the midpoints between them.
1295 static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
1296  GenericVector<int>* blob_ends) {
1297  C_BLOB_IT blob_it(word.word->cblob_list());
1298  for (int i = 0; i < word.best_state.size(); ++i) {
1299  int length = word.best_state[i];
1300  // Get the bounding box of the fake blobs
1301  TBOX blob_box = blob_it.data()->bounding_box();
1302  blob_it.forward();
1303  for (int b = 1; b < length; ++b) {
1304  blob_box += blob_it.data()->bounding_box();
1305  blob_it.forward();
1306  }
1307  // This blob_box is crap, so for now we are only looking for the
1308  // boundaries between them.
1309  int blob_end = INT32_MAX;
1310  if (!blob_it.at_first() || next_word_blobs != nullptr) {
1311  if (blob_it.at_first())
1312  blob_it.set_to_list(next_word_blobs);
1313  blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1314  }
1315  blob_ends->push_back(blob_end);
1316  }
1317 }
1318 
1319 // Replaces the current WERD/WERD_RES with the given words. The given words
1320 // contain fake blobs that indicate the position of the characters. These are
1321 // replaced with real blobs from the current word as much as possible.
1324  if (words->empty()) {
1326  return;
1327  }
1328  WERD_RES* input_word = word();
1329  // Set the BOL/EOL flags on the words from the input word.
1330  if (input_word->word->flag(W_BOL)) {
1331  (*words)[0]->word->set_flag(W_BOL, true);
1332  } else {
1333  (*words)[0]->word->set_blanks(input_word->word->space());
1334  }
1335  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1336 
1337  // Move the blobs from the input word to the new set of words.
1338  // If the input word_res is a combination, then the replacements will also be
1339  // combinations, and will own their own words. If the input word_res is not a
1340  // combination, then the final replacements will not be either, (although it
1341  // is allowed for the input words to be combinations) and their words
1342  // will get put on the row list. This maintains the ownership rules.
1343  WERD_IT w_it(row()->row->word_list());
1344  if (!input_word->combination) {
1345  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1346  WERD* word = w_it.data();
1347  if (word == input_word->word)
1348  break;
1349  }
1350  // w_it is now set to the input_word's word.
1351  ASSERT_HOST(!w_it.cycled_list());
1352  }
1353  // Insert into the appropriate place in the ROW_RES.
1354  WERD_RES_IT wr_it(&row()->word_res_list);
1355  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1356  WERD_RES* word = wr_it.data();
1357  if (word == input_word)
1358  break;
1359  }
1360  ASSERT_HOST(!wr_it.cycled_list());
1361  // Since we only have an estimate of the bounds between blobs, use the blob
1362  // x-middle as the determiner of where to put the blobs
1363  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1364  src_b_it.sort(&C_BLOB::SortByXMiddle);
1365  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1366  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1367  for (int w = 0; w < words->size(); ++w) {
1368  WERD_RES* word_w = (*words)[w];
1369  // Compute blob boundaries.
1370  GenericVector<int> blob_ends;
1371  C_BLOB_LIST* next_word_blobs =
1372  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1373  ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
1374  // Delete the fake blobs on the current word.
1375  word_w->word->cblob_list()->clear();
1376  C_BLOB_IT dest_it(word_w->word->cblob_list());
1377  // Build the box word as we move the blobs.
1378  tesseract::BoxWord* box_word = new tesseract::BoxWord;
1379  for (int i = 0; i < blob_ends.size(); ++i) {
1380  int end_x = blob_ends[i];
1381  TBOX blob_box;
1382  // Add the blobs up to end_x.
1383  while (!src_b_it.empty() &&
1384  src_b_it.data()->bounding_box().x_middle() < end_x) {
1385  blob_box += src_b_it.data()->bounding_box();
1386  dest_it.add_after_then_move(src_b_it.extract());
1387  src_b_it.forward();
1388  }
1389  while (!rej_b_it.empty() &&
1390  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1391  blob_box += rej_b_it.data()->bounding_box();
1392  dest_it.add_after_then_move(rej_b_it.extract());
1393  rej_b_it.forward();
1394  }
1395  // Clip to the previously computed bounds. Although imperfectly accurate,
1396  // it is good enough, and much more complicated to determine where else
1397  // to clip.
1398  if (i > 0 && blob_box.left() < blob_ends[i - 1])
1399  blob_box.set_left(blob_ends[i - 1]);
1400  if (blob_box.right() > end_x)
1401  blob_box.set_right(end_x);
1402  box_word->InsertBox(i, blob_box);
1403  }
1404  // Fix empty boxes. If a very joined blob sits over multiple characters,
1405  // then we will have some empty boxes from using the middle, so look for
1406  // overlaps.
1407  for (int i = 0; i < box_word->length(); ++i) {
1408  TBOX box = box_word->BlobBox(i);
1409  if (box.null_box()) {
1410  // Nothing has its middle in the bounds of this blob, so use anything
1411  // that overlaps.
1412  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
1413  dest_it.forward()) {
1414  TBOX blob_box = dest_it.data()->bounding_box();
1415  if (blob_box.left() < blob_ends[i] &&
1416  (i == 0 || blob_box.right() >= blob_ends[i - 1])) {
1417  if (i > 0 && blob_box.left() < blob_ends[i - 1])
1418  blob_box.set_left(blob_ends[i - 1]);
1419  if (blob_box.right() > blob_ends[i])
1420  blob_box.set_right(blob_ends[i]);
1421  box_word->ChangeBox(i, blob_box);
1422  break;
1423  }
1424  }
1425  }
1426  }
1427  delete word_w->box_word;
1428  word_w->box_word = box_word;
1429  if (!input_word->combination) {
1430  // Insert word_w->word into the ROW. It doesn't own its word, so the
1431  // ROW needs to own it.
1432  w_it.add_before_stay_put(word_w->word);
1433  word_w->combination = false;
1434  }
1435  (*words)[w] = nullptr; // We are taking ownership.
1436  wr_it.add_before_stay_put(word_w);
1437  }
1438  // We have taken ownership of the words.
1439  words->clear();
1440  // Delete the current word, which has been replaced. We could just call
1441  // DeleteCurrentWord, but that would iterate both lists again, and we know
1442  // we are already in the right place.
1443  if (!input_word->combination)
1444  delete w_it.extract();
1445  delete wr_it.extract();
1447 }
1448 
1449 // Deletes the current WERD_RES and its underlying WERD.
1451  // Check that this word is as we expect. part_of_combos are NEVER iterated
1452  // by the normal iterator, so we should never be trying to delete them.
1453  ASSERT_HOST(!word_res->part_of_combo);
1454  if (!word_res->combination) {
1455  // Combinations own their own word, so we won't find the word on the
1456  // row's word_list, but it is legitimate to try to delete them.
1457  // Delete word from the ROW when not a combination.
1458  WERD_IT w_it(row()->row->word_list());
1459  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1460  if (w_it.data() == word_res->word) {
1461  break;
1462  }
1463  }
1464  ASSERT_HOST(!w_it.cycled_list());
1465  delete w_it.extract();
1466  }
1467  // Remove the WERD_RES for the new_word.
1468  // Remove the WORD_RES from the ROW_RES.
1469  WERD_RES_IT wr_it(&row()->word_res_list);
1470  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1471  if (wr_it.data() == word_res) {
1472  word_res = nullptr;
1473  break;
1474  }
1475  }
1476  ASSERT_HOST(!wr_it.cycled_list());
1477  delete wr_it.extract();
1479 }
1480 
1481 // Makes the current word a fuzzy space if not already fuzzy. Updates
1482 // corresponding part of combo if required.
1484  WERD* real_word = word_res->word;
1485  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1486  real_word->set_flag(W_FUZZY_SP, true);
1487  if (word_res->combination) {
1488  // The next word should be the corresponding part of combo, but we have
1489  // already stepped past it, so find it by search.
1490  WERD_RES_IT wr_it(&row()->word_res_list);
1491  for (wr_it.mark_cycle_pt();
1492  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1493  }
1494  wr_it.forward();
1495  ASSERT_HOST(wr_it.data()->part_of_combo);
1496  real_word = wr_it.data()->word;
1497  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1498  !real_word->flag(W_FUZZY_NON));
1499  real_word->set_flag(W_FUZZY_SP, true);
1500  }
1501  }
1502 }
1503 
1504 /*************************************************************************
1505  * PAGE_RES_IT::restart_page
1506  *
1507  * Set things up at the start of the page
1508  *************************************************************************/
1509 
1511  block_res_it.set_to_list(&page_res->block_res_list);
1512  block_res_it.mark_cycle_pt();
1513  prev_block_res = nullptr;
1514  prev_row_res = nullptr;
1515  prev_word_res = nullptr;
1516  block_res = nullptr;
1517  row_res = nullptr;
1518  word_res = nullptr;
1519  next_block_res = nullptr;
1520  next_row_res = nullptr;
1521  next_word_res = nullptr;
1522  internal_forward(true, empty_ok);
1523  return internal_forward(false, empty_ok);
1524 }
1525 
1526 // Recovers from operations on the current word, such as in InsertCloneWord
1527 // and DeleteCurrentWord.
1528 // Resets the word_res_it so that it is one past the next_word_res, as
1529 // it should be after internal_forward. If next_row_res != row_res,
1530 // then the next_word_res is in the next row, so there is no need to do
1531 // anything to word_res_it, but it is still a good idea to reset the pointers
1532 // word_res and prev_word_res, which are still in the current row.
1534  if (row_res == next_row_res) {
1535  // Reset the member iterator so it can move forward and detect the
1536  // cycled_list state correctly.
1537  word_res_it.move_to_first();
1538  for (word_res_it.mark_cycle_pt();
1539  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1540  word_res_it.forward()) {
1541  if (!word_res_it.data()->part_of_combo) {
1542  if (prev_row_res == row_res) prev_word_res = word_res;
1543  word_res = word_res_it.data();
1544  }
1545  }
1546  ASSERT_HOST(!word_res_it.cycled_list());
1547  word_res_it.forward();
1548  } else {
1549  // word_res_it is OK, but reset word_res and prev_word_res if needed.
1550  WERD_RES_IT wr_it(&row_res->word_res_list);
1551  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1552  if (!wr_it.data()->part_of_combo) {
1553  if (prev_row_res == row_res) prev_word_res = word_res;
1554  word_res = wr_it.data();
1555  }
1556  }
1557  }
1558 }
1559 
1560 /*************************************************************************
1561  * PAGE_RES_IT::internal_forward
1562  *
1563  * Find the next word on the page. If empty_ok is true, then non-text blocks
1564  * and text blocks with no text are visited as if they contain a single
1565  * imaginary word in a single imaginary row. (word() and row() both return nullptr
1566  * in such a block and the return value is nullptr.)
1567  * If empty_ok is false, the old behaviour is maintained. Each real word
1568  * is visited and empty and non-text blocks and rows are skipped.
1569  * new_block is used to initialize the iterators for a new block.
1570  * The iterator maintains pointers to block, row and word for the previous,
1571  * current and next words. These are correct, regardless of block/row
1572  * boundaries. nullptr values denote start and end of the page.
1573  *************************************************************************/
1574 
1575 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1576  bool new_row = false;
1577 
// Slide the (prev, current, next) window along by one: current becomes prev,
// next becomes current, and next is cleared until a new one is found below.
1578  prev_block_res = block_res;
1579  prev_row_res = row_res;
1580  prev_word_res = word_res;
1581  block_res = next_block_res;
1582  row_res = next_row_res;
1583  word_res = next_word_res;
1584  next_block_res = nullptr;
1585  next_row_res = nullptr;
1586  next_word_res = nullptr;
1587 
// Continue from wherever the member iterators were left and look for the
// new "next" word.
1588  while (!block_res_it.cycled_list()) {
1589  if (new_block) {
// Entering a block: point the row iterator at this block's rows.
1590  new_block = false;
1591  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1592  row_res_it.mark_cycle_pt();
1593  if (row_res_it.empty() && empty_ok) {
// A block without rows is reported as a stop of its own: only
// next_block_res is set, next_row_res and next_word_res stay nullptr.
1594  next_block_res = block_res_it.data();
1595  break;
1596  }
1597  new_row = true;
1598  }
1599  while (!row_res_it.cycled_list()) {
1600  if (new_row) {
// Entering a row: point the word iterator at this row's words.
1601  new_row = false;
1602  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1603  word_res_it.mark_cycle_pt();
1604  }
1605  // Skip any part_of_combo words.
1606  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1607  word_res_it.forward();
1608  if (!word_res_it.cycled_list()) {
// Found the next word. Record it and step word_res_it past it so that the
// following call resumes from the word after it.
1609  next_block_res = block_res_it.data();
1610  next_row_res = row_res_it.data();
1611  next_word_res = word_res_it.data();
1612  word_res_it.forward();
1613  goto foundword;
1614  }
1615  // end of row reached
1616  row_res_it.forward();
1617  new_row = true;
1618  }
1619  // end of block reached
1620  block_res_it.forward();
1621  new_block = true;
1622  }
// Reached by the goto above, by the empty-block break, or when the block
// list has cycled (end of page, all next_* pointers left as nullptr).
1623  foundword:
1624  // Update prev_word_best_choice pointer.
1625  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
// The value is nullptr when new_block is still set or there is no previous
// word, otherwise the previous word's best_choice.
1627  (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
1628  }
// Returns the new current word (the old next_word_res), not the word that
// was just located as the new next.
1629  return word_res;
1630 }
1631 
1632 /*************************************************************************
1633  * PAGE_RES_IT::restart_row()
1634  *
1635  * Move to the beginning (leftmost word) of the current row.
1636  *************************************************************************/
1638  ROW_RES *row = this->row();
1639  if (!row) return nullptr;
1640  for (restart_page(); this->row() != row; forward()) {
1641  // pass
1642  }
1643  return word();
1644 }
1645 
1646 /*************************************************************************
1647  * PAGE_RES_IT::forward_paragraph
1648  *
1649  * Move to the beginning of the next paragraph, allowing empty blocks.
1650  *************************************************************************/
1651 
1653  while (block_res == next_block_res &&
1654  (next_row_res != nullptr && next_row_res->row != nullptr &&
1655  row_res->row->para() == next_row_res->row->para())) {
1656  internal_forward(false, true);
1657  }
1658  return internal_forward(false, true);
1659 }
1660 
1661 /*************************************************************************
1662  * PAGE_RES_IT::forward_block
1663  *
1664  * Move to the beginning of the next block, allowing empty blocks.
1665  *************************************************************************/
1666 
1668  while (block_res == next_block_res) {
1669  internal_forward(false, true);
1670  }
1671  return internal_forward(false, true);
1672 }
1673 
1675  int16_t chars_in_word;
1676  int16_t rejects_in_word = 0;
1677 
1678  chars_in_word = word_res->reject_map.length ();
1679  page_res->char_count += chars_in_word;
1680  block_res->char_count += chars_in_word;
1681  row_res->char_count += chars_in_word;
1682 
1683  rejects_in_word = word_res->reject_map.reject_count ();
1684 
1685  page_res->rej_count += rejects_in_word;
1686  block_res->rej_count += rejects_in_word;
1687  row_res->rej_count += rejects_in_word;
1688  if (chars_in_word == rejects_in_word)
1689  row_res->whole_word_rej_count += rejects_in_word;
1690 }
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:800
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:505
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1322
BLOCK_RES()=default
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1084
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
WERD_RES_LIST word_res_list
Definition: pageres.h:147
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:771
Definition: werd.h:43
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1036
const double kMaxWordGapRatio
Definition: pageres.cpp:62
float space_certainty
Definition: pageres.h:316
bool tess_failed
Definition: pageres.h:288
TWERD * rebuild_word
Definition: pageres.h:260
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:980
float certainty() const
Definition: ratngs.h:83
int UNICHAR_ID
Definition: unichar.h:35
void remove_unichar_id(int index)
Definition: ratngs.h:484
bool guessed_x_ht
Definition: pageres.h:308
int size() const
Definition: genericvector.h:71
void delete_matrix_pointers()
Definition: matrix.h:455
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1510
#define TRUE
Definition: capi.h:51
Definition: blobs.h:402
float x_height
Definition: pageres.h:122
int32_t rej_count
Definition: pageres.h:80
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:904
const double kMaxLineSizeRatio
Definition: pageres.cpp:60
ROW_RES * row() const
Definition: pageres.h:754
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:406
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:206
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1652
int32_t whole_word_rej_count
Definition: pageres.h:146
bool guessed_caps_ht
Definition: pageres.h:309
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:302
void start_seam_list(TWERD *word, GenericVector< SEAM *> *seam_array)
Definition: seam.cpp:269
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:626
GenericVector< int > blob_widths
Definition: pageres.h:219
bool null_box() const
Definition: rect.h:50
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:55
void move(GenericVector< T > *from)
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:567
int8_t italic
Definition: pageres.h:301
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:746
REJMAP reject_map
Definition: pageres.h:287
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:63
const char * string() const
Definition: strngs.cpp:196
int count(LIST var_list)
Definition: oldlist.cpp:98
void print() const
Definition: ratngs.h:580
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:255
int state(int index) const
Definition: ratngs.h:319
TBOX bounding_box() const
Definition: werd.cpp:159
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:765
Definition: seam.h:44
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:424
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
void remove(int index)
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1047
void fix_hyphens()
Definition: pageres.cpp:1053
bool font_assigned
Definition: pageres.h:123
int32_t char_count
Definition: pageres.h:118
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:118
const BLOCK * block() const
Definition: normalis.h:273
ROW_RES_LIST row_res_list
Definition: pageres.h:128
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
WERD_LIST * word_list()
Definition: ocrrow.h:55
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
void Clear()
Definition: pageres.cpp:1143
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
void reserve(int size)
int latin_sid() const
Definition: unicharset.h:880
bool bold
Definition: pageres.h:125
bool odd_size
Definition: pageres.h:300
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:445
void merge_tess_fails()
Definition: pageres.cpp:1073
ROW_RES()=default
const FontInfo * fontinfo
Definition: pageres.h:304
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:702
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1195
float rating() const
Definition: ratngs.h:327
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1065
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
bool script_has_xheight() const
Definition: unicharset.h:898
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:199
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
float body_size() const
Definition: ocrrow.h:73
void InitNonPointers()
Definition: pageres.cpp:1100
uint8_t space()
Definition: werd.h:102
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
float baseline_shift
Definition: pageres.h:313
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
T & back() const
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:736
bool Valid(const MATRIX &m) const
Definition: matrix.h:615
WERD_RES * forward_block()
Definition: pageres.cpp:1667
void SetScriptPositions()
Definition: pageres.cpp:864
int8_t fontinfo_id2_count
Definition: pageres.h:307
bool small_caps
Definition: pageres.h:299
void ResetWordIterator()
Definition: pageres.cpp:1533
GenericVector< STRING > correct_text
Definition: pageres.h:275
bool IsAmbiguous()
Definition: pageres.cpp:458
bool dangerous_ambig_found() const
Definition: ratngs.h:363
bool PrepareToInsertSeam(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int insert_index, bool modify)
Definition: seam.cpp:82
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54
void set_right(int x)
Definition: rect.h:82
int16_t width() const
Definition: rect.h:115
WERD_RES * restart_page()
Definition: pageres.h:698
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:519
BLOCK * block
Definition: pageres.h:117
void SetupBlamerBundle()
Definition: pageres.cpp:399
void SetupBoxWord()
Definition: pageres.cpp:855
bool tess_would_adapt
Definition: pageres.h:297
int16_t left() const
Definition: rect.h:72
void rej_stat_word()
Definition: pageres.cpp:1674
float ascenders() const
Definition: ocrrow.h:82
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
WERD_RES * restart_row()
Definition: pageres.cpp:1637
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:486
void insert(const T &t, int index)
DENORM denorm
Definition: pageres.h:204
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:192
void ClearResults()
Definition: blamer.h:185
float x_height() const
Definition: ocrrow.h:64
void fix_quotes()
Definition: pageres.cpp:1024
int16_t reject_count()
Definition: rejctmap.h:229
void PrintBestChoices() const
Definition: pageres.cpp:723
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
void init_to_size(int size, const T &t)
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:85
#define FALSE
Definition: capi.h:52
int8_t bold
Definition: pageres.h:302
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:883
virtual R Run(A1, A2)=0
void operator=(const ELIST_LINK &)
Definition: elst.h:101
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:610
void Init()
Definition: pageres.h:94
bool tess_accepted
Definition: pageres.h:296
int length() const
Definition: genericvector.h:85
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
WERD_RES * word() const
Definition: pageres.h:751
GenericVector< int > best_state
Definition: pageres.h:271
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX &> *box_cb)
Definition: pageres.cpp:944
void InitPointers()
Definition: pageres.cpp:1128
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
int dimension() const
Definition: matrix.h:533
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
void BestChoiceToCorrectText()
Definition: pageres.cpp:929
void CloneChoppedToRebuild()
Definition: pageres.cpp:841
Definition: werd.h:59
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
int8_t fontinfo_id_count
Definition: pageres.h:306
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
Definition: ocrrow.h:36
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
float adjust_factor() const
Definition: ratngs.h:306
Definition: werd.h:34
int32_t char_count
Definition: pageres.h:79
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
bool IsText() const
Definition: polyblk.h:49
Definition: ocrblock.h:30
int length() const
Definition: ratngs.h:303
PAGE_RES * page_res
Definition: pageres.h:677
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
float caps_height
Definition: pageres.h:312
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
void set_script_id(int id)
Definition: werd.h:111
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1201
GenericVector< int > blob_gaps
Definition: pageres.h:222
int push_back(T object)
int default_sid() const
Definition: unicharset.h:888
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
void add_str_int(const char *str, int number)
Definition: strngs.cpp:379
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358
bool done
Definition: pageres.h:298
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483
float descenders() const
Definition: ocrrow.h:85
void set_left(int x)
Definition: rect.h:75
bool reject_spaces
Definition: pageres.h:336
PAGE_RES()
Definition: pageres.h:102
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:390
bool combination
Definition: pageres.h:334
float rating() const
Definition: ratngs.h:80
tesseract::Tesseract * tesseract
Definition: pageres.h:282
float x_height
Definition: pageres.h:311
const double kMaxWordSizeRatio
Definition: pageres.cpp:58
void ClearResults()
Definition: pageres.cpp:1153
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:349
Definition: strngs.h:45
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
void DeleteCurrentWord()
Definition: pageres.cpp:1450
int32_t rej_count
Definition: pageres.h:119
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1269
void MergeBlobs(int start, int end)
Definition: blobs.cpp:882
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET * uch_set
Definition: pageres.h:206
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
WERD_CHOICE * ep_choice
Definition: pageres.h:286
void delete_data_pointers()
bool italic
Definition: pageres.h:126
void ClearRatings()
Definition: pageres.cpp:1186
int length() const
Definition: boxword.h:83
BlamerBundle * blamer_bundle
Definition: pageres.h:246
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:283
const STRING & unichar_string() const
Definition: ratngs.h:541
int16_t font_class
Definition: pageres.h:120
void ClearWordChoices()
Definition: pageres.cpp:1178
bool part_of_combo
Definition: pageres.h:335
int16_t right() const
Definition: rect.h:79
void truncate(int size)
ROW * blob_row
Definition: pageres.h:200
void RebuildBestState()
Definition: pageres.cpp:814
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180
WERD_RES * forward()
Definition: pageres.h:731
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:550
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
int32_t char_count
Definition: pageres.h:144
Definition: matrix.h:575
Definition: blobs.h:268
int16_t row_count
Definition: pageres.h:121
WERD_CHOICE * raw_choice
Definition: pageres.h:240
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:786
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:871
void ComputeBoundingBoxes()
Definition: blobs.cpp:865
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:801
TWERD * chopped_word
Definition: pageres.h:215
bool HasAnySplits() const
Definition: seam.h:67
int32_t rej_count
Definition: pageres.h:145
bool StatesAllValid()
Definition: pageres.cpp:464
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1014
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
PDBLK pdblk
Definition: ocrblock.h:192
WERD_CHOICE * best_choice
Definition: pageres.h:235
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:150
int16_t height() const
Definition: rect.h:108
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:626
PermuterType
Definition: ratngs.h:242
tesseract::BoxWord * box_word
Definition: pageres.h:266
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:148
T get(ICOORD pos) const
Definition: matrix.h:228
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:290
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:756
void initialise(int16_t length)
Definition: rejctmap.cpp:275
int TotalOfStates() const
Definition: ratngs.cpp:714
void remove_pos(int16_t pos)
Definition: rejctmap.cpp:311
ROW * row
Definition: pageres.h:143
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo2
Definition: pageres.h:305
PARA * para() const
Definition: ocrrow.h:118
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
WERD * word
Definition: pageres.h:189