All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
pageres.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.h (Formerly page_res.h)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  * Created: Tue Sep 22 08:42:49 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #ifndef PAGERES_H
20 #define PAGERES_H
21 
22 #include "blamer.h"
23 #include "blobs.h"
24 #include "boxword.h"
25 #include "elst.h"
26 #include "genericvector.h"
27 #include "normalis.h"
28 #include "ocrblock.h"
29 #include "ocrrow.h"
31 #include "ratngs.h"
32 #include "rejctmap.h"
33 #include "seam.h"
34 #include "werd.h"
35 
36 namespace tesseract {
37 struct FontInfo;
38 class Tesseract;
39 }
41 
42 /* Forward declarations */
43 
44 class BLOCK_RES;
45 
47 class
48 ROW_RES;
49 
50 ELISTIZEH (ROW_RES)
51 class WERD_RES;
52 
53 ELISTIZEH (WERD_RES)
54 
55 /*************************************************************************
56  * PAGE_RES - Page results
57  *************************************************************************/
58 class PAGE_RES { // page result
59  public:
62  BLOCK_RES_LIST block_res_list;
64  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
65  // the next word. This pointer is not owned by PAGE_RES class.
67  // Sums of blame reasons computed by the blamer.
69  // Debug information about all the misadaptions on this page.
70  // Each BlamerBundle contains an index into this vector, so that words that
71  // caused misadaption could be marked. However, since words could be
72  // deleted/split/merged, the log is stored on the PAGE_RES level.
74 
75  inline void Init() {
76  char_count = 0;
77  rej_count = 0;
78  rejected = FALSE;
79  prev_word_best_choice = NULL;
80  blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
81  }
82 
83  PAGE_RES() { Init(); } // empty constructor
84 
85  PAGE_RES(bool merge_similar_words,
86  BLOCK_LIST *block_list, // real blocks
87  WERD_CHOICE **prev_word_best_choice_ptr);
88 
89  ~PAGE_RES () { // destructor
90  }
91 };
92 
93 /*************************************************************************
94  * BLOCK_RES - Block results
95  *************************************************************************/
96 
97 class BLOCK_RES:public ELIST_LINK {
98  public:
99  BLOCK * block; // real block
100  inT32 char_count; // chars in block
101  inT32 rej_count; // rejected chars
104  float x_height;
105  BOOL8 font_assigned; // block already
106  // processed
107  BOOL8 bold; // all bold
108  BOOL8 italic; // all italic
109 
110  ROW_RES_LIST row_res_list;
111 
113  } // empty constructor
114 
115  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
116 
117  ~BLOCK_RES () { // destructor
118  }
119 };
120 
121 /*************************************************************************
122  * ROW_RES - Row results
123  *************************************************************************/
124 
125 class ROW_RES:public ELIST_LINK {
126  public:
127  ROW * row; // real row
128  inT32 char_count; // chars in block
129  inT32 rej_count; // rejected chars
130  inT32 whole_word_rej_count; // rejs in total rej wds
131  WERD_RES_LIST word_res_list;
132 
134  } // empty constructor
135 
136  ROW_RES(bool merge_similar_words, ROW *the_row); // real row
137 
138  ~ROW_RES() { // destructor
139  }
140 };
141 
142 /*************************************************************************
143  * WERD_RES - Word results
144  *************************************************************************/
146 {
151 };
152 
153 // WERD_RES is a collection of publicly accessible members that gathers
154 // information about a word result.
155 class WERD_RES : public ELIST_LINK {
156  public:
157  // Which word is which?
158  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
159  // the original image coordinate space, and the BLN space in which the
160  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
161  // and the x-middle of the word is at 0.
162  // In the rotated pixel space, coordinates correspond to the input image,
163  // but may be rotated about the origin by a multiple of 90 degrees,
164  // and may therefore be negative.
165  // In any case a rotation by denorm.block()->re_rotation() will take them
166  // back to the original image.
167  // The other differences between words all represent different stages of
168  // processing during recognition.
169 
170  // ---------------------------INPUT-------------------------------------
171 
172  // The word is the input C_BLOBs in the rotated pixel space.
173  // word is NOT owned by the WERD_RES unless combination is true.
174  // All the other word pointers ARE owned by the WERD_RES.
175  WERD* word; // Input C_BLOB word.
176 
177  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
178 
179  // The bln_boxes contains the bounding boxes (only) of the input word, in the
180  // BLN space. The lengths of word and bln_boxes
181  // match as they are both before any chopping.
182  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
183  // if it doesn't.
184  tesseract::BoxWord* bln_boxes; // BLN input bounding boxes.
185  // The ROW that this word sits in. NOT owned by the WERD_RES.
187  // The denorm provides the transformation to get back to the rotated image
188  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
189  // has its own denorm.
190  DENORM denorm; // For use on chopped_word.
191  // Unicharset used by the classifier output in best_choice and raw_choice.
192  const UNICHARSET* uch_set; // For converting back to utf8.
193 
194  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
195  // ----Setup to a (different!) state expected by the various classifiers----
196  // TODO(rays) Tidy and make more consistent.
197 
198  // The chopped_word is also in BLN space, and represents the fully chopped
199  // character fragments that make up the word.
200  // The length of chopped_word matches length of seam_array + 1 (if set).
201  TWERD* chopped_word; // BLN chopped fragments output.
202  // Vector of SEAM* holding chopping points matching chopped_word.
204  // Widths of blobs in chopped_word.
206  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
207  // blob i and blob i+1.
209  // Ratings matrix contains classifier choices for each classified combination
210  // of blobs. The dimension is the same as the number of blobs in chopped_word
211  // and the leading diagonal corresponds to classifier results of the blobs
212  // in chopped_word. The state_ members of best_choice, raw_choice and
213  // best_choices all correspond to this ratings matrix and allow extraction
214  // of the blob choices for any given WERD_CHOICE.
215  MATRIX* ratings; // Owned pointer.
216  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
217  // will be output from Tesseract. Note that this is now a borrowed pointer
218  // and should NOT be deleted.
219  WERD_CHOICE* best_choice; // Borrowed pointer.
220  // The best raw_choice found during segmentation search. Differs from the
221  // best_choice by being the best result according to just the character
222  // classifier, not taking any language model information into account.
223  // Unlike best_choice, the pointer IS owned by this WERD_RES.
224  WERD_CHOICE* raw_choice; // Owned pointer.
225  // Alternative results found during chopping/segmentation search stages.
226  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
227  WERD_CHOICE_LIST best_choices;
228 
229  // Truth bounding boxes, text and incorrect choice reason.
231 
232  // --------------OUTPUT FROM RECOGNITION-------------------------------
233  // --------------Not all fields are necessarily set.-------------------
234  // ---best_choice, raw_choice *must* end up set, with a box_word-------
235  // ---In complete output, the number of blobs in rebuild_word matches---
236  // ---the number of boxes in box_word, the number of unichar_ids in---
237  // ---best_choice, the number of ints in best_state, and the number---
238  // ---of strings in correct_text--------------------------------------
239  // ---SetupFake Sets everything to appropriate values if the word is---
240  // ---known to be bad before recognition.------------------------------
241 
242  // The rebuild_word is also in BLN space, but represents the final best
243  // segmentation of the word. Its length is therefore the same as box_word.
244  TWERD* rebuild_word; // BLN best segmented word.
245  // The box_word is in the original image coordinate space. It is the
246  // bounding boxes of the rebuild_word, after denormalization.
247  // The length of box_word matches rebuild_word, best_state (if set) and
248  // correct_text (if set), as well as best_choice and represents the
249  // number of classified units in the output.
250  tesseract::BoxWord* box_word; // Denormalized output boxes.
251  // The best_state stores the relationship between chopped_word and
252  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
253  // adjacent blobs in chopped_word. The seams in seam_array are hidden
254  // within a rebuild_word blob and revealed between them.
255  GenericVector<int> best_state; // Number of blobs in each best blob.
256  // The correct_text is used during training and adaption to carry the
257  // text to the training system without the need for a unicharset. There
258  // is one entry in the vector for each blob in rebuild_word and box_word.
260  // The Tesseract that was used to recognize this word. Just a borrowed
261  // pointer. Note: Tesseract's class definition is in a higher-level library.
262  // We avoid introducing a cyclic dependency by not using the Tesseract
263  // within WERD_RES. We are just storing it to provide access to it
264  // for the top-level multi-language controller, and maybe for output of
265  // the recognized language.
267 
268  // Less-well documented members.
269  // TODO(rays) Add more documentation here.
270  WERD_CHOICE *ep_choice; // ep text TODO(rays) delete this.
271  REJMAP reject_map; // best_choice rejects
273  /*
274  If tess_failed is TRUE, one of the following tests failed when Tess
275  returned:
276  - The outword blob list was not the same length as the best_choice string;
277  - The best_choice string contained ALL blanks;
278  - The best_choice string was zero length
279  */
280  BOOL8 tess_accepted; // Tess thinks its ok?
281  BOOL8 tess_would_adapt; // Tess would adapt?
282  BOOL8 done; // ready for output?
283  bool small_caps; // word appears to be small caps
284  bool odd_size; // word is bigger than line or leader dots.
287  // The fontinfos are pointers to data owned by the classifier.
290  inT8 fontinfo_id_count; // number of votes
291  inT8 fontinfo_id2_count; // number of votes
295  float x_height; // post match estimate
296  float caps_height; // post match estimate
297  float baseline_shift; // post match estimate.
298 
299  /*
300  To deal with fuzzy spaces we need to be able to combine "words" to form
301  combinations when we suspect that the gap is a non-space. The (new) text
302  ord code generates separate words for EVERY fuzzy gap - flags in the word
303  indicate whether the gap is below the threshold (fuzzy kern) and is thus
304  NOT a real word break by default, or above the threshold (fuzzy space) and
305  this is a real word break by default.
306 
307  The WERD_RES list contains all these words PLUS "combination" words built
308  out of (copies of) the words split by fuzzy kerns. The separate parts have
309  their "part_of_combo" flag set true and should be IGNORED on a default
310  reading of the list.
311 
312  Combination words are FOLLOWED by the sequence of part_of_combo words
313  which they combine.
314  */
315  BOOL8 combination; //of two fuzzy gap wds
316  BOOL8 part_of_combo; //part of a combo
317  BOOL8 reject_spaces; //Reject spacing?
318 
320  InitNonPointers();
321  InitPointers();
322  }
323  WERD_RES(WERD *the_word) {
324  InitNonPointers();
325  InitPointers();
326  word = the_word;
327  }
328  // Deep copies everything except the ratings MATRIX.
329  // To get that use deep_copy below.
330  WERD_RES(const WERD_RES &source) {
331  InitPointers();
332  *this = source; // see operator=
333  }
334 
335  ~WERD_RES();
336 
337  // Returns the UTF-8 string for the given blob index in the best_choice word,
338  // given that we know whether we are in a right-to-left reading context.
339  // This matters for mirrorable characters such as parentheses. We recognize
340  // characters purely based on their shape on the page, and by default produce
341  // the corresponding unicode for a left-to-right context.
342  const char* const BestUTF8(int blob_index, bool in_rtl_context) const {
343  if (blob_index < 0 || best_choice == NULL ||
344  blob_index >= best_choice->length())
345  return NULL;
346  UNICHAR_ID id = best_choice->unichar_id(blob_index);
347  if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
348  return NULL;
349  UNICHAR_ID mirrored = uch_set->get_mirror(id);
350  if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID)
351  id = mirrored;
352  return uch_set->id_to_unichar_ext(id);
353  }
354  // Returns the UTF-8 string for the given blob index in the raw_choice word.
355  const char* const RawUTF8(int blob_index) const {
356  if (blob_index < 0 || blob_index >= raw_choice->length())
357  return NULL;
358  UNICHAR_ID id = raw_choice->unichar_id(blob_index);
359  if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
360  return NULL;
361  return uch_set->id_to_unichar(id);
362  }
363 
364  UNICHARSET::Direction SymbolDirection(int blob_index) const {
365  if (best_choice == NULL ||
366  blob_index >= best_choice->length() ||
367  blob_index < 0)
369  return uch_set->get_direction(best_choice->unichar_id(blob_index));
370  }
371 
372  bool AnyRtlCharsInWord() const {
373  if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
374  return false;
375  for (int id = 0; id < best_choice->length(); id++) {
376  int unichar_id = best_choice->unichar_id(id);
377  if (unichar_id < 0 || unichar_id >= uch_set->size())
378  continue; // Ignore illegal chars.
380  uch_set->get_direction(unichar_id);
381  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
384  return true;
385  }
386  return false;
387  }
388 
389  bool AnyLtrCharsInWord() const {
390  if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
391  return false;
392  for (int id = 0; id < best_choice->length(); id++) {
393  int unichar_id = best_choice->unichar_id(id);
394  if (unichar_id < 0 || unichar_id >= uch_set->size())
395  continue; // Ignore illegal chars.
396  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
397  if (dir == UNICHARSET::U_LEFT_TO_RIGHT)
398  return true;
399  }
400  return false;
401  }
402 
403  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
404  // that gave us the unichars in reading order (as opposed to strict left
405  // to right).
406  bool UnicharsInReadingOrder() const {
407  return best_choice->unichars_in_script_order();
408  }
409 
410  void InitNonPointers();
411  void InitPointers();
412  void Clear();
413  void ClearResults();
414  void ClearWordChoices();
415  void ClearRatings();
416 
417  // Deep copies everything except the ratings MATRIX.
418  // To get that use deep_copy below.
419  WERD_RES& operator=(const WERD_RES& source); //from this
420 
421  void CopySimpleFields(const WERD_RES& source);
422 
423  // Initializes a blank (default constructed) WERD_RES from one that has
424  // already been recognized.
425  // Use SetupFor*Recognition afterwards to complete the setup and make
426  // it ready for a retry recognition.
427  void InitForRetryRecognition(const WERD_RES& source);
428 
429  // Sets up the members used in recognition: bln_boxes, chopped_word,
430  // seam_array, denorm. Returns false if
431  // the word is empty and sets up fake results. If use_body_size is
432  // true and row->body_size is set, then body_size will be used for
433  // blob normalization instead of xheight + ascrise. This flag is for
434  // those languages that are using CJK pitch model and thus it has to
435  // be true if and only if tesseract->textord_use_cjk_fp_model is
436  // true.
437  // If allow_detailed_fx is true, the feature extractor will receive fine
438  // precision outline information, allowing smoother features and better
439  // features on low resolution images.
440  // The norm_mode sets the default mode for normalization in absence
441  // of any of the above flags. It should really be a tesseract::OcrEngineMode
442  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
443  // Returns false if the word is empty and sets up fake results.
444  bool SetupForRecognition(const UNICHARSET& unicharset_in,
445  tesseract::Tesseract* tesseract, Pix* pix,
446  int norm_mode,
447  const TBOX* norm_box, bool numeric_mode,
448  bool use_body_size, bool allow_detailed_fx,
449  ROW *row, const BLOCK* block);
450 
451  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
452  // accumulators from a made chopped word. We presume the fields are already
453  // empty.
454  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
455 
456  // Sets up the members used in recognition for an empty recognition result:
457  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
458  void SetupFake(const UNICHARSET& uch);
459 
460  // Set the word as having the script of the input unicharset.
461  void SetupWordScript(const UNICHARSET& unicharset_in);
462 
463  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
464  void SetupBlamerBundle();
465 
466  // Computes the blob_widths and blob_gaps from the chopped_word.
467  void SetupBlobWidthsAndGaps();
468 
469  // Updates internal data to account for a new SEAM (chop) at the given
470  // blob_number. Fixes the ratings matrix and states in the choices, as well
471  // as the blob widths and gaps.
472  void InsertSeam(int blob_number, SEAM* seam);
473 
474  // Returns true if all the word choices except the first have adjust_factors
475  // worse than the given threshold.
476  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
477 
478  // Returns true if the current word is ambiguous (by number of answers or
479  // by dangerous ambigs.)
480  bool IsAmbiguous();
481 
482  // Returns true if the ratings matrix size matches the sum of each of the
483  // segmentation states.
484  bool StatesAllValid();
485 
486  // Prints a list of words found if debug is true or the word result matches
487  // the word_to_debug.
488  void DebugWordChoices(bool debug, const char* word_to_debug);
489 
490  // Prints the top choice along with the accepted/done flags.
491  void DebugTopChoice(const char* msg) const;
492 
493  // Removes from best_choices all choices which are not within a reasonable
494  // range of the best choice.
495  void FilterWordChoices(int debug_level);
496 
497  // Computes a set of distance thresholds used to control adaption.
498  // Compares the best choice for the current word to the best raw choice
499  // to determine which characters were classified incorrectly by the
500  // classifier. Then places a separate threshold into thresholds for each
501  // character in the word. If the classifier was correct, max_rating is placed
502  // into thresholds. If the classifier was incorrect, the mean match rating
503  // (error percentage) of the classifier's incorrect choice minus some margin
504  // is placed into thresholds. This can then be used by the caller to try to
505  // create a new template for the desired class that will classify the
506  // character with a rating better than the threshold value. The match rating
507  // placed into thresholds is never allowed to be below min_rating in order to
508  // prevent trying to make overly tight templates.
509  // min_rating limits how tight to make a template.
510  // max_rating limits how loose to make a template.
511  // rating_margin denotes the amount of margin to put in template.
512  void ComputeAdaptionThresholds(float certainty_scale,
513  float min_rating,
514  float max_rating,
515  float rating_margin,
516  float* thresholds);
517 
518  // Saves a copy of the word_choice if it has the best unadjusted rating.
519  // Returns true if the word_choice was the new best.
520  bool LogNewRawChoice(WERD_CHOICE* word_choice);
521  // Consumes word_choice by adding it to best_choices, (taking ownership) if
522  // the certainty for word_choice is some distance of the best choice in
523  // best_choices, or by deleting the word_choice and returning false.
524  // The best_choices list is kept in sorted order by rating. Duplicates are
525  // removed, and the list is kept no longer than max_num_choices in length.
526  // Returns true if the word_choice is still a valid pointer.
527  bool LogNewCookedChoice(int max_num_choices, bool debug,
528  WERD_CHOICE* word_choice);
529 
530  // Prints a brief list of all the best choices.
531  void PrintBestChoices() const;
532 
533  // Returns the sum of the widths of the blob between start_blob and last_blob
534  // inclusive.
535  int GetBlobsWidth(int start_blob, int last_blob);
536  // Returns the width of a gap between the specified blob and the next one.
537  int GetBlobsGap(int blob_index);
538 
539  // Returns the BLOB_CHOICE corresponding to the given index in the
540  // best choice word taken from the appropriate cell in the ratings MATRIX.
541  // Borrowed pointer, so do not delete. May return NULL if there is no
542  // BLOB_CHOICE matching the unichar_id at the given index.
543  BLOB_CHOICE* GetBlobChoice(int index) const;
544 
545  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
546  // best choice word taken from the appropriate cell in the ratings MATRIX.
547  // Borrowed pointer, so do not delete.
548  BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
549 
550  // Moves the results fields from word to this. This takes ownership of all
551  // the data, so src can be destructed.
552  // word1.ConsumeWordResults(word);
553  // delete word;
554  // is simpler and faster than:
555  // word1 = *word;
556  // delete word;
557  // as it doesn't need to copy and reallocate anything.
558  void ConsumeWordResults(WERD_RES* word);
559 
560  // Replace the best choice and rebuild box word.
561  // choice must be from the current best_choices list.
562  void ReplaceBestChoice(WERD_CHOICE* choice);
563 
564  // Builds the rebuild_word and sets the best_state from the chopped_word and
565  // the best_choice->state.
566  void RebuildBestState();
567 
568  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
569  // Also sets up the output box_word.
570  void CloneChoppedToRebuild();
571 
572  // Sets/replaces the box_word with one made from the rebuild_word.
573  void SetupBoxWord();
574 
575  // Sets up the script positions in the best_choice using the best_choice
576  // to get the unichars, and the unicharset to get the target positions.
577  void SetScriptPositions();
578  // Sets all the blobs in all the words (best choice and alternates) to be
579  // the given position. (When a sub/superscript is recognized as a separate
580  // word, it falls victim to the rule that a whole word cannot be sub or
581  // superscript, so this function overrides that problem.)
583 
584  // Classifies the word with some already-calculated BLOB_CHOICEs.
585  // The choices are an array of blob_count pointers to BLOB_CHOICE,
586  // providing a single classifier result for each blob.
587  // The BLOB_CHOICEs are consumed and the word takes ownership.
588  // The number of blobs in the box_word must match blob_count.
589  void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
590 
591  // Creates a WERD_CHOICE for the word using the top choices from the leading
592  // diagonal of the ratings matrix.
593  void FakeWordFromRatings();
594 
595  // Copies the best_choice strings to the correct_text for adaption/training.
597 
598  // Merges 2 adjacent blobs in the result if the permanent callback
599  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
600  // callback box_cb is NULL or returns true, setting the merged blob
601  // result to the class returned from class_cb.
602  // Returns true if anything was merged.
606 
607  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
608  // all the data to account for the change.
609  void MergeAdjacentBlobs(int index);
610 
611  // Callback helper for fix_quotes returns a double quote if both
612  // arguments are quote, otherwise INVALID_UNICHAR_ID.
614  void fix_quotes();
615 
616  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
617  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
619  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
620  // (assuming both on the same textline, are in order and a chopped em dash.)
621  bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
622  void fix_hyphens();
623 
624  // Callback helper for merge_tess_fails returns a space if both
625  // arguments are space, otherwise INVALID_UNICHAR_ID.
627  void merge_tess_fails();
628 
629  // Returns a really deep copy of *src, including the ratings MATRIX.
630  static WERD_RES* deep_copy(const WERD_RES* src) {
631  WERD_RES* result = new WERD_RES(*src);
632  // That didn't copy the ratings, but we want a copy if there is one to
633  // begin width.
634  if (src->ratings != NULL)
635  result->ratings = src->ratings->DeepCopy();
636  return result;
637  }
638 
639  // Copy blobs from word_res onto this word (eliminating spaces between).
640  // Since this may be called bidirectionally OR both the BOL and EOL flags.
641  void copy_on(WERD_RES *word_res) { //from this word
642  word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
643  word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
644  word->copy_on(word_res->word);
645  }
646 
647  // Returns true if the collection of count pieces, starting at start, are all
648  // natural connected components, ie there are no real chops involved.
649  bool PiecesAllNatural(int start, int count) const;
650 };
651 
652 /*************************************************************************
653  * PAGE_RES_IT - Page results iterator
654  *************************************************************************/
655 
656 class PAGE_RES_IT {
657  public:
658  PAGE_RES * page_res; // page being iterated
659 
661  } // empty contructor
662 
663  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
664  page_res = the_page_res;
665  restart_page(); // ready to scan
666  }
667 
668  // Do two PAGE_RES_ITs point at the same word?
669  // This is much cheaper than cmp().
670  bool operator ==(const PAGE_RES_IT &other) const;
671 
672  bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }
673 
674  // Given another PAGE_RES_IT to the same page,
675  // this before other: -1
676  // this equal to other: 0
677  // this later than other: 1
678  int cmp(const PAGE_RES_IT &other) const;
679 
681  return start_page(false); // Skip empty blocks.
682  }
684  return start_page(true); // Allow empty blocks.
685  }
686  WERD_RES *start_page(bool empty_ok);
687 
689 
690  // ============ Methods that mutate the underlying structures ===========
691  // Note that these methods will potentially invalidate other PAGE_RES_ITs
692  // and are intended to be used only while a single PAGE_RES_IT is active.
693  // This problem needs to be taken into account if these mutation operators
694  // are ever provided to PageIterator or its subclasses.
695 
696  // Inserts the new_word and a corresponding WERD_RES before the current
697  // position. The simple fields of the WERD_RES are copied from clone_res and
698  // the resulting WERD_RES is returned for further setup with best_choice etc.
699  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
700 
701  // Replaces the current WERD/WERD_RES with the given words. The given words
702  // contain fake blobs that indicate the position of the characters. These are
703  // replaced with real blobs from the current word as much as possible.
705 
706  // Deletes the current WERD_RES and its underlying WERD.
707  void DeleteCurrentWord();
708 
709  // Makes the current word a fuzzy space if not already fuzzy. Updates
710  // corresponding part of combo if required.
711  void MakeCurrentWordFuzzy();
712 
713  WERD_RES *forward() { // Get next word.
714  return internal_forward(false, false);
715  }
716  // Move forward, but allow empty blocks to show as single NULL words.
718  return internal_forward(false, true);
719  }
720 
721  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
722  WERD_RES *forward_block(); // get first word in next non-empty block
723 
724  WERD_RES *prev_word() const { // previous word
725  return prev_word_res;
726  }
727  ROW_RES *prev_row() const { // row of prev word
728  return prev_row_res;
729  }
730  BLOCK_RES *prev_block() const { // block of prev word
731  return prev_block_res;
732  }
733  WERD_RES *word() const { // current word
734  return word_res;
735  }
736  ROW_RES *row() const { // row of current word
737  return row_res;
738  }
739  BLOCK_RES *block() const { // block of cur. word
740  return block_res;
741  }
742  WERD_RES *next_word() const { // next word
743  return next_word_res;
744  }
745  ROW_RES *next_row() const { // row of next word
746  return next_row_res;
747  }
748  BLOCK_RES *next_block() const { // block of next word
749  return next_block_res;
750  }
751  void rej_stat_word(); // for page/block/row
752  void ResetWordIterator();
753 
754  private:
755  WERD_RES *internal_forward(bool new_block, bool empty_ok);
756 
757  WERD_RES * prev_word_res; // previous word
758  ROW_RES *prev_row_res; // row of prev word
759  BLOCK_RES *prev_block_res; // block of prev word
760 
761  WERD_RES *word_res; // current word
762  ROW_RES *row_res; // row of current word
763  BLOCK_RES *block_res; // block of cur. word
764 
765  WERD_RES *next_word_res; // next word
766  ROW_RES *next_row_res; // row of next word
767  BLOCK_RES *next_block_res; // block of next word
768 
769  BLOCK_RES_IT block_res_it; // iterators
770  ROW_RES_IT row_res_it;
771  WERD_RES_IT word_res_it;
772 };
773 #endif
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:343
bool StatesAllValid()
Definition: pageres.cpp:449
BOOL8 tess_accepted
Definition: pageres.h:280
void SetScriptPositions()
Definition: pageres.cpp:853
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:471
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
void rej_stat_word()
Definition: pageres.cpp:1673
WERD_RES_LIST word_res_list
Definition: pageres.h:131
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
tesseract::BoxWord * box_word
Definition: pageres.h:250
void ClearResults()
Definition: pageres.cpp:1140
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
void RebuildBestState()
Definition: pageres.cpp:800
#define CLISTIZEH(CLASSNAME)
Definition: clst.h:946
const char *const RawUTF8(int blob_index) const
Definition: pageres.h:355
bool unichars_in_script_order() const
Definition: ratngs.h:518
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
void ClearWordChoices()
Definition: pageres.cpp:1173
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1509
MATRIX * ratings
Definition: pageres.h:215
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:553
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
ROW_RES * next_row() const
Definition: pageres.h:745
void ResetWordIterator()
Definition: pageres.cpp:1532
WERD_RES(WERD *the_word)
Definition: pageres.h:323
REJMAP reject_map
Definition: pageres.h:271
TWERD * chopped_word
Definition: pageres.h:201
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1321
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
inT32 char_count
Definition: pageres.h:60
inT8 bold
Definition: pageres.h:286
void SetupBlamerBundle()
Definition: pageres.cpp:384
inT32 whole_word_rej_count
Definition: pageres.h:130
~ROW_RES()
Definition: pageres.h:138
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:178
BOOL8 reject_spaces
Definition: pageres.h:317
bool UnicharsInReadingOrder() const
Definition: pageres.h:406
float caps_height
Definition: pageres.h:296
PAGE_RES * page_res
Definition: pageres.h:658
PAGE_RES_IT(PAGE_RES *the_page_res)
Definition: pageres.h:663
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:334
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:364
BLOCK_RES * prev_block() const
Definition: pageres.h:730
unsigned char BOOL8
Definition: host.h:113
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:375
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
~PAGE_RES()
Definition: pageres.h:89
BOOL8 font_assigned
Definition: pageres.h:105
GenericVector< STRING > correct_text
Definition: pageres.h:259
const FontInfo * fontinfo
Definition: pageres.h:288
bool operator!=(const PAGE_RES_IT &other) const
Definition: pageres.h:672
void fix_quotes()
Definition: pageres.cpp:1012
bool AnyRtlCharsInWord() const
Definition: pageres.h:372
GenericVector< int > blame_reasons
Definition: pageres.h:68
bool small_caps
Definition: pageres.h:283
void DeleteCurrentWord()
Definition: pageres.cpp:1449
void Init()
Definition: pageres.h:75
float x_height
Definition: pageres.h:295
BLOCK * block
Definition: pageres.h:99
WERD_RES * restart_page_with_empties()
Definition: pageres.h:683
BOOL8 tess_would_adapt
Definition: pageres.h:281
BOOL8 rejected
Definition: pageres.h:63
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:630
Definition: ocrrow.h:32
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1024
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:645
ELISTIZEH(BLOCK_RES) CLISTIZEH(BLOCK_RES) class ROW_RES
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1035
Definition: werd.h:35
float x_height
Definition: pageres.h:104
void InitPointers()
Definition: pageres.cpp:1115
BLOCK_RES * block() const
Definition: pageres.h:739
BOOL8 part_of_combo
Definition: pageres.h:316
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:490
BOOL8 combination
Definition: pageres.h:315
WERD_RES * forward()
Definition: pageres.h:713
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
inT8 fontinfo_id_count
Definition: pageres.h:290
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1482
WERD_RES * restart_page()
Definition: pageres.h:680
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
Definition: werd.h:36
BOOL8 italic
Definition: pageres.h:108
CRUNCH_MODE
Definition: pageres.h:145
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
ROW_RES()
Definition: pageres.h:133
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
inT32 char_count
Definition: pageres.h:100
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1053
void InitNonPointers()
Definition: pageres.cpp:1088
void init_to_size(int size, T t)
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:504
ROW_RES * row() const
Definition: pageres.h:736
DENORM denorm
Definition: pageres.h:190
GenericVector< int > blob_gaps
Definition: pageres.h:208
bool odd_size
Definition: pageres.h:284
MATRIX * DeepCopy() const
Definition: matrix.cpp:94
WERD_CHOICE * raw_choice
Definition: pageres.h:224
WERD_RES * forward_block()
Definition: pageres.cpp:1666
WERD_RES * restart_row()
Definition: pageres.cpp:1636
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:391
inT32 rej_count
Definition: pageres.h:61
tesseract::Tesseract * tesseract
Definition: pageres.h:266
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1072
void PrintBestChoices() const
Definition: pageres.cpp:709
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1200
int UNICHAR_ID
Definition: unichar.h:33
float baseline_shift
Definition: pageres.h:297
BOOL8 guessed_x_ht
Definition: pageres.h:292
BLOCK_RES * next_block() const
Definition: pageres.h:748
const FontInfo * fontinfo2
Definition: pageres.h:289
WERD_RES * forward_with_empties()
Definition: pageres.h:717
inT32 rej_count
Definition: pageres.h:129
Definition: werd.h:60
inT32 char_count
Definition: pageres.h:128
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
BOOL8 done
Definition: pageres.h:282
WERD_RES()
Definition: pageres.h:319
inT8 fontinfo_id2_count
Definition: pageres.h:291
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:722
WERD * word
Definition: pageres.h:175
inT32 rej_count
Definition: pageres.h:101
ROW_RES_LIST row_res_list
Definition: pageres.h:110
BLOCK_RES()
Definition: pageres.h:112
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1268
inT16 font_class
Definition: pageres.h:102
ROW * blob_row
Definition: pageres.h:186
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:638
#define FALSE
Definition: capi.h:29
WERD_CHOICE * ep_choice
Definition: pageres.h:270
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:430
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
Definition: pageres.cpp:932
BOOL8 tess_failed
Definition: pageres.h:272
int count(LIST var_list)
Definition: oldlist.cpp:108
void fix_hyphens()
Definition: pageres.cpp:1041
ROW * row
Definition: pageres.h:127
Definition: rect.h:30
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
WERD_RES * next_word() const
Definition: pageres.h:742
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:860
void SetupBoxWord()
Definition: pageres.cpp:843
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:968
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
PAGE_RES()
Definition: pageres.h:83
Definition: matrix.h:289
bool AnyLtrCharsInWord() const
Definition: pageres.h:389
GenericVector< int > best_state
Definition: pageres.h:255
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
inT16 row_count
Definition: pageres.h:103
#define NULL
Definition: host.h:144
bool IsAmbiguous()
Definition: pageres.cpp:443
SIGNED char inT8
Definition: host.h:98
GenericVector< int > blob_widths
Definition: pageres.h:205
const char *const id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:274
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1002
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1194
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
PAGE_RES_IT()
Definition: pageres.h:660
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:66
Definition: blobs.h:395
WERD_RES * prev_word() const
Definition: pageres.h:724
Definition: seam.h:44
void copy_on(WERD_RES *word_res)
Definition: pageres.h:641
inT8 italic
Definition: pageres.h:285
int size() const
Definition: unicharset.h:297
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1651
BOOL8 bold
Definition: pageres.h:107
ROW_RES * prev_row() const
Definition: pageres.h:727
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void copy_on(WERD *other)
Definition: werd.cpp:234
~BLOCK_RES()
Definition: pageres.h:117
void merge_tess_fails()
Definition: pageres.cpp:1061
void FakeWordFromRatings()
Definition: pageres.cpp:892
void BestChoiceToCorrectText()
Definition: pageres.cpp:917
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:751
const char *const BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:342
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void Clear()
Definition: pageres.cpp:1130
BOOL8 guessed_caps_ht
Definition: pageres.h:293
WERD_RES * word() const
Definition: pageres.h:733
WERD_RES(const WERD_RES &source)
Definition: pageres.h:330
short inT16
Definition: host.h:100
void ClearRatings()
Definition: pageres.cpp:1185
int inT32
Definition: host.h:102
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:241