tesseract  5.0.0-alpha-619-ge9db
pageres.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.h (Formerly page_res.h)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef PAGERES_H
20 #define PAGERES_H
21 
22 #include <cstdint> // for int32_t, int16_t
23 #include <functional> // for std::function
24 #include <set> // for std::pair
25 #include <vector> // for std::vector
26 #include <sys/types.h> // for int8_t
27 #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
28 #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
29 #include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
30 #include <tesseract/genericvector.h> // for GenericVector, PointerVector (ptr only)
31 #include "matrix.h" // for MATRIX
32 #include "normalis.h" // for DENORM
33 #include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only)
34 #include "rect.h" // for TBOX
35 #include "rejctmap.h" // for REJMAP
36 #include <tesseract/strngs.h> // for STRING
37 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
38 #include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI...
39 #include "werd.h" // for WERD, W_BOL, W_EOL
40 
41 class BLOCK;
42 class BLOCK_LIST;
43 class BLOCK_RES;
44 class ROW;
45 class ROW_RES;
46 class SEAM;
47 class WERD_RES;
48 
49 struct Pix;
50 struct TWERD;
51 
52 namespace tesseract {
53  class BoxWord;
54  class Tesseract;
55  struct FontInfo;
56 }
58 
59 /* Forward declarations */
60 
61 class BLOCK_RES;
62 
64 class
65 ROW_RES;
66 
68 class WERD_RES;
69 
71 
72 /*************************************************************************
73  * PAGE_RES - Page results
74  *************************************************************************/
75 class PAGE_RES { // Page-level results: counters, the list of block results, and blamer bookkeeping.
76  public:
77  int32_t char_count; // zeroed by Init(); presumably chars on the page - TODO confirm
78  int32_t rej_count; // zeroed by Init(); presumably rejected chars on the page - TODO confirm
79  BLOCK_RES_LIST block_res_list; // not touched by Init()
80  bool rejected; // cleared by Init()
81  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
82  // the next word. This pointer is not owned by PAGE_RES class.
83  WERD_CHOICE **prev_word_best_choice;
84  // Sums of blame reasons computed by the blamer.
85  GenericVector<int> blame_reasons;
86  // Debug information about all the misadaptions on this page.
87  // Each BlamerBundle contains an index into this vector, so that words that
88  // caused misadaption could be marked. However, since words could be
89  // deleted/split/merged, the log is stored on the PAGE_RES level.
90  GenericVector<STRING> misadaption_log;
91 
92  inline void Init() { // Resets the scalar members and sizes blame_reasons; block_res_list and misadaption_log are left alone.
93  char_count = 0;
94  rej_count = 0;
95  rejected = false;
96  prev_word_best_choice = nullptr;
97  blame_reasons.init_to_size(IRR_NUM_REASONS, 0); // presumably one zeroed counter per blame reason - confirm against GenericVector
98  }
99 
100  PAGE_RES() { Init(); } // empty constructor: only runs Init()
101 
102  PAGE_RES(bool merge_similar_words,
103  BLOCK_LIST *block_list, // real blocks
104  WERD_CHOICE **prev_word_best_choice_ptr); // declared only; no inline body here
105 
106  ~PAGE_RES () = default;
107 };
108 
109 /*************************************************************************
110  * BLOCK_RES - Block results
111  *************************************************************************/
112 
113 class BLOCK_RES:public ELIST_LINK { // Results for one block; linked into a list via ELIST_LINK.
114  public:
115  BLOCK * block = nullptr; // real block
116  int32_t char_count = 0; // chars in block
117  int32_t rej_count = 0; // rejected chars
118  int16_t font_class = 0; // NOTE(review): undocumented in the original - confirm meaning
119  int16_t row_count = 0;
120  float x_height = 0.0f;
121  bool font_assigned = false; // block already
122  // processed
123 
124  ROW_RES_LIST row_res_list;
125 
126  BLOCK_RES() = default; // members now start from the in-class defaults above instead of indeterminate values
127 
128  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
129 
130  ~BLOCK_RES () = default;
131 };
132 
133 /*************************************************************************
134  * ROW_RES - Row results
135  *************************************************************************/
136 
137 class ROW_RES:public ELIST_LINK { // Results for one row; linked into a list via ELIST_LINK.
138  public:
139  ROW * row = nullptr; // real row
140  int32_t char_count = 0; // chars counted for this row (NOTE(review): comment previously said "block" - confirm)
141  int32_t rej_count = 0; // rejected chars
142  int32_t whole_word_rej_count = 0; // rejs in total rej wds
143  WERD_RES_LIST word_res_list;
144 
145  ROW_RES() = default; // members now start from the in-class defaults above instead of indeterminate values
146 
147  ROW_RES(bool merge_similar_words, ROW *the_row); // real row
148 
149  ~ROW_RES() = default;
150 };
151 
152 /*************************************************************************
153  * WERD_RES - Word results
154  *************************************************************************/
156 {
157  CR_NONE,
161 };
162 
163 // WERD_RES is a collection of publicly accessible members that gathers
164 // information about a word result.
165 class WERD_RES : public ELIST_LINK {
166  public:
167  // Which word is which?
168  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
169  // the original image coordinate space, and the BLN space in which the
170  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
171  // and the x-middle of the word is at 0.
172  // In the rotated pixel space, coordinates correspond to the input image,
173  // but may be rotated about the origin by a multiple of 90 degrees,
174  // and may therefore be negative.
175  // In any case a rotation by denorm.block()->re_rotation() will take them
176  // back to the original image.
177  // The other differences between words all represent different stages of
178  // processing during recognition.
179 
180  // ---------------------------INPUT-------------------------------------
181 
182  // The word is the input C_BLOBs in the rotated pixel space.
183  // word is NOT owned by the WERD_RES unless combination is true.
184  // All the other word pointers ARE owned by the WERD_RES.
185  WERD* word = nullptr; // Input C_BLOB word.
186 
187  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
188 
189  // The bln_boxes contains the bounding boxes (only) of the input word, in the
190  // BLN space. The lengths of word and bln_boxes
191  // match as they are both before any chopping.
192  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
193  // if it doesn't.
194  tesseract::BoxWord* bln_boxes = nullptr; // BLN input bounding boxes.
195  // The ROW that this word sits in. NOT owned by the WERD_RES.
196  ROW* blob_row = nullptr;
197  // The denorm provides the transformation to get back to the rotated image
198  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
199  // has its own denorm.
200  DENORM denorm; // For use on chopped_word.
201  // Unicharset used by the classifier output in best_choice and raw_choice.
202  const UNICHARSET* uch_set = nullptr; // For converting back to utf8.
203 
204  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
205  // ----Setup to a (different!) state expected by the various classifiers----
206  // TODO(rays) Tidy and make more consistent.
207 
208  // The chopped_word is also in BLN space, and represents the fully chopped
209  // character fragments that make up the word.
210  // The length of chopped_word matches length of seam_array + 1 (if set).
211  TWERD* chopped_word = nullptr; // BLN chopped fragments output.
212  // Vector of SEAM* holding chopping points matching chopped_word.
214  // Widths of blobs in chopped_word.
216  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
217  // blob i and blob i+1.
219  // Stores the lstm choices of every timestep
220  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
221  // Stores the lstm choices of every timestep segmented by character
222  std::vector<std::vector<std::vector<
223  std::pair<const char*, float>>>> segmented_timesteps;
224  // Symbol choices acquired during CTC
225  std::vector<std::vector<std::pair<const char*, float>>> CTC_symbol_choices;
226  // Stores if the timestep vector starts with a space
227  bool leading_space = false;
228  // Stores value when the word ends
229  int end = 0;
230  // Ratings matrix contains classifier choices for each classified combination
231  // of blobs. The dimension is the same as the number of blobs in chopped_word
232  // and the leading diagonal corresponds to classifier results of the blobs
233  // in chopped_word. The state_ members of best_choice, raw_choice and
234  // best_choices all correspond to this ratings matrix and allow extraction
235  // of the blob choices for any given WERD_CHOICE.
236  MATRIX* ratings = nullptr; // Owned pointer.
237  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
238  // will be output from Tesseract. Note that this is now a borrowed pointer
239  // and should NOT be deleted.
240  WERD_CHOICE* best_choice = nullptr; // Borrowed pointer.
241  // The best raw_choice found during segmentation search. Differs from the
242  // best_choice by being the best result according to just the character
243  // classifier, not taking any language model information into account.
244  // Unlike best_choice, the pointer IS owned by this WERD_RES.
245  WERD_CHOICE* raw_choice = nullptr; // Owned pointer.
246  // Alternative results found during chopping/segmentation search stages.
247  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
248  WERD_CHOICE_LIST best_choices;
249 
250  // Truth bounding boxes, text and incorrect choice reason.
251  BlamerBundle* blamer_bundle = nullptr;
252 
253  // --------------OUTPUT FROM RECOGNITION-------------------------------
254  // --------------Not all fields are necessarily set.-------------------
255  // ---best_choice, raw_choice *must* end up set, with a box_word-------
256  // ---In complete output, the number of blobs in rebuild_word matches---
257  // ---the number of boxes in box_word, the number of unichar_ids in---
258  // ---best_choice, the number of ints in best_state, and the number---
259  // ---of strings in correct_text--------------------------------------
260  // ---SetupFake Sets everything to appropriate values if the word is---
261  // ---known to be bad before recognition.------------------------------
262 
263  // The rebuild_word is also in BLN space, but represents the final best
264  // segmentation of the word. Its length is therefore the same as box_word.
265  TWERD* rebuild_word = nullptr; // BLN best segmented word.
266  // The box_word is in the original image coordinate space. It is the
267  // bounding boxes of the rebuild_word, after denormalization.
268  // The length of box_word matches rebuild_word, best_state (if set) and
269  // correct_text (if set), as well as best_choice and represents the
270  // number of classified units in the output.
271  tesseract::BoxWord* box_word = nullptr; // Denormalized output boxes.
272  // The Tesseract that was used to recognize this word. Just a borrowed
273  // pointer. Note: Tesseract's class definition is in a higher-level library.
274  // We avoid introducing a cyclic dependency by not using the Tesseract
275  // within WERD_RES. We are just storing it to provide access to it
276  // for the top-level multi-language controller, and maybe for output of
277  // the recognized language.
278  // tesseract points to data owned elsewhere.
280  // The best_state stores the relationship between chopped_word and
281  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
282  // adjacent blobs in chopped_word. The seams in seam_array are hidden
283  // within a rebuild_word blob and revealed between them.
284  GenericVector<int> best_state; // Number of blobs in each best blob.
285  // The correct_text is used during training and adaption to carry the
286  // text to the training system without the need for a unicharset. There
287  // is one entry in the vector for each blob in rebuild_word and box_word.
289 
290  // Less-well documented members.
291  // TODO(rays) Add more documentation here.
292  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
293  REJMAP reject_map; // best_choice rejects
294  bool tess_failed = false;
295  /*
296  If tess_failed is true, one of the following tests failed when Tess
297  returned:
298  - The outword blob list was not the same length as the best_choice string;
299  - The best_choice string contained ALL blanks;
300  - The best_choice string was zero length
301  */
302  bool tess_accepted = false; // Tess thinks its ok?
303  bool tess_would_adapt = false; // Tess would adapt?
304  bool done = false; // ready for output?
305  bool small_caps = false; // word appears to be small caps
306  bool odd_size = false; // word is bigger than line or leader dots.
307  // The fontinfos are pointers to data owned by the classifier.
308  const FontInfo* fontinfo = nullptr;
309  const FontInfo* fontinfo2 = nullptr;
310  int8_t fontinfo_id_count = 0; // number of votes
311  int8_t fontinfo_id2_count = 0; // number of votes
312  bool guessed_x_ht = true;
313  bool guessed_caps_ht = true;
315  float x_height = 0.0f; // post match estimate
316  float caps_height = 0.0f; // post match estimate
317  float baseline_shift = 0.0f; // post match estimate.
318  // Certainty score for the spaces either side of this word (LSTM mode).
319  // MIN this value with the actual word certainty.
320  float space_certainty = 0.0f;
321 
322  /*
323  To deal with fuzzy spaces we need to be able to combine "words" to form
324  combinations when we suspect that the gap is a non-space. The (new) text
325  ord code generates separate words for EVERY fuzzy gap - flags in the word
326  indicate whether the gap is below the threshold (fuzzy kern) and is thus
327  NOT a real word break by default, or above the threshold (fuzzy space) and
328  this is a real word break by default.
329 
330  The WERD_RES list contains all these words PLUS "combination" words built
331  out of (copies of) the words split by fuzzy kerns. The separate parts have
332  their "part_of_combo" flag set true and should be IGNORED on a default
333  reading of the list.
334 
335  Combination words are FOLLOWED by the sequence of part_of_combo words
336  which they combine.
337  */
338  bool combination = false; //of two fuzzy gap wds
339  bool part_of_combo = false; //part of a combo
340  bool reject_spaces = false; //Reject spacing?
341 
342  WERD_RES() = default;
343 
344  WERD_RES(WERD *the_word) // Wraps an input word; per the member docs it is not owned unless combination is true.
345  : word(the_word) {
346  }
347  // Copy constructor: deep copies everything except the ratings MATRIX
348  // (use deep_copy below to get that too). The copying itself is delegated to operator=.
349  WERD_RES(const WERD_RES& source) : ELIST_LINK(source) {
350  // operator= calls Clear, which reads combination, so it must hold a defined value first (its in-class initializer is also false).
351  combination = false;
352  *this = source; // see operator=
353  }
354 
355  ~WERD_RES();
356 
357  // Returns the UTF-8 string for the given blob index in the best_choice word,
358  // given that we know whether we are in a right-to-left reading context.
359  // This matters for mirrorable characters such as parentheses. We recognize
360  // characters purely based on their shape on the page, and by default produce
361  // the corresponding unicode for a left-to-right context. Returns nullptr if best_choice or uch_set is unset, or if blob_index or the unichar id is out of range.
362  const char* BestUTF8(int blob_index, bool in_rtl_context) const {
363  if (best_choice == nullptr || uch_set == nullptr || // nothing to look up in
364  blob_index < 0 || blob_index >= best_choice->length())
365  return nullptr;
366  UNICHAR_ID id = best_choice->unichar_id(blob_index);
367  if (id < 0 || id >= uch_set->size())
368  return nullptr;
369  UNICHAR_ID mirrored = uch_set->get_mirror(id);
370  if (in_rtl_context && mirrored > 0) // only swap when a mirror id greater than 0 is reported
371  id = mirrored;
372  return uch_set->id_to_unichar_ext(id);
373  }
374  // Returns the UTF-8 string for the given blob index in the raw_choice word, or nullptr if raw_choice or uch_set is unset or the index/unichar id is out of range.
375  const char* RawUTF8(int blob_index) const {
376  if (raw_choice == nullptr || uch_set == nullptr || // guard the pointers as BestUTF8 does for best_choice
377  blob_index < 0 || blob_index >= raw_choice->length())
378  return nullptr;
379  const UNICHAR_ID id = raw_choice->unichar_id(blob_index);
380  if (id < 0 || id >= uch_set->size()) return nullptr;
381  return uch_set->id_to_unichar(id);
382  }
383 
384  UNICHARSET::Direction SymbolDirection(int blob_index) const { // Direction of the unichar at blob_index in best_choice, as reported by uch_set.
385  if (best_choice == nullptr ||
386  blob_index >= best_choice->length() ||
387  blob_index < 0) // NOTE(review): listing line 388 (the statement this guard controls) is absent from this rendering; restore it from the real header before relying on this text.
389  return uch_set->get_direction(best_choice->unichar_id(blob_index));
390  }
391 
392  bool AnyRtlCharsInWord() const { // Scans best_choice for a unichar whose direction passes the RTL test below; false when uch_set/best_choice is unset or empty.
393  if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
394  return false;
395  for (int id = 0; id < best_choice->length(); id++) {
396  int unichar_id = best_choice->unichar_id(id);
397  if (unichar_id < 0 || unichar_id >= uch_set->size())
398  continue; // Ignore illegal chars. NOTE(review): listing line 399 (start of the dir declaration) is absent from this rendering.
400  uch_set->get_direction(unichar_id);
401  if (dir == UNICHARSET::U_RIGHT_TO_LEFT || // NOTE(review): listing line 402 (second operand of this test) is absent from this rendering.
403  return true;
404  }
405  return false;
406  }
407 
408  bool AnyLtrCharsInWord() const { // Scans best_choice for a unichar whose direction passes the LTR test below; false when uch_set/best_choice is unset or empty.
409  if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
410  return false;
411  for (int id = 0; id < best_choice->length(); id++) {
412  int unichar_id = best_choice->unichar_id(id);
413  if (unichar_id < 0 || unichar_id >= uch_set->size())
414  continue; // Ignore illegal chars.
415  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
416  if (dir == UNICHARSET::U_LEFT_TO_RIGHT || // NOTE(review): listing line 417 (second operand of this test) is absent from this rendering.
418  return true;
419  }
420  return false;
421  }
422 
423  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
424  // that gave us the unichars in reading order (as opposed to strict left
425  // to right).
426  bool UnicharsInReadingOrder() const { // NOTE(review): the body (listing line 427) is absent from this rendering; restore it from the real header.
428  }
429 
430  void Clear();
431  void ClearResults();
432  void ClearWordChoices();
433  void ClearRatings();
434 
435  // Deep copies everything except the ratings MATRIX.
436  // To get that use deep_copy below.
437  WERD_RES& operator=(const WERD_RES& source); //from this
438 
439  void CopySimpleFields(const WERD_RES& source);
440 
441  // Initializes a blank (default constructed) WERD_RES from one that has
442  // already been recognized.
443  // Use SetupFor*Recognition afterwards to complete the setup and make
444  // it ready for a retry recognition.
445  void InitForRetryRecognition(const WERD_RES& source);
446 
447  // Sets up the members used in recognition: bln_boxes, chopped_word,
448  // seam_array, denorm. Returns false if
449  // the word is empty and sets up fake results. If use_body_size is
450  // true and row->body_size is set, then body_size will be used for
451  // blob normalization instead of xheight + ascrise. This flag is for
452  // those languages that are using CJK pitch model and thus it has to
453  // be true if and only if tesseract->textord_use_cjk_fp_model is
454  // true.
455  // If allow_detailed_fx is true, the feature extractor will receive fine
456  // precision outline information, allowing smoother features and better
457  // features on low resolution images.
458  // The norm_mode sets the default mode for normalization in absence
459  // of any of the above flags. It should really be a tesseract::OcrEngineMode
460  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
461  // Returns false if the word is empty and sets up fake results.
462  bool SetupForRecognition(const UNICHARSET& unicharset_in,
463  tesseract::Tesseract* tesseract, Pix* pix,
464  int norm_mode,
465  const TBOX* norm_box, bool numeric_mode,
466  bool use_body_size, bool allow_detailed_fx,
467  ROW *row, const BLOCK* block);
468 
469  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
470  // accumulators from a made chopped word. We presume the fields are already
471  // empty.
472  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
473 
474  // Sets up the members used in recognition for an empty recognition result:
475  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
476  void SetupFake(const UNICHARSET& uch);
477 
478  // Set the word as having the script of the input unicharset.
479  void SetupWordScript(const UNICHARSET& unicharset_in);
480 
481  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
482  void SetupBlamerBundle();
483 
484  // Computes the blob_widths and blob_gaps from the chopped_word.
485  void SetupBlobWidthsAndGaps();
486 
487  // Updates internal data to account for a new SEAM (chop) at the given
488  // blob_number. Fixes the ratings matrix and states in the choices, as well
489  // as the blob widths and gaps.
490  void InsertSeam(int blob_number, SEAM* seam);
491 
492  // Returns true if all the word choices except the first have adjust_factors
493  // worse than the given threshold.
494  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
495 
496  // Returns true if the current word is ambiguous (by number of answers or
497  // by dangerous ambigs.)
498  bool IsAmbiguous();
499 
500  // Returns true if the ratings matrix size matches the sum of each of the
501  // segmentation states.
502  bool StatesAllValid();
503 
504  // Prints a list of words found if debug is true or the word result matches
505  // the word_to_debug.
506  void DebugWordChoices(bool debug, const char* word_to_debug);
507 
508  // Prints the top choice along with the accepted/done flags.
509  void DebugTopChoice(const char* msg) const;
510 
511  // Removes from best_choices all choices which are not within a reasonable
512  // range of the best choice.
513  void FilterWordChoices(int debug_level);
514 
515  // Computes a set of distance thresholds used to control adaption.
516  // Compares the best choice for the current word to the best raw choice
517  // to determine which characters were classified incorrectly by the
518  // classifier. Then places a separate threshold into thresholds for each
519  // character in the word. If the classifier was correct, max_rating is placed
520  // into thresholds. If the classifier was incorrect, the mean match rating
521  // (error percentage) of the classifier's incorrect choice minus some margin
522  // is placed into thresholds. This can then be used by the caller to try to
523  // create a new template for the desired class that will classify the
524  // character with a rating better than the threshold value. The match rating
525  // placed into thresholds is never allowed to be below min_rating in order to
526  // prevent trying to make overly tight templates.
527  // min_rating limits how tight to make a template.
528  // max_rating limits how loose to make a template.
529  // rating_margin denotes the amount of margin to put in template.
530  void ComputeAdaptionThresholds(float certainty_scale,
531  float min_rating,
532  float max_rating,
533  float rating_margin,
534  float* thresholds);
535 
536  // Saves a copy of the word_choice if it has the best unadjusted rating.
537  // Returns true if the word_choice was the new best.
538  bool LogNewRawChoice(WERD_CHOICE* word_choice);
539  // Consumes word_choice by adding it to best_choices, (taking ownership) if
540  // the certainty for word_choice is some distance of the best choice in
541  // best_choices, or by deleting the word_choice and returning false.
542  // The best_choices list is kept in sorted order by rating. Duplicates are
543  // removed, and the list is kept no longer than max_num_choices in length.
544  // Returns true if the word_choice is still a valid pointer.
545  bool LogNewCookedChoice(int max_num_choices, bool debug,
546  WERD_CHOICE* word_choice);
547 
548  // Prints a brief list of all the best choices.
549  void PrintBestChoices() const;
550 
551  // Returns the sum of the widths of the blob between start_blob and last_blob
552  // inclusive.
553  int GetBlobsWidth(int start_blob, int last_blob);
554  // Returns the width of a gap between the specified blob and the next one.
555  int GetBlobsGap(int blob_index);
556 
557  // Returns the BLOB_CHOICE corresponding to the given index in the
558  // best choice word taken from the appropriate cell in the ratings MATRIX.
559  // Borrowed pointer, so do not delete. May return nullptr if there is no
560  // BLOB_CHOICE matching the unichar_id at the given index.
561  BLOB_CHOICE* GetBlobChoice(int index) const;
562 
563  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
564  // best choice word taken from the appropriate cell in the ratings MATRIX.
565  // Borrowed pointer, so do not delete.
566  BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
567 
568  // Moves the results fields from word to this. This takes ownership of all
569  // the data, so src can be destructed.
570  // word1.ConsumeWordResult(word);
571  // delete word;
572  // is simpler and faster than:
573  // word1 = *word;
574  // delete word;
575  // as it doesn't need to copy and reallocate anything.
577 
578  // Replace the best choice and rebuild box word.
579  // choice must be from the current best_choices list.
580  void ReplaceBestChoice(WERD_CHOICE* choice);
581 
582  // Builds the rebuild_word and sets the best_state from the chopped_word and
583  // the best_choice->state.
584  void RebuildBestState();
585 
586  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
587  // Also sets up the output box_word.
588  void CloneChoppedToRebuild();
589 
590  // Sets/replaces the box_word with one made from the rebuild_word.
591  void SetupBoxWord();
592 
593  // Sets up the script positions in the best_choice using the best_choice
594  // to get the unichars, and the unicharset to get the target positions.
595  void SetScriptPositions();
596  // Sets all the blobs in all the words (best choice and alternates) to be
597  // the given position. (When a sub/superscript is recognized as a separate
598  // word, it falls victim to the rule that a whole word cannot be sub or
599  // superscript, so this function overrides that problem.)
601 
602  // Classifies the word with some already-calculated BLOB_CHOICEs.
603  // The choices are an array of blob_count pointers to BLOB_CHOICE,
604  // providing a single classifier result for each blob.
605  // The BLOB_CHOICEs are consumed and the word takes ownership.
606  // The number of blobs in the box_word must match blob_count.
607  void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
608 
609  // Creates a WERD_CHOICE for the word using the top choices from the leading
610  // diagonal of the ratings matrix.
611  void FakeWordFromRatings(PermuterType permuter);
612 
613  // Copies the best_choice strings to the correct_text for adaption/training.
615 
616  // Merges 2 adjacent blobs in the result if the permanent callback
617  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
618  // callback box_cb is nullptr or returns true, setting the merged blob
619  // result to the class returned from class_cb.
620  // Returns true if anything was merged.
622  std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> class_cb,
623  std::function<bool(const TBOX&, const TBOX&)> box_cb);
624 
625  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
626  // all the data to account for the change.
627  void MergeAdjacentBlobs(int index);
628 
629  // Callback helper for fix_quotes returns a double quote if both
630  // arguments are quote, otherwise INVALID_UNICHAR_ID.
632  void fix_quotes();
633 
634  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
635  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
637  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
638  // (assuming both on the same textline, are in order and a chopped em dash.)
639  bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
640  void fix_hyphens();
641 
642  // Callback helper for merge_tess_fails returns a space if both
643  // arguments are space, otherwise INVALID_UNICHAR_ID.
645  void merge_tess_fails();
646 
647  // Clones *src in full, ratings MATRIX included. The result is allocated with new.
648  static WERD_RES* deep_copy(const WERD_RES* src) {
649  WERD_RES* const clone = new WERD_RES(*src);
650  // The copy constructor leaves the ratings out, so duplicate the matrix
651  // separately whenever the source actually has one.
652  MATRIX* const src_ratings = src->ratings;
653  if (src_ratings != nullptr) clone->ratings = src_ratings->DeepCopy();
654  return clone;
655  }
656 
657  // Appends the blobs of word_res to this word (eliminating spaces between). BOL and EOL are ORed because this may be called bidirectionally.
658  void copy_on(WERD_RES *word_res) {
659  WERD* const donor = word_res->word;
660  word->set_flag(W_BOL, word->flag(W_BOL) || donor->flag(W_BOL));
661  word->set_flag(W_EOL, word->flag(W_EOL) || donor->flag(W_EOL));
662  word->copy_on(donor);
663  }
664 
665  // Returns true if the collection of count pieces, starting at start, are all
666  // natural connected components, ie there are no real chops involved.
667  bool PiecesAllNatural(int start, int count) const;
668 };
669 
670 /*************************************************************************
671  * PAGE_RES_IT - Page results iterator
672  *************************************************************************/
673 
674 class PAGE_RES_IT {
675  public:
676  PAGE_RES * page_res = nullptr; // page being iterated; null until a page is supplied
677 
678  PAGE_RES_IT() = default; // no page attached: page_res starts as nullptr instead of an indeterminate value
679 
680  PAGE_RES_IT(PAGE_RES *the_page_res) // page whose results will be walked
681  : page_res(the_page_res) {
682  restart_page(); // leaves the iterator ready to scan
683  }
684 
685  // True when both iterators sit on the same word, row and block.
686  // Only the pointers are compared, which is much cheaper than cmp().
687  bool operator ==(const PAGE_RES_IT &other) const {
688  if (word_res != other.word_res || row_res != other.row_res) return false;
689  return block_res == other.block_res;
690  }
691 
692  bool operator !=(const PAGE_RES_IT &other) const { return !operator==(other); } // negation of operator==
693 
694  // Given another PAGE_RES_IT to the same page,
695  // this before other: -1
696  // this equal to other: 0
697  // this later than other: 1
698  int cmp(const PAGE_RES_IT &other) const;
699 
701  return start_page(false); // Skip empty blocks.
702  }
704  return start_page(true); // Allow empty blocks.
705  }
706  WERD_RES *start_page(bool empty_ok);
707 
709 
710  // ============ Methods that mutate the underlying structures ===========
711  // Note that these methods will potentially invalidate other PAGE_RES_ITs
712  // and are intended to be used only while a single PAGE_RES_IT is active.
713  // This problem needs to be taken into account if these mutation operators
714  // are ever provided to PageIterator or its subclasses.
715 
716  // Inserts the new_word and a corresponding WERD_RES before the current
717  // position. The simple fields of the WERD_RES are copied from clone_res and
718  // the resulting WERD_RES is returned for further setup with best_choice etc.
719  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
720 
721  // Replaces the current WERD/WERD_RES with the given words. The given words
722  // contain fake blobs that indicate the position of the characters. These are
723  // replaced with real blobs from the current word as much as possible.
725 
726  // Deletes the current WERD_RES and its underlying WERD.
727  void DeleteCurrentWord();
728 
729  // Makes the current word a fuzzy space if not already fuzzy. Updates
730  // corresponding part of combo if required.
731  void MakeCurrentWordFuzzy();
732 
733  WERD_RES *forward() { // Get next word.
734  return internal_forward(false, false);
735  }
736  // Move forward, but allow empty blocks to show as single nullptr words.
738  return internal_forward(false, true);
739  }
740 
741  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
742  WERD_RES *forward_block(); // get first word in next non-empty block
743 
744  WERD_RES *prev_word() const { // previous word
745  return prev_word_res;
746  }
747  ROW_RES *prev_row() const { // row of prev word
748  return prev_row_res;
749  }
750  BLOCK_RES *prev_block() const { // block of prev word
751  return prev_block_res;
752  }
753  WERD_RES *word() const { // current word
754  return word_res;
755  }
756  ROW_RES *row() const { // row of current word
757  return row_res;
758  }
759  BLOCK_RES *block() const { // block of cur. word
760  return block_res;
761  }
762  WERD_RES *next_word() const { // next word
763  return next_word_res;
764  }
765  ROW_RES *next_row() const { // row of next word
766  return next_row_res;
767  }
768  BLOCK_RES *next_block() const { // block of next word
769  return next_block_res;
770  }
771  void rej_stat_word(); // for page/block/row
772  void ResetWordIterator();
773 
774  private:
775  WERD_RES *internal_forward(bool new_block, bool empty_ok);
776 
777  WERD_RES * prev_word_res = nullptr; // previous word (null-initialized so a default-constructed iterator holds defined values)
778  ROW_RES *prev_row_res = nullptr; // row of prev word
779  BLOCK_RES *prev_block_res = nullptr; // block of prev word
780 
781  WERD_RES *word_res = nullptr; // current word
782  ROW_RES *row_res = nullptr; // row of current word
783  BLOCK_RES *block_res = nullptr; // block of cur. word
784 
785  WERD_RES *next_word_res = nullptr; // next word
786  ROW_RES *next_row_res = nullptr; // row of next word
787  BLOCK_RES *next_block_res = nullptr; // block of next word
788 
789  BLOCK_RES_IT block_res_it; // iterators
790  ROW_RES_IT row_res_it;
791  WERD_RES_IT word_res_it;
792  // Iterators used to get the state of word_res_it for the current word.
793  // Since word_res_it is 2 words further on, this is otherwise hard to do.
794  WERD_RES_IT wr_it_of_current_word;
795  WERD_RES_IT wr_it_of_next_word;
796 };
797 #endif
WERD_RES::done
bool done
Definition: pageres.h:299
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:845
WERD_RES::FakeWordFromRatings
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:894
elst.h
WERD_RES::ComputeAdaptionThresholds
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:557
strngs.h
normalis.h
WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1042
BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:115
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:114
PAGE_RES_IT::next_block
BLOCK_RES * next_block() const
Definition: pageres.h:763
CR_DELETE
Definition: pageres.h:156
PAGE_RES_IT::forward_with_empties
WERD_RES * forward_with_empties()
Definition: pageres.h:732
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:138
WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:920
WERD_RES::PiecesAllNatural
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1074
BLOCK_RES::font_class
int16_t font_class
Definition: pageres.h:116
PAGE_RES_IT::PAGE_RES_IT
PAGE_RES_IT()=default
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298
WERD::copy_on
void copy_on(WERD *other)
Definition: werd.cpp:220
WERD_RES::AlternativeChoiceAdjustmentsWorseThan
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:435
WERD_RES::DebugTopChoice
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:495
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
TWERD
Definition: blobs.h:416
rejctmap.h
PAGE_RES_IT::operator==
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.h:682
WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:600
WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:189
PAGE_RES_IT::forward_paragraph
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1637
WERD_RES::odd_size
bool odd_size
Definition: pageres.h:301
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:195
WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:755
WERD_RES::BestUTF8
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:357
WERD_RES::BothSpaces
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1054
PermuterType
PermuterType
Definition: ratngs.h:230
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
WERD_RES::ConditionalBlobMerge
bool ConditionalBlobMerge(std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> class_cb, std::function< bool(const TBOX &, const TBOX &)> box_cb)
Definition: pageres.cpp:935
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
tesseract::Tesseract
Definition: tesseractclass.h:172
WERD_RES::RawUTF8
const char * RawUTF8(int blob_index) const
Definition: pageres.h:370
CR_NONE
Definition: pageres.h:153
WERD_RES::BothQuotes
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1003
MATRIX
Definition: matrix.h:574
PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:695
tesseract::PointerVector< WERD_RES >
PAGE_RES_IT::next_row
ROW_RES * next_row() const
Definition: pageres.h:760
WERD_RES::combination
bool combination
Definition: pageres.h:333
WERD_RES::x_height
float x_height
Definition: pageres.h:310
WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:303
WERD_RES
Definition: pageres.h:160
UNICHARSET::U_OTHER_NEUTRAL
Definition: unicharset.h:167
WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:761
WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1062
UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:157
PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:742
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
PAGE_RES_IT::prev_word
WERD_RES * prev_word() const
Definition: pageres.h:739
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:861
WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:746
rect.h
WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:339
BLOCK_RES
Definition: pageres.h:110
WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:831
PAGE_RES_IT::operator!=
bool operator!=(const PAGE_RES_IT &other) const
Definition: pageres.h:687
WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:305
SEAM
Definition: seam.h:36
WERD_RES::space_certainty
float space_certainty
Definition: pageres.h:315
ratngs.h
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1209
werd.h
WERD_RES::ep_choice
WERD_CHOICE * ep_choice
Definition: pageres.h:287
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:273
WERD_RES::GetBlobsWidth
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:726
PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1469
ROW_RES::char_count
int32_t char_count
Definition: pageres.h:137
ROW_RES::~ROW_RES
~ROW_RES()=default
genericvector.h
WERD_RES::segmented_timesteps
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:218
WERD_RES::AnyLtrCharsInWord
bool AnyLtrCharsInWord() const
Definition: pageres.h:403
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
BLOCK
Definition: ocrblock.h:28
WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:643
PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1623
WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:713
PAGE_RES_IT::forward_block
WERD_RES * forward_block()
Definition: pageres.cpp:1651
WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:476
BLOCK_RES::row_count
int16_t row_count
Definition: pageres.h:117
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:298
WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:616
unicharset.h
WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:414
ROW_RES::row
ROW * row
Definition: pageres.h:136
WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:304
WERD_RES::Clear
void Clear()
Definition: pageres.cpp:1090
CLISTIZEH
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:50
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
WERD_RES::SymbolDirection
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:379
WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1100
WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:312
W_EOL
end of line
Definition: werd.h:47
BLOCK_RES::font_assigned
bool font_assigned
Definition: pageres.h:119
ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:139
WERD_RES::CTC_symbol_choices
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:220
matrix.h
PAGE_RES_IT::prev_block
BLOCK_RES * prev_block() const
Definition: pageres.h:745
PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1436
UNICHARSET::U_ARABIC_NUMBER
Definition: unicharset.h:162
UNICHARSET
Definition: unicharset.h:145
WERD_RES::AnyRtlCharsInWord
bool AnyRtlCharsInWord() const
Definition: pageres.h:387
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
WERD_RES::FilterWordChoices
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:509
WERD_CHOICE::unichars_in_script_order
bool unichars_in_script_order() const
Definition: ratngs.h:523
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
WERD_RES::timesteps
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:215
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract
Definition: baseapi.h:65
ELISTIZEH
ELISTIZEH(BLOCK_RES) CLISTIZEH(BLOCK_RES) class ROW_RES
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
WERD_RES::CopySimpleFields
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:247
PAGE_RES
Definition: pageres.h:73
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:854
WERD_RES::SetupBlobWidthsAndGaps
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:396
WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:208
tesseract::FontInfo
Definition: fontinfo.h:62
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
MATRIX::DeepCopy
MATRIX * DeepCopy() const
Definition: matrix.cpp:92
WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:448
GenericVector< int >
BLOCK_RES::row_res_list
ROW_RES_LIST row_res_list
Definition: pageres.h:122
PAGE_RES_IT
Definition: pageres.h:668
WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:654
WERD_RES::caps_height
float caps_height
Definition: pageres.h:311
WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:454
WERD_RES::leading_space
bool leading_space
Definition: pageres.h:222
IRR_NUM_REASONS
Definition: blamer.h:99
WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:298
WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1013
WERD_RES::GetBlobsGap
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:736
WERD_RES::SetupWordScript
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:380
WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:804
WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:348
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
count
int count(LIST var_list)
Definition: oldlist.cpp:79
WERD_RES::operator=
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:184
BLOB_CHOICE
Definition: ratngs.h:49
ROW_RES
Definition: pageres.h:133
WERD_RES::blob_row
ROW * blob_row
Definition: pageres.h:191
WERD_RES::SetupBlamerBundle
void SetupBlamerBundle()
Definition: pageres.cpp:389
WERD
Definition: werd.h:55
PAGE_RES_IT::cmp
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1141
PAGE_RES_IT::next_word
WERD_RES * next_word() const
Definition: pageres.h:757
BLOCK_RES::block
BLOCK * block
Definition: pageres.h:113
unichar.h
ROW
Definition: ocrrow.h:35
UNICHARSET::Direction
Direction
Definition: unicharset.h:156
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:873
ROW_RES::word_res_list
WERD_RES_LIST word_res_list
Definition: pageres.h:140
WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:306
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1329
WERD_RES::correct_text
GenericVector< STRING > correct_text
Definition: pageres.h:283
WERD_RES::WERD_RES
WERD_RES()=default
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
ROW_RES::ROW_RES
ROW_RES()=default
WERD_RES::blob_gaps
GenericVector< int > blob_gaps
Definition: pageres.h:213
BLOCK_RES::~BLOCK_RES
~BLOCK_RES()=default
CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:150
WERD_RES::ClearWordChoices
void ClearWordChoices()
Definition: pageres.cpp:1125
ELIST_LINK
Definition: elst.h:74
WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:335
REJMAP
Definition: rejctmap.h:200
CR_LOOSE_SPACE
Definition: pageres.h:155
WERD_RES::word
WERD * word
Definition: pageres.h:180
WERD_RES::guessed_caps_ht
bool guessed_caps_ht
Definition: pageres.h:308
WERD_RES::BothHyphens
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1025
WERD_RES::MergeAdjacentBlobs
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:969
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:791
WERD_RES::WERD_RES
WERD_RES(const WERD_RES &source)
Definition: pageres.h:344
BLOCK_RES::BLOCK_RES
BLOCK_RES()=default
WERD_RES::end
int end
Definition: pageres.h:224
WERD_RES::HyphenBoxesOverlap
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1036
CR_KEEP_SPACE
Definition: pageres.h:154
WERD_RES::UnicharsInReadingOrder
bool UnicharsInReadingOrder() const
Definition: pageres.h:421
BlamerBundle
Definition: blamer.h:103
blamer.h
WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:307
PAGE_RES_IT::start_page
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1495
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
PAGE_RES_IT::restart_page_with_empties
WERD_RES * restart_page_with_empties()
Definition: pageres.h:698
BLOCK_RES::x_height
float x_height
Definition: pageres.h:118
PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1518
UNICHARSET::size
int size() const
Definition: unicharset.h:341
WERD_RES::small_caps
bool small_caps
Definition: pageres.h:300
W_BOL
start of line
Definition: werd.h:46
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
WERD_RES::~WERD_RES
~WERD_RES()
Definition: pageres.cpp:1086
clst.h
WERD_RES::ClearRatings
void ClearRatings()
Definition: pageres.cpp:1133
TBOX
Definition: rect.h:33
DENORM
Definition: normalis.h:49
tesseract::BoxWord
Definition: boxword.h:36