tesseract  5.0.0-alpha-619-ge9db
lm_pain_points.h
Go to the documentation of this file.
1 // File: lm_pain_points.h
3 // Description: Functions that utilize the knowledge about the properties
4 // of the paths explored by the segmentation search in order
5 // to generate "pain points" - the locations in the ratings
6 // matrix which should be classified next.
7 // Author: Rika Antonova
8 //
9 // (C) Copyright 2012, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
23 #define TESSERACT_WORDREC_PAIN_POINTS_H_
24 
25 #include "genericheap.h" // for GenericHeap
26 #include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
27 #include "stopper.h" // for DANGERR
28 
29 class WERD_RES;
30 
31 namespace tesseract {
32 
33 class Dict;
34 struct ViterbiStateEntry;
35 
36 // Heap of pain points used for determining where to chop/join.
38 
// Types of pain points (ordered in the decreasing level of importance).
// The values are used directly as array indices (see LMPainPointsTypeName
// and LMPainPoints::pain_points_heaps_), so they must stay contiguous and
// start at 0, with LM_PPTYPE_NUM last.
enum LMPainPointsType {
  LM_PPTYPE_BLAMER,
  LM_PPTYPE_AMBIG,
  LM_PPTYPE_PATH,
  LM_PPTYPE_SHAPE,

  // Number of pain point types; not a pain point type itself.
  LM_PPTYPE_NUM
};
48 
// Printable names of the pain point types, indexed by LMPainPointsType.
// Each entry must spell the corresponding enumerator exactly and appear in
// the same order as the enumerators; there is deliberately no entry for
// LM_PPTYPE_NUM.
static const char * const LMPainPointsTypeName[] = {
  "LM_PPTYPE_BLAMER",
  "LM_PPTYPE_AMBIG",
  "LM_PPTYPE_PATH",
  "LM_PPTYPE_SHAPE",
};
55 
56 class LMPainPoints {
57  public:
58 
60  // If there is a significant drop in character ngram probability or a
61  // dangerous ambiguity make the thresholds on what blob combinations
62  // can be classified looser.
63  static const float kLooseMaxCharWhRatio;
64  // Returns a description of the type of a pain point.
66  return LMPainPointsTypeName[type];
67  }
68 
69  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) :
70  max_heap_size_(max), max_char_wh_ratio_(rat), fixed_pitch_(fp),
71  dict_(d), debug_level_(deb) {}
73 
74  // Returns true if the heap of pain points of pp_type is not empty().
75  inline bool HasPainPoints(LMPainPointsType pp_type) const {
76  return !pain_points_heaps_[pp_type].empty();
77  }
78 
79  // Dequeues the next pain point from the pain points queue and copies
80  // its contents and priority to *pp and *priority.
81  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
82  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
83 
84  // Clears pain points heap.
85  void Clear() {
86  for (auto & pain_points_heap : pain_points_heaps_) pain_points_heap.clear();
87  }
88 
89  // For each cell, generate a "pain point" if the cell is not classified
90  // and has a left or right neighbor that was classified.
91  void GenerateInitial(WERD_RES *word_res);
92 
93  // Generate pain points from the given path.
94  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,
95  WERD_RES *word_res);
96 
97  // Generate pain points from dangerous ambiguities in best choice.
98  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,
99  WERD_RES *word_res);
100 
101  // Adds a pain point to classify chunks_record->ratings(col, row).
102  // Returns true if a new pain point was added to an appropriate heap.
103  // Pain point priority is set to special_priority for pain points of
104  // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points
105  // AssociateStats::gap_sum is used.
106  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
107  float special_priority, bool ok_to_extend,
108  float max_char_wh_ratio,
109  WERD_RES *word_res);
110 
111  // Adjusts the pain point coordinates to cope with expansion of the ratings
112  // matrix due to a split of the blob with the given index.
113  void RemapForSplit(int index);
114 
115  private:
116  // Priority queues containing pain points generated by the language model
117  // The priority is set by the language model components, adjustments like
118  // seam cost and width priority are factored into the priority.
119  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
120  // Maximum number of points to keep in the heap.
121  int max_heap_size_;
122  // Maximum character width/height ratio.
123  float max_char_wh_ratio_;
124  // Set to true if fixed pitch should be assumed.
125  bool fixed_pitch_;
126  // Cached pointer to dictionary.
127  const Dict *dict_;
128  // Debug level for print statements.
129  int debug_level_;
130 };
131 
132 } // namespace tesseract
133 
134 #endif // TESSERACT_WORDREC_PAIN_POINTS_H_
tesseract::GenericHeap< MatrixCoordPair >
tesseract::ViterbiStateEntry
Definition: lm_state.h:91
WERD_RES
Definition: pageres.h:160
stopper.h
tesseract::LMPainPoints::RemapForSplit
void RemapForSplit(int index)
Definition: lm_pain_points.cpp:211
tesseract::LMPainPoints::~LMPainPoints
~LMPainPoints()
Definition: lm_pain_points.h:72
tesseract::LMPainPoints::LMPainPoints
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
Definition: lm_pain_points.h:69
tesseract::LMPainPoints::kLooseMaxCharWhRatio
static const float kLooseMaxCharWhRatio
Definition: lm_pain_points.h:63
tesseract::LMPainPoints::GenerateFromPath
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
Definition: lm_pain_points.cpp:70
tesseract::LMPainPoints::PainPointDescription
static const char * PainPointDescription(LMPainPointsType type)
Definition: lm_pain_points.h:65
tesseract::LMPainPoints::GenerateFromAmbigs
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
Definition: lm_pain_points.cpp:132
tesseract::LM_PPTYPE_NUM
Definition: lm_pain_points.h:46
tesseract::LMPainPoints::Deque
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
Definition: lm_pain_points.cpp:39
tesseract::LM_PPTYPE_PATH
Definition: lm_pain_points.h:43
tesseract::LMPainPoints::HasPainPoints
bool HasPainPoints(LMPainPointsType pp_type) const
Definition: lm_pain_points.h:75
matrix.h
tesseract::LMPainPoints::GenerateInitial
void GenerateInitial(WERD_RES *word_res)
Definition: lm_pain_points.cpp:50
tesseract::LMPainPoints::GeneratePainPoint
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
Definition: lm_pain_points.cpp:148
tesseract::LMPainPoints
Definition: lm_pain_points.h:56
tesseract::LM_PPTYPE_BLAMER
Definition: lm_pain_points.h:41
tesseract
Definition: baseapi.h:65
tesseract::LMPainPoints::kDefaultPainPointPriorityAdjustment
static const float kDefaultPainPointPriorityAdjustment
Definition: lm_pain_points.h:59
tesseract::LM_PPTYPE_SHAPE
Definition: lm_pain_points.h:44
tesseract::LMPainPoints::Clear
void Clear()
Definition: lm_pain_points.h:85
GenericVector< DANGERR_INFO >
tesseract::Dict
Definition: dict.h:91
MATRIX_COORD
Definition: matrix.h:604
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::LM_PPTYPE_AMBIG
Definition: lm_pain_points.h:42
tesseract::LMPainPointsType
LMPainPointsType
Definition: lm_pain_points.h:40
tesseract::GenericHeap::empty
bool empty() const
Definition: genericheap.h:68
genericheap.h