tesseract  5.0.0-alpha-619-ge9db
shapeclassifier.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapeclassifier.cpp
5 // Description: Base interface class for classifiers that return a
6 // shape index.
7 // Author: Ray Smith
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include "shapeclassifier.h"
28 #include "scrollview.h"
29 #include "shapetable.h"
30 #include "svmnode.h"
31 #include "trainingsample.h"
32 #include "tprintf.h"
33 
34 namespace tesseract {
35 
36 // Classifies the given [training] sample, writing to results.
37 // See shapeclassifier.h for a full description.
38 // Default implementation calls the ShapeRating version.
40  const TrainingSample& sample, Pix* page_pix, int debug,
41  UNICHAR_ID keep_this, GenericVector<UnicharRating>* results) {
42  results->truncate(0);
43  GenericVector<ShapeRating> shape_results;
44  int num_shape_results = ClassifySample(sample, page_pix, debug, keep_this,
45  &shape_results);
46  const ShapeTable* shapes = GetShapeTable();
47  GenericVector<int> unichar_map;
48  unichar_map.init_to_size(shapes->unicharset().size(), -1);
49  for (int r = 0; r < num_shape_results; ++r) {
50  shapes->AddShapeToResults(shape_results[r], &unichar_map, results);
51  }
52  return results->size();
53 }
54 
55 // Classifies the given [training] sample, writing to results.
56 // See shapeclassifier.h for a full description.
57 // Default implementation aborts.
59  int debug, int keep_this,
60  GenericVector<ShapeRating>* results) {
61  ASSERT_HOST("Must implement ClassifySample!" == nullptr);
62  return 0;
63 }
64 
65 // Returns the shape that contains unichar_id that has the best result.
66 // If result is not nullptr, it is set with the shape_id and rating.
67 // Does not need to be overridden if ClassifySample respects the keep_this
68 // rule.
70  Pix* page_pix, UNICHAR_ID unichar_id,
71  ShapeRating* result) {
73  const ShapeTable* shapes = GetShapeTable();
74  int num_results = ClassifySample(sample, page_pix, 0, unichar_id, &results);
75  for (int r = 0; r < num_results; ++r) {
76  if (shapes->GetShape(results[r].shape_id).ContainsUnichar(unichar_id)) {
77  if (result != nullptr)
78  *result = results[r];
79  return results[r].shape_id;
80  }
81  }
82  return -1;
83 }
84 
85 // Provides access to the UNICHARSET that this classifier works with.
86 // Only needs to be overridden if GetShapeTable() can return nullptr.
88  return GetShapeTable()->unicharset();
89 }
90 
91 // Visual debugger classifies the given sample, displays the results and
92 // solicits user input to display other classifications. Returns when
93 // the user has finished with debugging the sample.
94 // Probably doesn't need to be overridden if the subclass provides
95 // DisplayClassifyAs.
97  Pix* page_pix,
98  UNICHAR_ID unichar_id) {
99 #ifndef GRAPHICS_DISABLED
100  static ScrollView* terminator = nullptr;
101  if (terminator == nullptr) {
102  terminator = new ScrollView("XIT", 0, 0, 50, 50, 50, 50, true);
103  }
104  ScrollView* debug_win = CreateFeatureSpaceWindow("ClassifierDebug", 0, 0);
105  // Provide a right-click menu to choose the class.
106  auto* popup_menu = new SVMenuNode();
107  popup_menu->AddChild("Choose class to debug", 0, "x", "Class to debug");
108  popup_menu->BuildMenu(debug_win, false);
109  // Display the features in green.
110  const INT_FEATURE_STRUCT* features = sample.features();
111  uint32_t num_features = sample.num_features();
112  for (uint32_t f = 0; f < num_features; ++f) {
113  RenderIntFeature(debug_win, &features[f], ScrollView::GREEN);
114  }
115  debug_win->Update();
117  // Debug classification until the user quits.
118  const UNICHARSET& unicharset = GetUnicharset();
119  SVEvent* ev;
120  SVEventType ev_type;
121  do {
123  if (unichar_id >= 0) {
124  tprintf("Debugging class %d = %s\n",
125  unichar_id, unicharset.id_to_unichar(unichar_id));
126  UnicharClassifySample(sample, page_pix, 1, unichar_id, &results);
127  DisplayClassifyAs(sample, page_pix, unichar_id, 1, &windows);
128  } else {
129  tprintf("Invalid unichar_id: %d\n", unichar_id);
130  UnicharClassifySample(sample, page_pix, 1, -1, &results);
131  }
132  if (unichar_id >= 0) {
133  tprintf("Debugged class %d = %s\n",
134  unichar_id, unicharset.id_to_unichar(unichar_id));
135  }
136  tprintf("Right-click in ClassifierDebug window to choose debug class,");
137  tprintf(" Left-click or close window to quit...\n");
138  UNICHAR_ID old_unichar_id;
139  do {
140  old_unichar_id = unichar_id;
141  ev = debug_win->AwaitEvent(SVET_ANY);
142  ev_type = ev->type;
143  if (ev_type == SVET_POPUP) {
144  if (unicharset.contains_unichar(ev->parameter)) {
145  unichar_id = unicharset.unichar_to_id(ev->parameter);
146  } else {
147  tprintf("Char class '%s' not found in unicharset", ev->parameter);
148  }
149  }
150  delete ev;
151  } while (unichar_id == old_unichar_id &&
152  ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
153  } while (ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
154  delete debug_win;
155 #endif // GRAPHICS_DISABLED
156 }
157 
158 // Displays classification as the given shape_id. Creates as many windows
159 // as it feels fit, using index as a guide for placement. Adds any created
160 // windows to the windows output and returns a new index that may be used
161 // by any subsequent classifiers. Caller waits for the user to view and
162 // then destroys the windows by clearing the vector.
164  const TrainingSample& sample, Pix* page_pix,
165  UNICHAR_ID unichar_id, int index,
166  PointerVector<ScrollView>* windows) {
167  // Does nothing in the default implementation.
168  return index;
169 }
170 
171 // Prints debug information on the results.
173  const char* context, const GenericVector<UnicharRating>& results) const {
174  tprintf("%s\n", context);
175  for (int i = 0; i < results.size(); ++i) {
176  tprintf("%g: c_id=%d=%s", results[i].rating, results[i].unichar_id,
177  GetUnicharset().id_to_unichar(results[i].unichar_id));
178  if (!results[i].fonts.empty()) {
179  tprintf(" Font Vector:");
180  for (int f = 0; f < results[i].fonts.size(); ++f) {
181  tprintf(" %d", results[i].fonts[f].fontinfo_id);
182  }
183  }
184  tprintf("\n");
185  }
186 }
188  const char* context, const GenericVector<ShapeRating>& results) const {
189  tprintf("%s\n", context);
190  for (int i = 0; i < results.size(); ++i) {
191  tprintf("%g:", results[i].rating);
192  if (results[i].joined)
193  tprintf("[J]");
194  if (results[i].broken)
195  tprintf("[B]");
196  tprintf(" %s\n", GetShapeTable()->DebugStr(results[i].shape_id).c_str());
197  }
198 }
199 
200 // Removes any result that has all its unichars covered by a better choice,
201 // regardless of font.
203  GenericVector<ShapeRating>* results) const {
204  GenericVector<ShapeRating> filtered_results;
205  // Copy results to filtered results and knock out duplicate unichars.
206  const ShapeTable* shapes = GetShapeTable();
207  for (int r = 0; r < results->size(); ++r) {
208  if (r > 0) {
209  const Shape& shape_r = shapes->GetShape((*results)[r].shape_id);
210  int c;
211  for (c = 0; c < shape_r.size(); ++c) {
212  int unichar_id = shape_r[c].unichar_id;
213  int s;
214  for (s = 0; s < r; ++s) {
215  const Shape& shape_s = shapes->GetShape((*results)[s].shape_id);
216  if (shape_s.ContainsUnichar(unichar_id))
217  break; // We found unichar_id.
218  }
219  if (s == r)
220  break; // We didn't find unichar_id.
221  }
222  if (c == shape_r.size())
223  continue; // We found all the unichar ids in previous answers.
224  }
225  filtered_results.push_back((*results)[r]);
226  }
227  *results = filtered_results;
228 }
229 
230 } // namespace tesseract.
tesseract::Shape::ContainsUnichar
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
tesseract::ShapeClassifier::DebugDisplay
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
Definition: shapeclassifier.cpp:96
ScrollView
Definition: scrollview.h:97
SVET_DESTROY
Definition: scrollview.h:45
SVEventType
SVEventType
Definition: scrollview.h:44
SVET_CLICK
Definition: scrollview.h:47
tesseract::Shape
Definition: shapetable.h:184
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
SVET_POPUP
Definition: scrollview.h:53
tesseract::ShapeClassifier::UnicharPrintResults
virtual void UnicharPrintResults(const char *context, const GenericVector< UnicharRating > &results) const
Definition: shapeclassifier.cpp:172
tesseract::PointerVector
Definition: genericvector.h:417
tesseract::ShapeClassifier::GetShapeTable
virtual const ShapeTable * GetShapeTable() const =0
tesseract::ShapeClassifier::UnicharClassifySample
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
Definition: shapeclassifier.cpp:39
genericvector.h
tesseract::ShapeClassifier::DisplayClassifyAs
virtual int DisplayClassifyAs(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id, int index, PointerVector< ScrollView > *windows)
Definition: shapeclassifier.cpp:163
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
svmnode.h
tesseract::ShapeTable::unicharset
const UNICHARSET & unicharset() const
Definition: shapetable.h:277
SVEvent::parameter
char * parameter
Definition: scrollview.h:65
tesseract::ShapeClassifier::BestShapeForUnichar
virtual int BestShapeForUnichar(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id, ShapeRating *result)
Definition: shapeclassifier.cpp:69
tesseract::ShapeClassifier::FilterDuplicateUnichars
void FilterDuplicateUnichars(GenericVector< ShapeRating > *results) const
Definition: shapeclassifier.cpp:202
tesseract::ShapeClassifier::PrintResults
virtual void PrintResults(const char *context, const GenericVector< ShapeRating > &results) const
Definition: shapeclassifier.cpp:187
trainingsample.h
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
shapetable.h
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
tesseract::ShapeTable::GetShape
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
tesseract::ShapeRating
Definition: shapetable.h:92
CreateFeatureSpaceWindow
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1764
tesseract
Definition: baseapi.h:65
tesseract::ShapeClassifier::ClassifySample
virtual int ClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< ShapeRating > *results)
Definition: shapeclassifier.cpp:58
SVEvent::type
SVEventType type
Definition: scrollview.h:63
tprintf.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
sample
Definition: cluster.h:31
GenericVector< UnicharRating >
SVET_ANY
Definition: scrollview.h:55
shapeclassifier.h
ScrollView::AwaitEvent
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
INT_FEATURE_STRUCT
Definition: intproto.h:131
SVMenuNode
Definition: svmnode.h:35
tesseract::TrainingSample
Definition: trainingsample.h:53
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
SVEvent
Definition: scrollview.h:60
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
ScrollView::GREEN
Definition: scrollview.h:106
tesseract::ShapeClassifier::GetUnicharset
virtual const UNICHARSET & GetUnicharset() const
Definition: shapeclassifier.cpp:87
tesseract::Shape::size
int size() const
Definition: shapetable.h:199
tesseract::ShapeTable::AddShapeToResults
void AddShapeToResults(const ShapeRating &shape_rating, GenericVector< int > *unichar_map, GenericVector< UnicharRating > *results) const
Definition: shapetable.cpp:687
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ShapeTable
Definition: shapetable.h:261
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71
scrollview.h
UNICHARSET::size
int size() const
Definition: unicharset.h:341
RenderIntFeature
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1603