tesseract  4.0.0-1-g2a2b
pieces.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: pieces.cpp (Formerly pieces.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "blobs.h"
30 #include "helpers.h"
31 #include "matrix.h"
32 #include "ratngs.h"
33 #include "seam.h"
34 #include "wordrec.h"
35 
36 // Include automatically generated configuration file if running autoconf.
37 #ifdef HAVE_CONFIG_H
38 #include "config_auto.h"
39 #endif
40 
42 
43 /*----------------------------------------------------------------------
44  F u n c t i o n s
45 ----------------------------------------------------------------------*/
46 
47 /**********************************************************************
48  * classify_piece
49  *
50  * Create a larger piece from a collection of smaller ones. Classify
51  * it and return the results. Take the large piece apart to leave
52  * the collection of small pieces un modified.
53  **********************************************************************/
54 namespace tesseract {
55 BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
56  int16_t start,
57  int16_t end,
58  const char* description,
59  TWERD *word,
60  BlamerBundle *blamer_bundle) {
61  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
62  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
63  White, blamer_bundle);
64  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
65  BLOB_CHOICE_IT bc_it(choices);
66  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
67  bc_it.data()->set_matrix_cell(start, end);
68  }
69 
70  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
71 
72  return (choices);
73 }
74 
75 template<class BLOB_CHOICE>
76 int SortByUnicharID(const void *void1, const void *void2) {
77  const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1);
78  const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2);
79 
80  return p1->unichar_id() - p2->unichar_id();
81 }
82 
83 template<class BLOB_CHOICE>
84 int SortByRating(const void *void1, const void *void2) {
85  const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1);
86  const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2);
87 
88  if (p1->rating() < p2->rating())
89  return 1;
90  return -1;
91 }
92 
93 
94 /**********************************************************************
95  * fill_filtered_fragment_list
96  *
97  * Filter the fragment list so that the filtered_choices only contain
98  * fragments that are in the correct position. choices is the list
99  * that we are going to filter. fragment_pos is the position in the
100  * fragment that we are looking for and num_frag_parts is the the
101  * total number of pieces. The result will be appended to
102  * filtered_choices.
103  **********************************************************************/
104 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
105  int fragment_pos,
106  int num_frag_parts,
107  BLOB_CHOICE_LIST *filtered_choices) {
108  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
109  BLOB_CHOICE_IT choices_it(choices);
110 
111  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
112  choices_it.forward()) {
113  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
114  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
115 
116  if (frag != nullptr && frag->get_pos() == fragment_pos &&
117  frag->get_total() == num_frag_parts) {
118  // Recover the unichar_id of the unichar that this fragment is
119  // a part of
120  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
121  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
122  b->set_unichar_id(original_unichar);
123  filtered_choices_it.add_to_end(b);
124  }
125  }
126 
127  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
128 }
129 
130 
131 /**********************************************************************
132  * merge_and_put_fragment_lists
133  *
134  * Merge the fragment lists in choice_lists and append it to the
135  * ratings matrix.
136  **********************************************************************/
137 void Wordrec::merge_and_put_fragment_lists(int16_t row, int16_t column,
138  int16_t num_frag_parts,
139  BLOB_CHOICE_LIST *choice_lists,
140  MATRIX *ratings) {
141  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
142 
143  for (int i = 0; i < num_frag_parts; i++) {
144  choice_lists_it[i].set_to_list(&choice_lists[i]);
145  choice_lists_it[i].mark_cycle_pt();
146  }
147 
148  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
149  if (merged_choice == nullptr)
150  merged_choice = new BLOB_CHOICE_LIST;
151 
152  bool end_of_list = false;
153  BLOB_CHOICE_IT merged_choice_it(merged_choice);
154  while (!end_of_list) {
155  // Find the maximum unichar_id of the current entry the iterators
156  // are pointing at
157  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
158  for (int i = 0; i < num_frag_parts; i++) {
159  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
160  if (max_unichar_id < unichar_id) {
161  max_unichar_id = unichar_id;
162  }
163  }
164 
165  // Move the each iterators until it gets to an entry that has a
166  // value greater than or equal to max_unichar_id
167  for (int i = 0; i < num_frag_parts; i++) {
168  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
169  while (!choice_lists_it[i].cycled_list() &&
170  unichar_id < max_unichar_id) {
171  choice_lists_it[i].forward();
172  unichar_id = choice_lists_it[i].data()->unichar_id();
173  }
174  if (choice_lists_it[i].cycled_list()) {
175  end_of_list = true;
176  break;
177  }
178  }
179 
180  if (end_of_list)
181  break;
182 
183  // Checks if the fragments are parts of the same character
184  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
185  bool same_unichar = true;
186  for (int i = 1; i < num_frag_parts; i++) {
187  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
188  if (unichar_id != first_unichar_id) {
189  same_unichar = false;
190  break;
191  }
192  }
193 
194  if (same_unichar) {
195  // Add the merged character to the result
196  UNICHAR_ID merged_unichar_id = first_unichar_id;
197  GenericVector<ScoredFont> merged_fonts =
198  choice_lists_it[0].data()->fonts();
199  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
200  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
201  float positive_yshift = 0, negative_yshift = 0;
202  int merged_script_id = choice_lists_it[0].data()->script_id();
203  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
204 
205  float merged_rating = 0, merged_certainty = 0;
206  for (int i = 0; i < num_frag_parts; i++) {
207  float rating = choice_lists_it[i].data()->rating();
208  float certainty = choice_lists_it[i].data()->certainty();
209 
210  if (i == 0 || certainty < merged_certainty)
211  merged_certainty = certainty;
212  merged_rating += rating;
213 
214  choice_lists_it[i].forward();
215  if (choice_lists_it[i].cycled_list())
216  end_of_list = true;
217  IntersectRange(choice_lists_it[i].data()->min_xheight(),
218  choice_lists_it[i].data()->max_xheight(),
219  &merged_min_xheight, &merged_max_xheight);
220  float yshift = choice_lists_it[i].data()->yshift();
221  if (yshift > positive_yshift) positive_yshift = yshift;
222  if (yshift < negative_yshift) negative_yshift = yshift;
223  // Use the min font rating over the parts.
224  // TODO(rays) font lists are unsorted. Need to be faster?
225  const GenericVector<ScoredFont>& frag_fonts =
226  choice_lists_it[i].data()->fonts();
227  for (int f = 0; f < frag_fonts.size(); ++f) {
228  int merged_f = 0;
229  for (merged_f = 0; merged_f < merged_fonts.size() &&
230  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
231  ++merged_f) {}
232  if (merged_f == merged_fonts.size()) {
233  merged_fonts.push_back(frag_fonts[f]);
234  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
235  merged_fonts[merged_f].score = frag_fonts[f].score;
236  }
237  }
238  }
239 
240  float merged_yshift = positive_yshift != 0
241  ? (negative_yshift != 0 ? 0 : positive_yshift)
242  : negative_yshift;
243  BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
244  merged_rating,
245  merged_certainty,
246  merged_script_id,
247  merged_min_xheight,
248  merged_max_xheight,
249  merged_yshift,
250  classifier);
251  choice->set_fonts(merged_fonts);
252  merged_choice_it.add_to_end(choice);
253  }
254  }
255 
257  print_ratings_list("Merged Fragments", merged_choice,
258  unicharset);
259 
260  if (merged_choice->empty())
261  delete merged_choice;
262  else
263  ratings->put(row, column, merged_choice);
264 
265  delete [] choice_lists_it;
266 }
267 
268 /**********************************************************************
269  * get_fragment_lists
270  *
271  * Recursively go through the ratings matrix to find lists of fragments
272  * to be merged in the function merge_and_put_fragment_lists.
273  * current_frag is the position of the piece we are looking for.
274  * current_row is the row in the rating matrix we are currently at.
275  * start is the row we started initially, so that we can know where
276  * to append the results to the matrix. num_frag_parts is the total
277  * number of pieces we are looking for and num_blobs is the size of the
278  * ratings matrix.
279  **********************************************************************/
280 void Wordrec::get_fragment_lists(int16_t current_frag, int16_t current_row,
281  int16_t start, int16_t num_frag_parts,
282  int16_t num_blobs, MATRIX *ratings,
283  BLOB_CHOICE_LIST *choice_lists) {
284  if (current_frag == num_frag_parts) {
285  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
286  choice_lists, ratings);
287  return;
288  }
289 
290  for (int16_t x = current_row; x < num_blobs; x++) {
291  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
292  if (choices == nullptr)
293  continue;
294 
295  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
296  &choice_lists[current_frag]);
297  if (!choice_lists[current_frag].empty()) {
298  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
299  num_blobs, ratings, choice_lists);
300  choice_lists[current_frag].clear();
301  }
302  }
303 }
304 
305 
306 /**********************************************************************
307  * merge_fragments
308  *
309  * Try to merge fragments in the ratings matrix and put the result in
310  * the corresponding row and column
311  **********************************************************************/
312 void Wordrec::merge_fragments(MATRIX *ratings, int16_t num_blobs) {
313  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
314  for (int16_t start = 0; start < num_blobs; start++) {
315  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
316  frag_parts++) {
317  get_fragment_lists(0, start, start, frag_parts, num_blobs,
318  ratings, choice_lists);
319  }
320  }
321 
322  // Delete fragments from the rating matrix
323  for (int16_t x = 0; x < num_blobs; x++) {
324  for (int16_t y = x; y < num_blobs; y++) {
325  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
326  if (choices != nullptr) {
327  BLOB_CHOICE_IT choices_it(choices);
328  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
329  choices_it.forward()) {
330  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
331  const CHAR_FRAGMENT *frag =
332  unicharset.get_fragment(choice_unichar_id);
333  if (frag != nullptr)
334  delete choices_it.extract();
335  }
336  }
337  }
338  }
339 }
340 
341 
342 } // namespace tesseract
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
Definition: blobs.h:402
Definition: callcpp.h:31
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:95
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:137
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
int get_total() const
Definition: unicharset.h:73
const char * get_unichar() const
Definition: unicharset.h:71
int SortByRating(const void *void1, const void *void2)
Definition: pieces.cpp:84
int SortByUnicharID(const void *void1, const void *void2)
Definition: pieces.cpp:76
UNICHARSET unicharset
Definition: ccutil.h:68
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
int get_pos() const
Definition: unicharset.h:72
int push_back(T object)
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
float rating() const
Definition: ratngs.h:80
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:142
BlobChoiceClassifier
Definition: ratngs.h:41
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
void merge_fragments(MATRIX *ratings, int16_t num_blobs)
Definition: pieces.cpp:312
Definition: matrix.h:575
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:280
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:104
static const int kMaxChunks
Definition: unicharset.h:56
T get(ICOORD pos) const
Definition: matrix.h:228