All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
pieces.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: pieces.c (Formerly pieces.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "blobs.h"
30 #include "freelist.h"
31 #include "helpers.h"
32 #include "matrix.h"
33 #include "ndminx.h"
34 #include "ratngs.h"
35 #include "seam.h"
36 #include "wordrec.h"
37 
38 // Include automatically generated configuration file if running autoconf.
39 #ifdef HAVE_CONFIG_H
40 #include "config_auto.h"
41 #endif
42 
44 
45 /*----------------------------------------------------------------------
46  F u n c t i o n s
47 ----------------------------------------------------------------------*/
48 
49 /**********************************************************************
50  * classify_piece
51  *
52  * Create a larger piece from a collection of smaller ones. Classify
53  * it and return the results. Take the large piece apart to leave
54  * the collection of small pieces un modified.
55  **********************************************************************/
56 namespace tesseract {
57 BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
58  inT16 start,
59  inT16 end,
60  const char* description,
61  TWERD *word,
62  BlamerBundle *blamer_bundle) {
63  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
64  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
65  White, blamer_bundle);
66  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
67  BLOB_CHOICE_IT bc_it(choices);
68  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
69  bc_it.data()->set_matrix_cell(start, end);
70  }
71 
72  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
73 
74  return (choices);
75 }
76 
77 template<class BLOB_CHOICE>
78 int SortByUnicharID(const void *void1, const void *void2) {
79  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
80  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
81 
82  return p1->unichar_id() - p2->unichar_id();
83 }
84 
85 template<class BLOB_CHOICE>
86 int SortByRating(const void *void1, const void *void2) {
87  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
88  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
89 
90  if (p1->rating() < p2->rating())
91  return 1;
92  return -1;
93 }
94 
95 
96 /**********************************************************************
97  * fill_filtered_fragment_list
98  *
99  * Filter the fragment list so that the filtered_choices only contain
100  * fragments that are in the correct position. choices is the list
101  * that we are going to filter. fragment_pos is the position in the
102  * fragment that we are looking for and num_frag_parts is the the
103  * total number of pieces. The result will be appended to
104  * filtered_choices.
105  **********************************************************************/
106 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
107  int fragment_pos,
108  int num_frag_parts,
109  BLOB_CHOICE_LIST *filtered_choices) {
110  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
111  BLOB_CHOICE_IT choices_it(choices);
112 
113  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
114  choices_it.forward()) {
115  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
116  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
117 
118  if (frag != NULL && frag->get_pos() == fragment_pos &&
119  frag->get_total() == num_frag_parts) {
120  // Recover the unichar_id of the unichar that this fragment is
121  // a part of
122  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
123  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
124  b->set_unichar_id(original_unichar);
125  filtered_choices_it.add_to_end(b);
126  }
127  }
128 
129  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
130 }
131 
132 
133 /**********************************************************************
134  * merge_and_put_fragment_lists
135  *
136  * Merge the fragment lists in choice_lists and append it to the
137  * ratings matrix.
138  **********************************************************************/
140  inT16 num_frag_parts,
141  BLOB_CHOICE_LIST *choice_lists,
142  MATRIX *ratings) {
143  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
144 
145  for (int i = 0; i < num_frag_parts; i++) {
146  choice_lists_it[i].set_to_list(&choice_lists[i]);
147  choice_lists_it[i].mark_cycle_pt();
148  }
149 
150  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
151  if (merged_choice == NULL)
152  merged_choice = new BLOB_CHOICE_LIST;
153 
154  bool end_of_list = false;
155  BLOB_CHOICE_IT merged_choice_it(merged_choice);
156  while (!end_of_list) {
157  // Find the maximum unichar_id of the current entry the iterators
158  // are pointing at
159  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
160  for (int i = 0; i < num_frag_parts; i++) {
161  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
162  if (max_unichar_id < unichar_id) {
163  max_unichar_id = unichar_id;
164  }
165  }
166 
167  // Move the each iterators until it gets to an entry that has a
168  // value greater than or equal to max_unichar_id
169  for (int i = 0; i < num_frag_parts; i++) {
170  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
171  while (!choice_lists_it[i].cycled_list() &&
172  unichar_id < max_unichar_id) {
173  choice_lists_it[i].forward();
174  unichar_id = choice_lists_it[i].data()->unichar_id();
175  }
176  if (choice_lists_it[i].cycled_list()) {
177  end_of_list = true;
178  break;
179  }
180  }
181 
182  if (end_of_list)
183  break;
184 
185  // Checks if the fragments are parts of the same character
186  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
187  bool same_unichar = true;
188  for (int i = 1; i < num_frag_parts; i++) {
189  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
190  if (unichar_id != first_unichar_id) {
191  same_unichar = false;
192  break;
193  }
194  }
195 
196  if (same_unichar) {
197  // Add the merged character to the result
198  UNICHAR_ID merged_unichar_id = first_unichar_id;
199  GenericVector<ScoredFont> merged_fonts =
200  choice_lists_it[0].data()->fonts();
201  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
202  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
203  float positive_yshift = 0, negative_yshift = 0;
204  int merged_script_id = choice_lists_it[0].data()->script_id();
205  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
206 
207  float merged_rating = 0, merged_certainty = 0;
208  for (int i = 0; i < num_frag_parts; i++) {
209  float rating = choice_lists_it[i].data()->rating();
210  float certainty = choice_lists_it[i].data()->certainty();
211 
212  if (i == 0 || certainty < merged_certainty)
213  merged_certainty = certainty;
214  merged_rating += rating;
215 
216  choice_lists_it[i].forward();
217  if (choice_lists_it[i].cycled_list())
218  end_of_list = true;
219  IntersectRange(choice_lists_it[i].data()->min_xheight(),
220  choice_lists_it[i].data()->max_xheight(),
221  &merged_min_xheight, &merged_max_xheight);
222  float yshift = choice_lists_it[i].data()->yshift();
223  if (yshift > positive_yshift) positive_yshift = yshift;
224  if (yshift < negative_yshift) negative_yshift = yshift;
225  // Use the min font rating over the parts.
226  // TODO(rays) font lists are unsorted. Need to be faster?
227  const GenericVector<ScoredFont>& frag_fonts =
228  choice_lists_it[i].data()->fonts();
229  for (int f = 0; f < frag_fonts.size(); ++f) {
230  int merged_f = 0;
231  for (merged_f = 0; merged_f < merged_fonts.size() &&
232  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
233  ++merged_f) {}
234  if (merged_f == merged_fonts.size()) {
235  merged_fonts.push_back(frag_fonts[f]);
236  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
237  merged_fonts[merged_f].score = frag_fonts[f].score;
238  }
239  }
240  }
241 
242  float merged_yshift = positive_yshift != 0
243  ? (negative_yshift != 0 ? 0 : positive_yshift)
244  : negative_yshift;
245  BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
246  merged_rating,
247  merged_certainty,
248  merged_script_id,
249  merged_min_xheight,
250  merged_max_xheight,
251  merged_yshift,
252  classifier);
253  choice->set_fonts(merged_fonts);
254  merged_choice_it.add_to_end(choice);
255  }
256  }
257 
259  print_ratings_list("Merged Fragments", merged_choice,
260  unicharset);
261 
262  if (merged_choice->empty())
263  delete merged_choice;
264  else
265  ratings->put(row, column, merged_choice);
266 
267  delete [] choice_lists_it;
268 }
269 
270 
271 /**********************************************************************
272  * get_fragment_lists
273  *
274  * Recursively go through the ratings matrix to find lists of fragments
275  * to be merged in the function merge_and_put_fragment_lists.
276  * current_frag is the postion of the piece we are looking for.
277  * current_row is the row in the rating matrix we are currently at.
278  * start is the row we started initially, so that we can know where
279  * to append the results to the matrix. num_frag_parts is the total
280  * number of pieces we are looking for and num_blobs is the size of the
281  * ratings matrix.
282  **********************************************************************/
283 void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row,
284  inT16 start, inT16 num_frag_parts,
285  inT16 num_blobs, MATRIX *ratings,
286  BLOB_CHOICE_LIST *choice_lists) {
287  if (current_frag == num_frag_parts) {
288  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
289  choice_lists, ratings);
290  return;
291  }
292 
293  for (inT16 x = current_row; x < num_blobs; x++) {
294  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
295  if (choices == NULL)
296  continue;
297 
298  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
299  &choice_lists[current_frag]);
300  if (!choice_lists[current_frag].empty()) {
301  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
302  num_blobs, ratings, choice_lists);
303  choice_lists[current_frag].clear();
304  }
305  }
306 }
307 
308 
309 /**********************************************************************
310  * merge_fragments
311  *
312  * Try to merge fragments in the ratings matrix and put the result in
313  * the corresponding row and column
314  **********************************************************************/
315 void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) {
316  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
317  for (inT16 start = 0; start < num_blobs; start++) {
318  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
319  frag_parts++) {
320  get_fragment_lists(0, start, start, frag_parts, num_blobs,
321  ratings, choice_lists);
322  }
323  }
324 
325  // Delete fragments from the rating matrix
326  for (inT16 x = 0; x < num_blobs; x++) {
327  for (inT16 y = x; y < num_blobs; y++) {
328  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
329  if (choices != NULL) {
330  BLOB_CHOICE_IT choices_it(choices);
331  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
332  choices_it.forward()) {
333  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
334  const CHAR_FRAGMENT *frag =
335  unicharset.get_fragment(choice_unichar_id);
336  if (frag != NULL)
337  delete choices_it.extract();
338  }
339  }
340  }
341  }
342 }
343 
344 
345 } // namespace tesseract
int size() const
Definition: genericvector.h:72
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int SortByUnicharID(const void *void1, const void *void2)
Definition: pieces.cpp:78
Definition: callcpp.h:34
int push_back(T object)
T get(int column, int row) const
Definition: matrix.h:171
UNICHARSET unicharset
Definition: ccutil.h:72
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:146
int get_total() const
Definition: unicharset.h:66
void put(int column, int row, const T &thing)
Definition: matrix.h:166
static const int kMaxChunks
Definition: unicharset.h:49
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:106
float rating() const
Definition: ratngs.h:79
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
int UNICHAR_ID
Definition: unichar.h:33
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:283
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:216
const char * get_unichar() const
Definition: unicharset.h:64
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
BlobChoiceClassifier
Definition: ratngs.h:40
void merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:139
void merge_fragments(MATRIX *ratings, inT16 num_blobs)
Definition: pieces.cpp:315
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:94
Definition: matrix.h:289
#define NULL
Definition: host.h:144
Definition: blobs.h:395
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:194
int SortByRating(const void *void1, const void *void2)
Definition: pieces.cpp:86
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int get_pos() const
Definition: unicharset.h:65
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:56
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
short inT16
Definition: host.h:100