tesseract  5.0.0-alpha-619-ge9db
shapetable.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.cpp
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Tue Nov 02 15:31:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #include "shapetable.h"
24 
25 #include "bitvector.h"
26 #include "fontinfo.h"
27 #include "intfeaturespace.h"
28 #include <tesseract/strngs.h>
29 #include "unicharset.h"
30 #include "unicity_table.h"
31 
32 #include <algorithm>
33 
34 namespace tesseract {
35 
36 // Helper function to get the index of the first result with the required
37 // unichar_id. If the results are sorted by rating, this will also be the
38 // best result with the required unichar_id.
39 // Returns -1 if the unichar_id is not found
41  const GenericVector<ShapeRating>& results,
42  const ShapeTable& shape_table,
43  UNICHAR_ID unichar_id) {
44  for (int r = 0; r < results.size(); ++r) {
45  const int shape_id = results[r].shape_id;
46  const Shape& shape = shape_table.GetShape(shape_id);
47  if (shape.ContainsUnichar(unichar_id)) {
48  return r;
49  }
50  }
51  return -1;
52 }
53 
54 // Helper function to get the index of the first result with the required
55 // unichar_id. If the results are sorted by rating, this will also be the
56 // best result with the required unichar_id.
57 // Returns -1 if the unichar_id is not found
59  const GenericVector<UnicharRating>& results,
60  UNICHAR_ID unichar_id) {
61  for (int r = 0; r < results.size(); ++r) {
62  if (results[r].unichar_id == unichar_id)
63  return r;
64  }
65  return -1;
66 }
67 
68 // Writes to the given file. Returns false in case of error.
69 bool UnicharAndFonts::Serialize(FILE* fp) const {
71 }
72 // Reads from the given file. Returns false in case of error.
73 
75  return fp->DeSerialize(&unichar_id) && font_ids.DeSerialize(fp);
76 }
77 
78 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
79 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
80  const auto* p1 = static_cast<const UnicharAndFonts*>(v1);
81  const auto* p2 = static_cast<const UnicharAndFonts*>(v2);
82  return p1->unichar_id - p2->unichar_id;
83 }
84 
85 // Writes to the given file. Returns false in case of error.
86 bool Shape::Serialize(FILE* fp) const {
87  uint8_t sorted = unichars_sorted_;
88  return tesseract::Serialize(fp, &sorted) && unichars_.SerializeClasses(fp);
89 }
90 // Reads from the given file. Returns false in case of error.
91 
93  uint8_t sorted;
94  if (!fp->DeSerialize(&sorted)) return false;
95  unichars_sorted_ = sorted != 0;
96  return unichars_.DeSerializeClasses(fp);
97 }
98 
99 // Adds a font_id for the given unichar_id. If the unichar_id is not
100 // in the shape, it is added.
101 void Shape::AddToShape(int unichar_id, int font_id) {
102  for (int c = 0; c < unichars_.size(); ++c) {
103  if (unichars_[c].unichar_id == unichar_id) {
104  // Found the unichar in the shape table.
105  GenericVector<int>& font_list = unichars_[c].font_ids;
106  for (int f = 0; f < font_list.size(); ++f) {
107  if (font_list[f] == font_id)
108  return; // Font is already there.
109  }
110  font_list.push_back(font_id);
111  return;
112  }
113  }
114  // Unichar_id is not in shape, so add it to shape.
115  unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
116  unichars_sorted_ = unichars_.size() <= 1;
117 }
118 
119 // Adds everything in other to this.
120 void Shape::AddShape(const Shape& other) {
121  for (int c = 0; c < other.unichars_.size(); ++c) {
122  for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
123  AddToShape(other.unichars_[c].unichar_id,
124  other.unichars_[c].font_ids[f]);
125  }
126  }
127  unichars_sorted_ = unichars_.size() <= 1;
128 }
129 
130 // Returns true if the shape contains the given unichar_id, font_id pair.
131 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
132  for (int c = 0; c < unichars_.size(); ++c) {
133  if (unichars_[c].unichar_id == unichar_id) {
134  // Found the unichar, so look for the font.
135  GenericVector<int>& font_list = unichars_[c].font_ids;
136  for (int f = 0; f < font_list.size(); ++f) {
137  if (font_list[f] == font_id)
138  return true;
139  }
140  return false;
141  }
142  }
143  return false;
144 }
145 
146 // Returns true if the shape contains the given unichar_id, ignoring font.
147 bool Shape::ContainsUnichar(int unichar_id) const {
148  for (int c = 0; c < unichars_.size(); ++c) {
149  if (unichars_[c].unichar_id == unichar_id) {
150  return true;
151  }
152  }
153  return false;
154 }
155 
156 // Returns true if the shape contains the given font, ignoring unichar_id.
157 bool Shape::ContainsFont(int font_id) const {
158  for (int c = 0; c < unichars_.size(); ++c) {
159  GenericVector<int>& font_list = unichars_[c].font_ids;
160  for (int f = 0; f < font_list.size(); ++f) {
161  if (font_list[f] == font_id)
162  return true;
163  }
164  }
165  return false;
166 }
167 // Returns true if the shape contains the given font properties, ignoring
168 // unichar_id.
170  uint32_t properties) const {
171  for (int c = 0; c < unichars_.size(); ++c) {
172  GenericVector<int>& font_list = unichars_[c].font_ids;
173  for (int f = 0; f < font_list.size(); ++f) {
174  if (font_table.get(font_list[f]).properties == properties)
175  return true;
176  }
177  }
178  return false;
179 }
180 // Returns true if the shape contains multiple different font properties,
181 // ignoring unichar_id.
183  const FontInfoTable& font_table) const {
184  uint32_t properties = font_table.get(unichars_[0].font_ids[0]).properties;
185  for (int c = 0; c < unichars_.size(); ++c) {
186  GenericVector<int>& font_list = unichars_[c].font_ids;
187  for (int f = 0; f < font_list.size(); ++f) {
188  if (font_table.get(font_list[f]).properties != properties)
189  return true;
190  }
191  }
192  return false;
193 }
194 
195 // Returns true if this shape is equal to other (ignoring order of unichars
196 // and fonts).
197 bool Shape::operator==(const Shape& other) const {
198  return IsSubsetOf(other) && other.IsSubsetOf(*this);
199 }
200 
201 // Returns true if this is a subset (including equal) of other.
202 bool Shape::IsSubsetOf(const Shape& other) const {
203  for (int c = 0; c < unichars_.size(); ++c) {
204  int unichar_id = unichars_[c].unichar_id;
205  const GenericVector<int>& font_list = unichars_[c].font_ids;
206  for (int f = 0; f < font_list.size(); ++f) {
207  if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
208  return false;
209  }
210  }
211  return true;
212 }
213 
214 // Returns true if the lists of unichar ids are the same in this and other,
215 // ignoring fonts.
216 // NOT const, as it will sort the unichars on demand.
218  if (unichars_.size() != other->unichars_.size()) return false;
219  if (!unichars_sorted_) SortUnichars();
220  if (!other->unichars_sorted_) other->SortUnichars();
221  for (int c = 0; c < unichars_.size(); ++c) {
222  if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
223  return false;
224  }
225  return true;
226 }
227 
228 // Sorts the unichars_ vector by unichar.
229 void Shape::SortUnichars() {
230  unichars_.sort(UnicharAndFonts::SortByUnicharId);
231  unichars_sorted_ = true;
232 }
233 
234 ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {
235 }
237  : unicharset_(&unicharset), num_fonts_(0) {
238 }
239 
240 // Writes to the given file. Returns false in case of error.
241 bool ShapeTable::Serialize(FILE* fp) const {
242  return shape_table_.Serialize(fp);
243 }
244 // Reads from the given file. Returns false in case of error.
245 
247  if (!shape_table_.DeSerialize(fp)) return false;
248  num_fonts_ = 0;
249  return true;
250 }
251 
252 // Returns the number of fonts used in this ShapeTable, computing it if
253 // necessary.
254 int ShapeTable::NumFonts() const {
255  if (num_fonts_ <= 0) {
256  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
257  const Shape& shape = *shape_table_[shape_id];
258  for (int c = 0; c < shape.size(); ++c) {
259  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
260  if (shape[c].font_ids[f] >= num_fonts_)
261  num_fonts_ = shape[c].font_ids[f] + 1;
262  }
263  }
264  }
265  }
266  return num_fonts_;
267 }
268 
269 // Re-indexes the class_ids in the shapetable according to the given map.
270 // Useful in conjunction with set_unicharset.
271 void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
272  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
273  Shape* shape = shape_table_[shape_id];
274  for (int c = 0; c < shape->size(); ++c) {
275  shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
276  }
277  }
278 }
279 
280 // Returns a string listing the classes/fonts in a shape.
281 STRING ShapeTable::DebugStr(int shape_id) const {
282  if (shape_id < 0 || shape_id >= shape_table_.size())
283  return STRING("INVALID_UNICHAR_ID");
284  const Shape& shape = GetShape(shape_id);
285  STRING result;
286  result.add_str_int("Shape", shape_id);
287  if (shape.size() > 100) {
288  result.add_str_int(" Num unichars=", shape.size());
289  return result;
290  }
291  for (int c = 0; c < shape.size(); ++c) {
292  result.add_str_int(" c_id=", shape[c].unichar_id);
293  result += "=";
294  result += unicharset_->id_to_unichar(shape[c].unichar_id);
295  if (shape.size() < 10) {
296  result.add_str_int(", ", shape[c].font_ids.size());
297  result += " fonts =";
298  int num_fonts = shape[c].font_ids.size();
299  if (num_fonts > 10) {
300  result.add_str_int(" ", shape[c].font_ids[0]);
301  result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
302  } else {
303  for (int f = 0; f < num_fonts; ++f) {
304  result.add_str_int(" ", shape[c].font_ids[f]);
305  }
306  }
307  }
308  }
309  return result;
310 }
311 
312 // Returns a debug string summarizing the table.
314  int max_unichars = 0;
315  int num_multi_shapes = 0;
316  int num_master_shapes = 0;
317  for (int s = 0; s < shape_table_.size(); ++s) {
318  if (MasterDestinationIndex(s) != s) continue;
319  ++num_master_shapes;
320  int shape_size = GetShape(s).size();
321  if (shape_size > 1)
322  ++num_multi_shapes;
323  if (shape_size > max_unichars)
324  max_unichars = shape_size;
325  }
326  STRING result;
327  result.add_str_int("Number of shapes = ", num_master_shapes);
328  result.add_str_int(" max unichars = ", max_unichars);
329  result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
330  return result;
331 }
332 
333 
334 // Adds a new shape starting with the given unichar_id and font_id.
335 // Returns the assigned index.
336 int ShapeTable::AddShape(int unichar_id, int font_id) {
337  int index = shape_table_.size();
338  auto* shape = new Shape;
339  shape->AddToShape(unichar_id, font_id);
340  shape_table_.push_back(shape);
341  num_fonts_ = std::max(num_fonts_, font_id + 1);
342  return index;
343 }
344 
345 // Adds a copy of the given shape unless it is already present.
346 // Returns the assigned index or index of existing shape if already present.
347 int ShapeTable::AddShape(const Shape& other) {
348  int index;
349  for (index = 0; index < shape_table_.size() &&
350  !(other == *shape_table_[index]); ++index)
351  continue;
352  if (index == shape_table_.size()) {
353  auto* shape = new Shape(other);
354  shape_table_.push_back(shape);
355  }
356  num_fonts_ = 0;
357  return index;
358 }
359 
360 // Removes the shape given by the shape index.
361 void ShapeTable::DeleteShape(int shape_id) {
362  delete shape_table_[shape_id];
363  shape_table_[shape_id] = nullptr;
364  shape_table_.remove(shape_id);
365 }
366 
367 // Adds a font_id to the given existing shape index for the given
368 // unichar_id. If the unichar_id is not in the shape, it is added.
369 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
370  Shape& shape = *shape_table_[shape_id];
371  shape.AddToShape(unichar_id, font_id);
372  num_fonts_ = std::max(num_fonts_, font_id + 1);
373 }
374 
375 // Adds the given shape to the existing shape with the given index.
376 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
377  Shape& shape = *shape_table_[shape_id];
378  shape.AddShape(other);
379  num_fonts_ = 0;
380 }
381 
382 // Returns the id of the shape that contains the given unichar and font.
383 // If not found, returns -1.
384 // If font_id < 0, the font_id is ignored and the first shape that matches
385 // the unichar_id is returned.
386 int ShapeTable::FindShape(int unichar_id, int font_id) const {
387  for (int s = 0; s < shape_table_.size(); ++s) {
388  const Shape& shape = GetShape(s);
389  for (int c = 0; c < shape.size(); ++c) {
390  if (shape[c].unichar_id == unichar_id) {
391  if (font_id < 0)
392  return s; // We don't care about the font.
393  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
394  if (shape[c].font_ids[f] == font_id)
395  return s;
396  }
397  }
398  }
399  }
400  return -1;
401 }
402 
403 // Returns the first unichar_id and font_id in the given shape.
405  int* unichar_id, int* font_id) const {
406  const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
407  *unichar_id = unichar_and_fonts.unichar_id;
408  *font_id = unichar_and_fonts.font_ids[0];
409 }
410 
411 // Expands all the classes/fonts in the shape individually to build
412 // a ShapeTable.
414  const ShapeTable& master_shapes) {
415  BitVector shape_map(master_shapes.NumShapes());
416  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
417  for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
418  int c = shape[u_ind].unichar_id;
419  int f = shape[u_ind].font_ids[f_ind];
420  int master_id = master_shapes.FindShape(c, f);
421  if (master_id >= 0) {
422  shape_map.SetBit(master_id);
423  } else if (FindShape(c, f) < 0) {
424  AddShape(c, f);
425  }
426  }
427  }
428  int num_masters = 0;
429  for (int s = 0; s < master_shapes.NumShapes(); ++s) {
430  if (shape_map[s]) {
431  AddShape(master_shapes.GetShape(s));
432  ++num_masters;
433  }
434  }
435  return num_masters;
436 }
437 
438 // Returns true if the shapes are already merged.
439 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
440  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
441 }
442 
443 // Returns true if any shape contains multiple unichars.
445  int num_shapes = NumShapes();
446  for (int s1 = 0; s1 < num_shapes; ++s1) {
447  if (MasterDestinationIndex(s1) != s1) continue;
448  if (GetShape(s1).size() > 1)
449  return true;
450  }
451  return false;
452 }
453 
454 // Returns the maximum number of unichars over all shapes.
456  int max_num_unichars = 0;
457  int num_shapes = NumShapes();
458  for (int s = 0; s < num_shapes; ++s) {
459  if (GetShape(s).size() > max_num_unichars)
460  max_num_unichars = GetShape(s).size();
461  }
462  return max_num_unichars;
463 }
464 
465 
466 // Merges shapes with a common unichar over the [start, end) interval.
467 // Assumes single unichar per shape.
468 void ShapeTable::ForceFontMerges(int start, int end) {
469  for (int s1 = start; s1 < end; ++s1) {
470  if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
471  int unichar_id = GetShape(s1)[0].unichar_id;
472  for (int s2 = s1 + 1; s2 < end; ++s2) {
473  if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
474  unichar_id == GetShape(s2)[0].unichar_id) {
475  MergeShapes(s1, s2);
476  }
477  }
478  }
479  }
480  ShapeTable compacted(*unicharset_);
481  compacted.AppendMasterShapes(*this, nullptr);
482  *this = compacted;
483 }
484 
485 // Returns the number of unichars in the master shape.
486 int ShapeTable::MasterUnicharCount(int shape_id) const {
487  int master_id = MasterDestinationIndex(shape_id);
488  return GetShape(master_id).size();
489 }
490 
491 // Returns the sum of the font counts in the master shape.
492 int ShapeTable::MasterFontCount(int shape_id) const {
493  int master_id = MasterDestinationIndex(shape_id);
494  const Shape& shape = GetShape(master_id);
495  int font_count = 0;
496  for (int c = 0; c < shape.size(); ++c) {
497  font_count += shape[c].font_ids.size();
498  }
499  return font_count;
500 }
501 
502 // Returns the number of unichars that would result from merging the shapes.
503 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
504  // Do it the easy way for now.
505  int master_id1 = MasterDestinationIndex(shape_id1);
506  int master_id2 = MasterDestinationIndex(shape_id2);
507  Shape combined_shape(*shape_table_[master_id1]);
508  combined_shape.AddShape(*shape_table_[master_id2]);
509  return combined_shape.size();
510 }
511 
512 // Merges two shape_ids, leaving shape_id2 marked as merged.
513 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
514  int master_id1 = MasterDestinationIndex(shape_id1);
515  int master_id2 = MasterDestinationIndex(shape_id2);
516  // Point master_id2 (and all merged shapes) to master_id1.
517  shape_table_[master_id2]->set_destination_index(master_id1);
518  // Add all the shapes of master_id2 to master_id1.
519  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
520 }
521 
522 // Swaps two shape_ids.
523 void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
524  Shape* tmp = shape_table_[shape_id1];
525  shape_table_[shape_id1] = shape_table_[shape_id2];
526  shape_table_[shape_id2] = tmp;
527 }
528 
529 // Returns the destination of this shape, (if merged), taking into account
530 // the fact that the destination may itself have been merged.
531 int ShapeTable::MasterDestinationIndex(int shape_id) const {
532  int dest_id = shape_table_[shape_id]->destination_index();
533  if (dest_id == shape_id || dest_id < 0)
534  return shape_id; // Is master already.
535  int master_id = shape_table_[dest_id]->destination_index();
536  if (master_id == dest_id || master_id < 0)
537  return dest_id; // Dest is the master and shape_id points to it.
538  master_id = MasterDestinationIndex(master_id);
539  return master_id;
540 }
541 
542 // Returns false if the unichars in neither shape is a subset of the other.
543 bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
544  const Shape& shape1 = GetShape(shape_id1);
545  const Shape& shape2 = GetShape(shape_id2);
546  int c1, c2;
547  for (c1 = 0; c1 < shape1.size(); ++c1) {
548  int unichar_id1 = shape1[c1].unichar_id;
549  if (!shape2.ContainsUnichar(unichar_id1))
550  break;
551  }
552  for (c2 = 0; c2 < shape2.size(); ++c2) {
553  int unichar_id2 = shape2[c2].unichar_id;
554  if (!shape1.ContainsUnichar(unichar_id2))
555  break;
556  }
557  return c1 == shape1.size() || c2 == shape2.size();
558 }
559 
560 // Returns false if the unichars in neither shape is a subset of the other.
561 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
562  int shape_id) const {
563  const Shape& merge1 = GetShape(merge_id1);
564  const Shape& merge2 = GetShape(merge_id2);
565  const Shape& shape = GetShape(shape_id);
566  int cm1, cm2, cs;
567  for (cs = 0; cs < shape.size(); ++cs) {
568  int unichar_id = shape[cs].unichar_id;
569  if (!merge1.ContainsUnichar(unichar_id) &&
570  !merge2.ContainsUnichar(unichar_id))
571  break; // Shape is not a subset of the merge.
572  }
573  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
574  int unichar_id1 = merge1[cm1].unichar_id;
575  if (!shape.ContainsUnichar(unichar_id1))
576  break; // Merge is not a subset of shape
577  }
578  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
579  int unichar_id2 = merge2[cm2].unichar_id;
580  if (!shape.ContainsUnichar(unichar_id2))
581  break; // Merge is not a subset of shape
582  }
583  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
584 }
585 
586 // Returns true if the unichar sets are equal between the shapes.
587 bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
588  const Shape& shape1 = GetShape(shape_id1);
589  const Shape& shape2 = GetShape(shape_id2);
590  for (int c1 = 0; c1 < shape1.size(); ++c1) {
591  int unichar_id1 = shape1[c1].unichar_id;
592  if (!shape2.ContainsUnichar(unichar_id1))
593  return false;
594  }
595  for (int c2 = 0; c2 < shape2.size(); ++c2) {
596  int unichar_id2 = shape2[c2].unichar_id;
597  if (!shape1.ContainsUnichar(unichar_id2))
598  return false;
599  }
600  return true;
601 }
602 
603 // Returns true if the unichar sets are equal between the shapes.
604 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
605  int shape_id) const {
606  const Shape& merge1 = GetShape(merge_id1);
607  const Shape& merge2 = GetShape(merge_id2);
608  const Shape& shape = GetShape(shape_id);
609  for (int cs = 0; cs < shape.size(); ++cs) {
610  int unichar_id = shape[cs].unichar_id;
611  if (!merge1.ContainsUnichar(unichar_id) &&
612  !merge2.ContainsUnichar(unichar_id))
613  return false; // Shape has a unichar that appears in neither merge.
614  }
615  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
616  int unichar_id1 = merge1[cm1].unichar_id;
617  if (!shape.ContainsUnichar(unichar_id1))
618  return false; // Merge has a unichar that is not in shape.
619  }
620  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
621  int unichar_id2 = merge2[cm2].unichar_id;
622  if (!shape.ContainsUnichar(unichar_id2))
623  return false; // Merge has a unichar that is not in shape.
624  }
625  return true;
626 }
627 
628 // Returns true if there is a common unichar between the shapes.
629 bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
630  const Shape& shape1 = GetShape(shape_id1);
631  const Shape& shape2 = GetShape(shape_id2);
632  for (int c1 = 0; c1 < shape1.size(); ++c1) {
633  int unichar_id1 = shape1[c1].unichar_id;
634  if (shape2.ContainsUnichar(unichar_id1))
635  return true;
636  }
637  return false;
638 }
639 
640 // Returns true if there is a common font id between the shapes.
641 bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
642  const Shape& shape1 = GetShape(shape_id1);
643  const Shape& shape2 = GetShape(shape_id2);
644  for (int c1 = 0; c1 < shape1.size(); ++c1) {
645  const GenericVector<int>& font_list1 = shape1[c1].font_ids;
646  for (int f = 0; f < font_list1.size(); ++f) {
647  if (shape2.ContainsFont(font_list1[f]))
648  return true;
649  }
650  }
651  return false;
652 }
653 
654 // Appends the master shapes from other to this.
655 // If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
657  GenericVector<int>* shape_map) {
658  if (shape_map != nullptr)
659  shape_map->init_to_size(other.NumShapes(), -1);
660  for (int s = 0; s < other.shape_table_.size(); ++s) {
661  if (other.shape_table_[s]->destination_index() < 0) {
662  int index = AddShape(*other.shape_table_[s]);
663  if (shape_map != nullptr)
664  (*shape_map)[s] = index;
665  }
666  }
667 }
668 
669 // Returns the number of master shapes remaining after merging.
671  int num_shapes = 0;
672  for (int s = 0; s < shape_table_.size(); ++s) {
673  if (shape_table_[s]->destination_index() < 0)
674  ++num_shapes;
675  }
676  return num_shapes;
677 }
678 
679 
680 // Adds the unichars of the given shape_id to the vector of results. Any
681 // unichar_id that is already present just has the fonts added to the
682 // font set for that result without adding a new entry in the vector.
683 // NOTE: it is assumed that the results are given to this function in order
684 // of decreasing rating.
685 // The unichar_map vector indicates the index of the results entry containing
686 // each unichar, or -1 if the unichar is not yet included in results.
688  GenericVector<int>* unichar_map,
689  GenericVector<UnicharRating>* results)const {
690  if (shape_rating.joined) {
691  AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map,
692  results);
693  }
694  if (shape_rating.broken) {
695  AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map,
696  results);
697  }
698  const Shape& shape = GetShape(shape_rating.shape_id);
699  for (int u = 0; u < shape.size(); ++u) {
700  int result_index = AddUnicharToResults(shape[u].unichar_id,
701  shape_rating.rating,
702  unichar_map, results);
703  for (int f = 0; f < shape[u].font_ids.size(); ++f) {
704  (*results)[result_index].fonts.push_back(
705  ScoredFont(shape[u].font_ids[f],
706  IntCastRounded(shape_rating.rating * INT16_MAX)));
707  }
708  }
709 }
710 
711 // Adds the given unichar_id to the results if needed, updating unichar_map
712 // and returning the index of unichar in results.
713 int ShapeTable::AddUnicharToResults(
714  int unichar_id, float rating, GenericVector<int>* unichar_map,
715  GenericVector<UnicharRating>* results) const {
716  int result_index = unichar_map->get(unichar_id);
717  if (result_index < 0) {
718  UnicharRating result(unichar_id, rating);
719  result_index = results->push_back(result);
720  (*unichar_map)[unichar_id] = result_index;
721  }
722  return result_index;
723 }
724 
725 
726 } // namespace tesseract
tesseract::Shape::ContainsUnichar
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
tesseract::ShapeRating::FirstResultWithUnichar
static int FirstResultWithUnichar(const GenericVector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:40
tesseract::ShapeTable::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
strngs.h
tesseract::Shape::operator==
bool operator==(const Shape &other) const
Definition: shapetable.cpp:197
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
tesseract::ShapeTable::SwapShapes
void SwapShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:523
tesseract::Shape::ContainsFont
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:157
tesseract::ShapeTable::CommonFont
bool CommonFont(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:641
unicity_table.h
tesseract::Shape
Definition: shapetable.h:184
tesseract::Shape::ContainsMultipleFontProperties
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:182
tesseract::UnicharRating
Definition: shapetable.h:40
tesseract::UnicharRating::unichar_id
UNICHAR_ID unichar_id
Definition: shapetable.h:74
tesseract::ShapeTable::NumMasterShapes
int NumMasterShapes() const
Definition: shapetable.cpp:670
tesseract::ShapeRating::shape_id
int shape_id
Definition: shapetable.h:121
tesseract::ShapeTable::AppendMasterShapes
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:656
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
intfeaturespace.h
STRING
Definition: strngs.h:45
tesseract::ShapeTable::ForceFontMerges
void ForceFontMerges(int start, int end)
Definition: shapetable.cpp:468
tesseract::UnicharAndFonts::SortByUnicharId
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:79
tesseract::Shape::AddShape
void AddShape(const Shape &other)
Definition: shapetable.cpp:120
tesseract::ShapeTable::AddToShape
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:369
tesseract::UnicharAndFonts
Definition: shapetable.h:159
tesseract::Shape::ContainsUnicharAndFont
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:131
GenericVector::Serialize
bool Serialize(FILE *fp) const
Definition: genericvector.h:929
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
tesseract::ShapeRating::rating
float rating
Definition: shapetable.h:124
tesseract::Shape::DeSerialize
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:92
tesseract::ShapeTable::ShapeTable
ShapeTable()
Definition: shapetable.cpp:234
tesseract::ShapeTable::AlreadyMerged
bool AlreadyMerged(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:439
tesseract::FontInfoTable
Definition: fontinfo.h:146
tesseract::ShapeTable::DeSerialize
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
tesseract::ShapeTable::EqualUnichars
bool EqualUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:587
tesseract::ShapeTable::SubsetUnichar
bool SubsetUnichar(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:543
tesseract::ShapeTable::MergeEqualUnichars
bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:604
tesseract::ShapeTable::NumFonts
int NumFonts() const
Definition: shapetable.cpp:254
UNICHAR_BROKEN
Definition: unicharset.h:36
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::UnicharAndFonts::unichar_id
int32_t unichar_id
Definition: shapetable.h:175
tesseract::ShapeTable::GetFirstUnicharAndFont
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
tesseract::ShapeTable::MergeSubsetUnichar
bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:561
unicharset.h
tesseract::Shape::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:86
tesseract::TFile::DeSerialize
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:117
GenericVector::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: genericvector.h:954
tesseract::ShapeTable::CommonUnichars
bool CommonUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:629
shapetable.h
tesseract::ScoredFont
Definition: fontinfo.h:38
tesseract::TFile
Definition: serialis.h:75
UNICHARSET
Definition: unicharset.h:145
tesseract::Shape::SetUnicharId
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:208
tesseract::ShapeTable::DebugStr
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
tesseract::ShapeTable::GetShape
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
tesseract::ShapeRating
Definition: shapetable.h:92
tesseract::ShapeTable::ReMapClassIds
void ReMapClassIds(const GenericVector< int > &unicharset_map)
Definition: shapetable.cpp:271
tesseract
Definition: baseapi.h:65
fontinfo.h
tesseract::ShapeTable::AddShapeToShape
void AddShapeToShape(int shape_id, const Shape &other)
Definition: shapetable.cpp:376
tesseract::ShapeTable::MasterDestinationIndex
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:531
unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:167
bitvector.h
tesseract::Shape::IsSubsetOf
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:202
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::BitVector
Definition: bitvector.h:30
GenericVector
Definition: baseapi.h:40
tesseract::ShapeTable::MergedUnicharCount
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:503
tesseract::Shape::IsEqualUnichars
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:217
tesseract::UnicharAndFonts::DeSerialize
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:74
tesseract::ShapeTable::MasterUnicharCount
int MasterUnicharCount(int shape_id) const
Definition: shapetable.cpp:486
tesseract::ShapeTable::MasterFontCount
int MasterFontCount(int shape_id) const
Definition: shapetable.cpp:492
tesseract::Shape::AddToShape
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
tesseract::ShapeRating::broken
bool broken
Definition: shapetable.h:132
GenericVector::get
T & get(int index) const
Definition: genericvector.h:716
tesseract::Shape::size
int size() const
Definition: shapetable.h:199
tesseract::ShapeTable::AddShapeToResults
void AddShapeToResults(const ShapeRating &shape_rating, GenericVector< int > *unichar_map, GenericVector< UnicharRating > *results) const
Definition: shapetable.cpp:687
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tesseract::ShapeTable::FindShape
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:386
tesseract::ShapeTable::AnyMultipleUnichars
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:444
tesseract::ShapeTable
Definition: shapetable.h:261
tesseract::ShapeRating::joined
bool joined
Definition: shapetable.h:130
tesseract::ShapeTable::SummaryStr
STRING SummaryStr() const
Definition: shapetable.cpp:313
tesseract::ShapeTable::AddShape
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
tesseract::UnicharAndFonts::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:69
tesseract::UnicharRating::FirstResultWithUnichar
static int FirstResultWithUnichar(const GenericVector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:58
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::ShapeTable::DeleteShape
void DeleteShape(int shape_id)
Definition: shapetable.cpp:361
tesseract::Shape::ContainsFontProperties
bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const
Definition: shapetable.cpp:169
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
tesseract::ShapeTable::MaxNumUnichars
int MaxNumUnichars() const
Definition: shapetable.cpp:455
tesseract::ShapeTable::MergeShapes
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:513
tesseract::ShapeTable::BuildFromShape
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:413
tesseract::UnicharAndFonts::font_ids
GenericVector< int32_t > font_ids
Definition: shapetable.h:174
UNICHAR_JOINED
Definition: unicharset.h:35