tesseract  5.0.0-alpha-619-ge9db
colpartition.cpp
Go to the documentation of this file.
1 // File: colpartition.cpp
3 // Description: Class to hold partitions of the page that correspond
4 // roughly to text lines.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "colpartition.h"
25 #include "colpartitiongrid.h"
26 #include "colpartitionset.h"
27 #include "detlinefit.h"
28 #include "dppoint.h"
29 #include "imagefind.h"
30 #include "workingpartset.h"
31 #include "host.h" // for NearlyEqual
32 
33 #include <algorithm>
34 
35 namespace tesseract {
36 
37 ELIST2IZE(ColPartition)
38 CLISTIZE(ColPartition)
39 
40 
42 // Maximum change in spacing (in inches) to ignore.
43 const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
44 // Maximum fraction of line height used as an additional allowance
45 // for top spacing.
46 const double kMaxTopSpacingFraction = 0.25;
47 // What multiple of the largest line height should be used as an upper bound
48 // for whether lines are in the same text block?
49 const double kMaxSameBlockLineSpacing = 3;
50 // Maximum ratio of sizes for lines to be considered the same size.
51 const double kMaxSizeRatio = 1.5;
52 // Fraction of max of leader width and gap for max IQR of gaps.
53 const double kMaxLeaderGapFractionOfMax = 0.25;
54 // Fraction of min of leader width and gap for max IQR of gaps.
55 const double kMaxLeaderGapFractionOfMin = 0.5;
56 // Minimum number of blobs to be considered a leader.
57 const int kMinLeaderCount = 5;
58 // Minimum score for a STRONG_CHAIN textline.
59 const int kMinStrongTextValue = 6;
60 // Minimum score for a CHAIN textline.
61 const int kMinChainTextValue = 3;
62 // Minimum number of blobs for strong horizontal text lines.
64 // Minimum height (in image pixels) for strong horizontal text lines.
66 // Minimum aspect ratio for strong horizontal text lines.
68 // Maximum upper quartile error allowed on a baseline fit as a fraction
69 // of height.
70 const double kMaxBaselineError = 0.4375;
71 // Min coverage for a good baseline between vectors
72 const double kMinBaselineCoverage = 0.5;
73 // Max RMS color noise to compare colors.
74 const int kMaxRMSColorNoise = 128;
75 // Maximum distance to allow a partition color to be to use that partition
76 // in smoothing neighbouring types. This is a squared distance.
77 const int kMaxColorDistance = 900;
78 
79 // blob_type is the blob_region_type_ of the blobs in this partition.
80 // Vertical is the direction of logical vertical on the possibly skewed image.
81 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
82  : left_margin_(-INT32_MAX), right_margin_(INT32_MAX),
83  median_bottom_(INT32_MAX), median_top_(-INT32_MAX),
84  median_left_(INT32_MAX), median_right_(-INT32_MAX),
85  blob_type_(blob_type),
86  vertical_(vertical) {
87  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
88 }
89 
90 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
91 // from a single TBOX.
92 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
93 // the ColPartition owns the BLOBNBOX!!!
94 // Call DeleteBoxes before deleting the ColPartition.
96  PolyBlockType block_type,
97  BlobRegionType blob_type,
98  BlobTextFlowType flow) {
99  ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
100  part->set_type(block_type);
101  part->set_flow(flow);
102  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
103  part->set_left_margin(box.left());
104  part->set_right_margin(box.right());
105  part->SetBlobTypes();
106  part->ComputeLimits();
107  part->ClaimBoxes();
108  return part;
109 }
110 
111 // Constructs and returns a ColPartition with the given real BLOBNBOX,
112 // and sets it up to be a "big" partition (single-blob partition bigger
113 // than the surrounding text that may be a dropcap, two or more vertically
114 // touching characters, or some graphic element.
115 // If the given list is not nullptr, the partition is also added to the list.
117  ColPartition_LIST* big_part_list) {
118  box->set_owner(nullptr);
119  ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
120  single->set_flow(BTFT_NONE);
121  single->AddBox(box);
122  single->ComputeLimits();
123  single->ClaimBoxes();
124  single->SetBlobTypes();
125  single->set_block_owned(true);
126  if (big_part_list != nullptr) {
127  ColPartition_IT part_it(big_part_list);
128  part_it.add_to_end(single);
129  }
130  return single;
131 }
132 
134  // Remove this as a partner of all partners, as we don't want them
135  // referring to a deleted object.
136  ColPartition_C_IT it(&upper_partners_);
137  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
138  it.data()->RemovePartner(false, this);
139  }
140  it.set_to_list(&lower_partners_);
141  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
142  it.data()->RemovePartner(true, this);
143  }
144 }
145 
146 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
147 // horizontal or vertical line, given a type and a bounding box.
149  const ICOORD& vertical,
150  int left, int bottom,
151  int right, int top) {
152  auto* part = new ColPartition(blob_type, vertical);
153  part->bounding_box_ = TBOX(left, bottom, right, top);
154  part->median_bottom_ = bottom;
155  part->median_top_ = top;
156  part->median_height_ = top - bottom;
157  part->median_left_ = left;
158  part->median_right_ = right;
159  part->median_width_ = right - left;
160  part->left_key_ = part->BoxLeftKey();
161  part->right_key_ = part->BoxRightKey();
162  return part;
163 }
164 
165 
166 // Adds the given box to the partition, updating the partition bounds.
167 // The list of boxes in the partition is updated, ensuring that no box is
168 // recorded twice, and the boxes are kept in increasing left position.
170  TBOX box = bbox->bounding_box();
171  // Update the partition limits.
172  if (boxes_.length() == 0) {
173  bounding_box_ = box;
174  } else {
175  bounding_box_ += box;
176  }
177 
178  if (IsVerticalType()) {
179  if (!last_add_was_vertical_) {
180  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
181  last_add_was_vertical_ = true;
182  }
183  boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
184  } else {
185  if (last_add_was_vertical_) {
186  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
187  last_add_was_vertical_ = false;
188  }
189  boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
190  }
191  if (!left_key_tab_)
192  left_key_ = BoxLeftKey();
193  if (!right_key_tab_)
194  right_key_ = BoxRightKey();
195  if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
196  tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
197  box.left(), box.bottom(), box.right(), box.top(),
198  bounding_box_.left(), bounding_box_.right());
199 }
200 
201 // Removes the given box from the partition, updating the bounds.
203  BLOBNBOX_C_IT bb_it(&boxes_);
204  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
205  if (box == bb_it.data()) {
206  bb_it.extract();
207  ComputeLimits();
208  return;
209  }
210  }
211 }
212 
213 // Returns the tallest box in the partition, as measured perpendicular to the
214 // presumed flow of text.
216  BLOBNBOX* biggest = nullptr;
217  BLOBNBOX_C_IT bb_it(&boxes_);
218  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
219  BLOBNBOX* bbox = bb_it.data();
220  if (IsVerticalType()) {
221  if (biggest == nullptr ||
222  bbox->bounding_box().width() > biggest->bounding_box().width())
223  biggest = bbox;
224  } else {
225  if (biggest == nullptr ||
226  bbox->bounding_box().height() > biggest->bounding_box().height())
227  biggest = bbox;
228  }
229  }
230  return biggest;
231 }
232 
233 // Returns the bounding box excluding the given box.
235  TBOX result;
236  BLOBNBOX_C_IT bb_it(&boxes_);
237  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
238  if (box != bb_it.data()) {
239  result += bb_it.data()->bounding_box();
240  }
241  }
242  return result;
243 }
244 
245 // Claims the boxes in the boxes_list by marking them with a this owner
246 // pointer. If a box is already owned, then it must be owned by this.
248  BLOBNBOX_C_IT bb_it(&boxes_);
249  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
250  BLOBNBOX* bblob = bb_it.data();
251  ColPartition* other = bblob->owner();
252  if (other == nullptr) {
253  // Normal case: ownership is available.
254  bblob->set_owner(this);
255  } else {
256  ASSERT_HOST(other == this);
257  }
258  }
259 }
260 
261 // nullptr the owner of the blobs in this partition, so they can be deleted
262 // independently of the ColPartition.
264  BLOBNBOX_C_IT bb_it(&boxes_);
265  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
266  BLOBNBOX* bblob = bb_it.data();
267  ASSERT_HOST(bblob->owner() == this || bblob->owner() == nullptr);
268  bblob->set_owner(nullptr);
269  }
270 }
271 
272 // nullptr the owner of the blobs in this partition that are owned by this
273 // partition, so they can be deleted independently of the ColPartition.
274 // Any blobs that are not owned by this partition get to keep their owner
275 // without an assert failure.
277  BLOBNBOX_C_IT bb_it(&boxes_);
278  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
279  BLOBNBOX* bblob = bb_it.data();
280  if (bblob->owner() == this)
281  bblob->set_owner(nullptr);
282  }
283 }
284 
285 // Nulls the owner of the blobs in this partition that are owned by this
286 // partition and not leader blobs, removing them from the boxes_ list, thus
287 // turning this partition back to a leader partition if it contains a leader,
288 // or otherwise leaving it empty. Returns true if any boxes remain.
290  BLOBNBOX_C_IT bb_it(&boxes_);
291  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
292  BLOBNBOX* bblob = bb_it.data();
293  if (bblob->flow() != BTFT_LEADER) {
294  if (bblob->owner() == this) bblob->set_owner(nullptr);
295  bb_it.extract();
296  }
297  }
298  if (bb_it.empty()) return false;
299  flow_ = BTFT_LEADER;
300  ComputeLimits();
301  return true;
302 }
303 
304 // Delete the boxes that this partition owns.
306  // Although the boxes_ list is a C_LIST, in some cases it owns the
307  // BLOBNBOXes, as the ColPartition takes ownership from the grid,
308  // and the BLOBNBOXes own the underlying C_BLOBs.
309  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
310  BLOBNBOX* bblob = bb_it.extract();
311  delete bblob->cblob();
312  delete bblob;
313  }
314 }
315 
316 // Reflects the partition in the y-axis, assuming that its blobs have
317 // already been done. Corrects only a limited part of the members, since
318 // this function is assumed to be used shortly after initial creation, which
319 // is before a lot of the members are used.
321  BLOBNBOX_CLIST reversed_boxes;
322  BLOBNBOX_C_IT reversed_it(&reversed_boxes);
323  // Reverse the order of the boxes_.
324  BLOBNBOX_C_IT bb_it(&boxes_);
325  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
326  reversed_it.add_before_then_move(bb_it.extract());
327  }
328  bb_it.add_list_after(&reversed_boxes);
329  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
330  int tmp = left_margin_;
331  left_margin_ = -right_margin_;
332  right_margin_ = -tmp;
333  ComputeLimits();
334 }
335 
336 // Returns true if this is a legal partition - meaning that the conditions
337 // left_margin <= bounding_box left
338 // left_key <= bounding box left key
339 // bounding box left <= bounding box right
340 // and likewise for right margin and key
341 // are all met.
343  if (bounding_box_.left() > bounding_box_.right()) {
344  if (textord_debug_bugs) {
345  tprintf("Bounding box invalid\n");
346  Print();
347  }
348  return false; // Bounding box invalid.
349  }
350  if (left_margin_ > bounding_box_.left() ||
351  right_margin_ < bounding_box_.right()) {
352  if (textord_debug_bugs) {
353  tprintf("Margins invalid\n");
354  Print();
355  }
356  return false; // Margins invalid.
357  }
358  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
359  if (textord_debug_bugs) {
360  tprintf("Key inside box: %d v %d or %d v %d\n",
361  left_key_, BoxLeftKey(), right_key_, BoxRightKey());
362  Print();
363  }
364  return false; // Keys inside the box.
365  }
366  return true;
367 }
368 
369 // Returns true if the left and right edges are approximately equal.
370 bool ColPartition::MatchingColumns(const ColPartition& other) const {
371  int y = (MidY() + other.MidY()) / 2;
372  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
373  LeftAtY(y) / kColumnWidthFactor, 1))
374  return false;
375  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
376  RightAtY(y) / kColumnWidthFactor, 1))
377  return false;
378  return true;
379 }
380 
381 // Returns true if the colors match for two text partitions.
383  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
384  other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
385  return false; // Too noisy.
386 
387  // Colors must match for other to count.
388  double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
389  other.color2_,
390  color1_);
391  double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
392  other.color2_,
393  color2_);
394  double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
395  other.color1_);
396  double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
397  other.color2_);
398 // All 4 distances must be small enough.
399  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
400  d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
401 }
402 
403 // Returns true if the sizes match for two text partitions,
404 // taking orientation into account. See also SizesSimilar.
405 bool ColPartition::MatchingSizes(const ColPartition& other) const {
406  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
407  return !TabFind::DifferentSizes(median_width_, other.median_width_);
408  else
409  return !TabFind::DifferentSizes(median_height_, other.median_height_);
410 }
411 
412 // Returns true if there is no tabstop violation in merging this and other.
414  if (bounding_box_.right() < other.bounding_box_.left() &&
415  bounding_box_.right() < other.LeftBlobRule())
416  return false;
417  if (other.bounding_box_.right() < bounding_box_.left() &&
418  other.bounding_box_.right() < LeftBlobRule())
419  return false;
420  if (bounding_box_.left() > other.bounding_box_.right() &&
421  bounding_box_.left() > other.RightBlobRule())
422  return false;
423  if (other.bounding_box_.left() > bounding_box_.right() &&
424  other.bounding_box_.left() > RightBlobRule())
425  return false;
426  return true;
427 }
428 
429 // Returns true if other has a similar stroke width to this.
431  double fractional_tolerance,
432  double constant_tolerance) const {
433  int match_count = 0;
434  int nonmatch_count = 0;
435  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
436  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
437  box_it.mark_cycle_pt();
438  other_it.mark_cycle_pt();
439  while (!box_it.cycled_list() && !other_it.cycled_list()) {
440  if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
441  fractional_tolerance,
442  constant_tolerance))
443  ++match_count;
444  else
445  ++nonmatch_count;
446  box_it.forward();
447  other_it.forward();
448  }
449  return match_count > nonmatch_count;
450 }
451 
452 // Returns true if base is an acceptable diacritic base char merge
453 // with this as the diacritic.
454 // Returns true if:
455 // (1) this is a ColPartition containing only diacritics, and
456 // (2) the base characters indicated on the diacritics all believably lie
457 // within the text line of the candidate ColPartition.
459  bool debug) const {
460  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
461  int min_top = INT32_MAX;
462  int max_bottom = -INT32_MAX;
463  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
464  BLOBNBOX* blob = it.data();
465  if (!blob->IsDiacritic()) {
466  if (debug) {
467  tprintf("Blob is not a diacritic:");
468  blob->bounding_box().print();
469  }
470  return false; // All blobs must have diacritic bases.
471  }
472  if (blob->base_char_top() < min_top)
473  min_top = blob->base_char_top();
474  if (blob->base_char_bottom() > max_bottom)
475  max_bottom = blob->base_char_bottom();
476  }
477  // If the intersection of all vertical ranges of all base characters
478  // overlaps the median range of this, then it is OK.
479  bool result = min_top > candidate.median_bottom_ &&
480  max_bottom < candidate.median_top_;
481  if (debug) {
482  if (result)
483  tprintf("OKDiacritic!\n");
484  else
485  tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
486  max_bottom, min_top, median_bottom_, median_top_);
487  }
488  return result;
489 }
490 
491 // Sets the sort key using either the tab vector, or the bounding box if
492 // the tab vector is nullptr. If the tab_vector lies inside the bounding_box,
493 // use the edge of the box as a key any way.
494 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
495  if (tab_vector != nullptr) {
496  left_key_ = tab_vector->sort_key();
497  left_key_tab_ = left_key_ <= BoxLeftKey();
498  } else {
499  left_key_tab_ = false;
500  }
501  if (!left_key_tab_)
502  left_key_ = BoxLeftKey();
503 }
504 
505 // As SetLeftTab, but with the right.
506 void ColPartition::SetRightTab(const TabVector* tab_vector) {
507  if (tab_vector != nullptr) {
508  right_key_ = tab_vector->sort_key();
509  right_key_tab_ = right_key_ >= BoxRightKey();
510  } else {
511  right_key_tab_ = false;
512  }
513  if (!right_key_tab_)
514  right_key_ = BoxRightKey();
515 }
516 
517 // Copies the left/right tab from the src partition, but if take_box is
518 // true, copies the box instead and uses that as a key.
519 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
520  left_key_tab_ = take_box ? false : src.left_key_tab_;
521  if (left_key_tab_) {
522  left_key_ = src.left_key_;
523  } else {
524  bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
525  left_key_ = BoxLeftKey();
526  }
527  if (left_margin_ > bounding_box_.left())
528  left_margin_ = src.left_margin_;
529 }
530 
531 // As CopyLeftTab, but with the right.
532 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
533  right_key_tab_ = take_box ? false : src.right_key_tab_;
534  if (right_key_tab_) {
535  right_key_ = src.right_key_;
536  } else {
537  bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
538  right_key_ = BoxRightKey();
539  }
540  if (right_margin_ < bounding_box_.right())
541  right_margin_ = src.right_margin_;
542 }
543 
544 // Returns the left rule line x coord of the leftmost blob.
546  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
547  return it.data()->left_rule();
548 }
549 // Returns the right rule line x coord of the rightmost blob.
551  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
552  it.move_to_last();
553  return it.data()->right_rule();
554 }
555 
558  return special_blobs_densities_[type];
559 }
560 
563  BLOBNBOX_C_IT blob_it(&boxes_);
564  int count = 0;
565  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
566  BLOBNBOX* blob = blob_it.data();
568  if (blob_type == type) {
569  count++;
570  }
571  }
572 
573  return count;
574 }
575 
577  const BlobSpecialTextType type, const float density) {
579  special_blobs_densities_[type] = density;
580 }
581 
583  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
584  if (boxes_.empty()) {
585  return;
586  }
587 
588  BLOBNBOX_C_IT blob_it(&boxes_);
589  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
590  BLOBNBOX* blob = blob_it.data();
592  special_blobs_densities_[type]++;
593  }
594 
595  for (float& special_blobs_density : special_blobs_densities_) {
596  special_blobs_density /= boxes_.length();
597  }
598 }
599 
600 // Add a partner above if upper, otherwise below.
601 // Add them uniquely and keep the list sorted by box left.
602 // Partnerships are added symmetrically to partner and this.
603 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
604  if (upper) {
605  partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
606  true, this);
607  upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
608  } else {
609  partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
610  true, this);
611  lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
612  }
613 }
614 
615 // Removes the partner from this, but does not remove this from partner.
616 // This asymmetric removal is so as not to mess up the iterator that is
617 // working on partner's partner list.
618 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
619  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
620  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
621  if (it.data() == partner) {
622  it.extract();
623  break;
624  }
625  }
626 }
627 
628 // Returns the partner if the given partner is a singleton, otherwise nullptr.
630  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
631  if (!partners->singleton())
632  return nullptr;
633  ColPartition_C_IT it(partners);
634  return it.data();
635 }
636 
637 // Merge with the other partition and delete it.
639  // The result has to either own all of the blobs or none of them.
640  // Verify the flag is consistent.
641  ASSERT_HOST(owns_blobs() == other->owns_blobs());
642  // TODO(nbeato): check owns_blobs better. Right now owns_blobs
643  // should always be true when this is called. So there is no issues.
644  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
645  bounding_box_.bottom()) ||
646  TabFind::WithinTestRegion(2, other->bounding_box_.left(),
647  other->bounding_box_.bottom())) {
648  tprintf("Merging:");
649  Print();
650  other->Print();
651  }
652 
653  // Update the special_blobs_densities_.
654  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
655  for (int type = 0; type < BSTT_COUNT; ++type) {
656  unsigned w1 = boxes_.length();
657  unsigned w2 = other->boxes_.length();
658  float new_val = special_blobs_densities_[type] * w1 +
659  other->special_blobs_densities_[type] * w2;
660  if (!w1 || !w2) {
661  ASSERT_HOST((w1 + w2) > 0);
662  special_blobs_densities_[type] = new_val / (w1 + w2);
663  }
664  }
665 
666  // Merge the two sorted lists.
667  BLOBNBOX_C_IT it(&boxes_);
668  BLOBNBOX_C_IT it2(&other->boxes_);
669  for (; !it2.empty(); it2.forward()) {
670  BLOBNBOX* bbox2 = it2.extract();
671  ColPartition* prev_owner = bbox2->owner();
672  if (prev_owner != other && prev_owner != nullptr) {
673  // A blob on other's list is owned by someone else; let them have it.
674  continue;
675  }
676  ASSERT_HOST(prev_owner == other || prev_owner == nullptr);
677  if (prev_owner == other)
678  bbox2->set_owner(this);
679  it.add_to_end(bbox2);
680  }
681  left_margin_ = std::min(left_margin_, other->left_margin_);
682  right_margin_ = std::max(right_margin_, other->right_margin_);
683  if (other->left_key_ < left_key_) {
684  left_key_ = other->left_key_;
685  left_key_tab_ = other->left_key_tab_;
686  }
687  if (other->right_key_ > right_key_) {
688  right_key_ = other->right_key_;
689  right_key_tab_ = other->right_key_tab_;
690  }
691  // Combine the flow and blob_type in a sensible way.
692  // Dominant flows stay.
693  if (!DominatesInMerge(flow_, other->flow_)) {
694  flow_ = other->flow_;
695  blob_type_ = other->blob_type_;
696  }
697  SetBlobTypes();
698  if (IsVerticalType()) {
699  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
700  last_add_was_vertical_ = true;
701  } else {
702  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
703  last_add_was_vertical_ = false;
704  }
705  ComputeLimits();
706  // Fix partner lists. other is going away, so remove it as a
707  // partner of all its partners and add this in its place.
708  for (int upper = 0; upper < 2; ++upper) {
709  ColPartition_CLIST partners;
710  ColPartition_C_IT part_it(&partners);
711  part_it.add_list_after(upper ? &other->upper_partners_
712  : &other->lower_partners_);
713  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
714  ColPartition* partner = part_it.extract();
715  partner->RemovePartner(!upper, other);
716  partner->RemovePartner(!upper, this);
717  partner->AddPartner(!upper, this);
718  }
719  }
720  delete other;
721  if (cb != nullptr) {
722  SetColumnGoodness(cb);
723  }
724 }
725 
726 // Merge1 and merge2 are candidates to be merged, yet their combined box
727 // overlaps this. Is that allowed?
728 // Returns true if the overlap between this and the merged pair of
729 // merge candidates is sufficiently trivial to be allowed.
730 // The merged box can graze the edge of this by the ok_box_overlap
731 // if that exceeds the margin to the median top and bottom.
732 // ok_box_overlap should be set by the caller appropriate to the sizes of
733 // the text involved, and is usually a fraction of the median size of merge1
734 // and/or merge2, or this.
735 // TODO(rays) Determine whether vertical text needs to be considered.
737  const ColPartition& merge2,
738  int ok_box_overlap, bool debug) {
739  // Vertical partitions are not allowed to be involved.
740  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
741  if (debug)
742  tprintf("Vertical partition\n");
743  return false;
744  }
745  // The merging partitions must strongly overlap each other.
746  if (!merge1.VSignificantCoreOverlap(merge2)) {
747  if (debug)
748  tprintf("Voverlap %d (%d)\n",
749  merge1.VCoreOverlap(merge2),
750  merge1.VSignificantCoreOverlap(merge2));
751  return false;
752  }
753  // The merged box must not overlap the median bounds of this.
754  TBOX merged_box(merge1.bounding_box());
755  merged_box += merge2.bounding_box();
756  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
757  merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
758  merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
759  if (debug)
760  tprintf("Excessive box overlap\n");
761  return false;
762  }
763  // Looks OK!
764  return true;
765 }
766 
767 // Find the blob at which to split this to minimize the overlap with the
768 // given box. Returns the first blob to go in the second partition.
770  if (boxes_.empty() || boxes_.singleton())
771  return nullptr;
772  BLOBNBOX_C_IT it(&boxes_);
773  TBOX left_box(it.data()->bounding_box());
774  for (it.forward(); !it.at_first(); it.forward()) {
775  BLOBNBOX* bbox = it.data();
776  left_box += bbox->bounding_box();
777  if (left_box.overlap(box))
778  return bbox;
779  }
780  return nullptr;
781 }
782 
783 // Split this partition keeping the first half in this and returning
784 // the second half.
785 // Splits by putting the split_blob and the blobs that follow
786 // in the second half, and the rest in the first half.
788  ColPartition* split_part = ShallowCopy();
789  split_part->set_owns_blobs(owns_blobs());
790  BLOBNBOX_C_IT it(&boxes_);
791  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
792  BLOBNBOX* bbox = it.data();
793  ColPartition* prev_owner = bbox->owner();
794  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
795  if (bbox == split_blob || !split_part->boxes_.empty()) {
796  split_part->AddBox(it.extract());
797  if (owns_blobs() && prev_owner != nullptr)
798  bbox->set_owner(split_part);
799  }
800  }
801  ASSERT_HOST(!it.empty());
802  if (split_part->IsEmpty()) {
803  // Split part ended up with nothing. Possible if split_blob is not
804  // in the list of blobs.
805  delete split_part;
806  return nullptr;
807  }
808  right_key_tab_ = false;
809  split_part->left_key_tab_ = false;
810  ComputeLimits();
811  // TODO(nbeato) Merge Ray's CL like this:
812  // if (owns_blobs())
813  // SetBlobTextlineGoodness();
814  split_part->ComputeLimits();
815  // TODO(nbeato) Merge Ray's CL like this:
816  // if (split_part->owns_blobs())
817  // split_part->SetBlobTextlineGoodness();
818  return split_part;
819 }
820 
821 // Split this partition at the given x coordinate, returning the right
822 // half and keeping the left half in this.
824  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
825  return nullptr; // There will be no change.
826  ColPartition* split_part = ShallowCopy();
827  split_part->set_owns_blobs(owns_blobs());
828  BLOBNBOX_C_IT it(&boxes_);
829  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
830  BLOBNBOX* bbox = it.data();
831  ColPartition* prev_owner = bbox->owner();
832  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
833  const TBOX& box = bbox->bounding_box();
834  if (box.left() >= split_x) {
835  split_part->AddBox(it.extract());
836  if (owns_blobs() && prev_owner != nullptr)
837  bbox->set_owner(split_part);
838  }
839  }
840  if (it.empty()) {
841  // Possible if split-x passes through the first blob.
842  it.add_list_after(&split_part->boxes_);
843  }
844  ASSERT_HOST(!it.empty());
845  if (split_part->IsEmpty()) {
846  // Split part ended up with nothing. Possible if split_x passes
847  // through the last blob.
848  delete split_part;
849  return nullptr;
850  }
851  right_key_tab_ = false;
852  split_part->left_key_tab_ = false;
853  right_margin_ = split_x;
854  split_part->left_margin_ = split_x;
855  ComputeLimits();
856  split_part->ComputeLimits();
857  return split_part;
858 }
859 
860 // Recalculates all the coordinate limits of the partition.
862  bounding_box_ = TBOX(); // Clear it
863  BLOBNBOX_C_IT it(&boxes_);
864  BLOBNBOX* bbox = nullptr;
865  int non_leader_count = 0;
866  if (it.empty()) {
867  bounding_box_.set_left(left_margin_);
868  bounding_box_.set_right(right_margin_);
869  bounding_box_.set_bottom(0);
870  bounding_box_.set_top(0);
871  } else {
872  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
873  bbox = it.data();
874  bounding_box_ += bbox->bounding_box();
875  if (bbox->flow() != BTFT_LEADER)
876  ++non_leader_count;
877  }
878  }
879  if (!left_key_tab_)
880  left_key_ = BoxLeftKey();
881  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
882  // TODO(rays) investigate the causes of these error messages, to find
883  // out if they are genuinely harmful, or just indicative of junk input.
884  tprintf("Computed left-illegal partition\n");
885  Print();
886  }
887  if (!right_key_tab_)
888  right_key_ = BoxRightKey();
889  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
890  tprintf("Computed right-illegal partition\n");
891  Print();
892  }
893  if (it.empty())
894  return;
895  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
896  blob_type() == BRT_POLYIMAGE) {
897  median_top_ = bounding_box_.top();
898  median_bottom_ = bounding_box_.bottom();
899  median_height_ = bounding_box_.height();
900  median_left_ = bounding_box_.left();
901  median_right_ = bounding_box_.right();
902  median_width_ = bounding_box_.width();
903  } else {
904  STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
905  STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
906  STATS height_stats(0, bounding_box_.height() + 1);
907  STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
908  STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
909  STATS width_stats(0, bounding_box_.width() + 1);
910  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
911  bbox = it.data();
912  if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
913  const TBOX& box = bbox->bounding_box();
914  int area = box.area();
915  top_stats.add(box.top(), area);
916  bottom_stats.add(box.bottom(), area);
917  height_stats.add(box.height(), area);
918  left_stats.add(box.left(), area);
919  right_stats.add(box.right(), area);
920  width_stats.add(box.width(), area);
921  }
922  }
923  median_top_ = static_cast<int>(top_stats.median() + 0.5);
924  median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
925  median_height_ = static_cast<int>(height_stats.median() + 0.5);
926  median_left_ = static_cast<int>(left_stats.median() + 0.5);
927  median_right_ = static_cast<int>(right_stats.median() + 0.5);
928  median_width_ = static_cast<int>(width_stats.median() + 0.5);
929  }
930 
931  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
932  tprintf("Made partition with bad right coords");
933  Print();
934  }
935  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
936  tprintf("Made partition with bad left coords");
937  Print();
938  }
939  // Fix partner lists. The bounding box has changed and partners are stored
940  // in bounding box order, so remove and reinsert this as a partner
941  // of all its partners.
942  for (int upper = 0; upper < 2; ++upper) {
943  ColPartition_CLIST partners;
944  ColPartition_C_IT part_it(&partners);
945  part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
946  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
947  ColPartition* partner = part_it.extract();
948  partner->RemovePartner(!upper, this);
949  partner->AddPartner(!upper, this);
950  }
951  }
952  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
953  bounding_box_.bottom())) {
954  tprintf("Recomputed box for partition %p\n", this);
955  Print();
956  }
957 }
958 
959 // Returns the number of boxes that overlap the given box.
961  BLOBNBOX_C_IT it(&boxes_);
962  int overlap_count = 0;
963  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
964  BLOBNBOX* bbox = it.data();
965  if (box.overlap(bbox->bounding_box()))
966  ++overlap_count;
967  }
968  return overlap_count;
969 }
970 
971 // Computes and sets the type_ and first_column_, last_column_ and column_set_.
972 // resolution refers to the ppi resolution of the image.
973 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
974  int first_spanned_col = -1;
975  ColumnSpanningType span_type =
976  columns->SpanningType(resolution,
977  bounding_box_.left(), bounding_box_.right(),
978  std::min(bounding_box_.height(), bounding_box_.width()),
979  MidY(), left_margin_, right_margin_,
980  &first_column_, &last_column_,
981  &first_spanned_col);
982  column_set_ = columns;
983  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
984  !IsLineType()) {
985  // Unequal columns may indicate that the pullout spans one of the columns
986  // it lies in, so force it to be allocated to just that column.
987  if (first_spanned_col >= 0) {
988  first_column_ = first_spanned_col;
989  last_column_ = first_spanned_col;
990  } else {
991  if ((first_column_ & 1) == 0)
992  last_column_ = first_column_;
993  else if ((last_column_ & 1) == 0)
994  first_column_ = last_column_;
995  else
996  first_column_ = last_column_ = (first_column_ + last_column_) / 2;
997  }
998  }
999  type_ = PartitionType(span_type);
1000 }
1001 
1002 // Returns the PartitionType from the current BlobRegionType and a column
1003 // flow spanning type ColumnSpanningType, generated by
1004 // ColPartitionSet::SpanningType, that indicates how the partition sits
1005 // in the columns.
1007  if (flow == CST_NOISE) {
1008  if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
1009  blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT)
1010  return PT_NOISE;
1011  flow = CST_FLOWING;
1012  }
1013 
1014  switch (blob_type_) {
1015  case BRT_NOISE:
1016  return PT_NOISE;
1017  case BRT_HLINE:
1018  return PT_HORZ_LINE;
1019  case BRT_VLINE:
1020  return PT_VERT_LINE;
1021  case BRT_RECTIMAGE:
1022  case BRT_POLYIMAGE:
1023  switch (flow) {
1024  case CST_FLOWING:
1025  return PT_FLOWING_IMAGE;
1026  case CST_HEADING:
1027  return PT_HEADING_IMAGE;
1028  case CST_PULLOUT:
1029  return PT_PULLOUT_IMAGE;
1030  default:
1031  ASSERT_HOST(!"Undefined flow type for image!");
1032  }
1033  break;
1034  case BRT_VERT_TEXT:
1035  return PT_VERTICAL_TEXT;
1036  case BRT_TEXT:
1037  case BRT_UNKNOWN:
1038  default:
1039  switch (flow) {
1040  case CST_FLOWING:
1041  return PT_FLOWING_TEXT;
1042  case CST_HEADING:
1043  return PT_HEADING_TEXT;
1044  case CST_PULLOUT:
1045  return PT_PULLOUT_TEXT;
1046  default:
1047  ASSERT_HOST(!"Undefined flow type for text!");
1048  }
1049  }
1050  ASSERT_HOST(!"Should never get here!");
1051  return PT_NOISE;
1052 }
1053 
1054 // Returns the first and last column touched by this partition.
1055 // resolution refers to the ppi resolution of the image.
1056 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
1057  int* first_col, int* last_col) {
1058  int first_spanned_col = -1;
1059  ColumnSpanningType span_type =
1060  columns->SpanningType(resolution,
1061  bounding_box_.left(), bounding_box_.right(),
1062  std::min(bounding_box_.height(), bounding_box_.width()),
1063  MidY(), left_margin_, right_margin_,
1064  first_col, last_col,
1065  &first_spanned_col);
1066  type_ = PartitionType(span_type);
1067 }
1068 
1069 // Sets the internal flags good_width_ and good_column_.
1071  int y = MidY();
1072  int width = RightAtY(y) - LeftAtY(y);
1073  good_width_ = cb(width);
1074  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
1075 }
1076 
1077 // Determines whether the blobs in this partition mostly represent
1078 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
1079 // Note that height is assumed to have been tested elsewhere, and that this
1080 // function will find most fixed-pitch text as leader without a height filter.
1081 // Leader detection is limited to sequences of identical width objects,
1082 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
1084  bool result = false;
1085  // Gather statistics on the gaps between blobs and the widths of the blobs.
1086  int part_width = bounding_box_.width();
1087  STATS gap_stats(0, part_width);
1088  STATS width_stats(0, part_width);
1089  BLOBNBOX_C_IT it(&boxes_);
1090  BLOBNBOX* prev_blob = it.data();
1091  prev_blob->set_flow(BTFT_NEIGHBOURS);
1092  width_stats.add(prev_blob->bounding_box().width(), 1);
1093  int blob_count = 1;
1094  for (it.forward(); !it.at_first(); it.forward()) {
1095  BLOBNBOX* blob = it.data();
1096  int left = blob->bounding_box().left();
1097  int right = blob->bounding_box().right();
1098  gap_stats.add(left - prev_blob->bounding_box().right(), 1);
1099  width_stats.add(right - left, 1);
1100  blob->set_flow(BTFT_NEIGHBOURS);
1101  prev_blob = blob;
1102  ++blob_count;
1103  }
1104  double median_gap = gap_stats.median();
1105  double median_width = width_stats.median();
1106  double max_width = std::max(median_gap, median_width);
1107  double min_width = std::min(median_gap, median_width);
1108  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
1109  if (textord_debug_tabfind >= 4) {
1110  tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
1111  gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
1112  min_width * kMaxLeaderGapFractionOfMin);
1113  }
1114  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
1115  gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
1116  blob_count >= kMinLeaderCount) {
1117  // This is stable enough to be called a leader, so check the widths.
1118  // Since leader dashes can join, run a dp cutting algorithm and go
1119  // on the cost.
1120  int offset = static_cast<int>(ceil(gap_iqr * 2));
1121  int min_step = static_cast<int>(median_gap + median_width + 0.5);
1122  int max_step = min_step + offset;
1123  min_step -= offset;
1124  // Pad the buffer with min_step/2 on each end.
1125  int part_left = bounding_box_.left() - min_step / 2;
1126  part_width += min_step;
1127  auto* projection = new DPPoint[part_width];
1128  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1129  BLOBNBOX* blob = it.data();
1130  int left = blob->bounding_box().left();
1131  int right = blob->bounding_box().right();
1132  int height = blob->bounding_box().height();
1133  for (int x = left; x < right; ++x) {
1134  projection[left - part_left].AddLocalCost(height);
1135  }
1136  }
1137  DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
1139  part_width, projection);
1140  if (best_end != nullptr && best_end->total_cost() < blob_count) {
1141  // Good enough. Call it a leader.
1142  result = true;
1143  bool modified_blob_list = false;
1144  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1145  BLOBNBOX* blob = it.data();
1146  // If the first or last blob is spaced too much, don't mark it.
1147  if (it.at_first()) {
1148  int gap = it.data_relative(1)->bounding_box().left() -
1149  blob->bounding_box().right();
1150  if (blob->bounding_box().width() + gap > max_step) {
1151  it.extract();
1152  modified_blob_list = true;
1153  continue;
1154  }
1155  }
1156  if (it.at_last()) {
1157  int gap = blob->bounding_box().left() -
1158  it.data_relative(-1)->bounding_box().right();
1159  if (blob->bounding_box().width() + gap > max_step) {
1160  it.extract();
1161  modified_blob_list = true;
1162  break;
1163  }
1164  }
1165  blob->set_region_type(BRT_TEXT);
1166  blob->set_flow(BTFT_LEADER);
1167  }
1168  if (modified_blob_list) ComputeLimits();
1169  blob_type_ = BRT_TEXT;
1170  flow_ = BTFT_LEADER;
1171  } else if (textord_debug_tabfind) {
1172  if (best_end == nullptr) {
1173  tprintf("No path\n");
1174  } else {
1175  tprintf("Total cost = %d vs allowed %d\n", best_end->total_cost(),
1176  blob_count);
1177  }
1178  }
1179  delete [] projection;
1180  }
1181  return result;
1182 }
1183 
1184 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
1185 // horizontal text, negative for vertical text, and near zero for non-text),
1186 // sets the blob_type_ and flow_ for this partition to indicate whether it
1187 // is strongly or weakly vertical or horizontal text, or non-text.
1188 // The function assumes that the blob neighbours are valid (from
1189 // StrokeWidth::SetNeighbours) and that those neighbours have their
1190 // region_type() set.
1192  int blob_count = 0; // Total # blobs.
1193  int good_blob_score_ = 0; // Total # good strokewidth neighbours.
1194  int noisy_count = 0; // Total # neighbours marked as noise.
1195  int hline_count = 0;
1196  int vline_count = 0;
1197  BLOBNBOX_C_IT it(&boxes_);
1198  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1199  BLOBNBOX* blob = it.data();
1200  ++blob_count;
1201  noisy_count += blob->NoisyNeighbours();
1202  good_blob_score_ += blob->GoodTextBlob();
1203  if (blob->region_type() == BRT_HLINE) ++hline_count;
1204  if (blob->region_type() == BRT_VLINE) ++vline_count;
1205  }
1206  flow_ = BTFT_NEIGHBOURS;
1207  blob_type_ = BRT_UNKNOWN;
1208  if (hline_count > vline_count) {
1209  flow_ = BTFT_NONE;
1210  blob_type_ = BRT_HLINE;
1211  } else if (vline_count > hline_count) {
1212  flow_ = BTFT_NONE;
1213  blob_type_ = BRT_VLINE;
1214  } else if (value < -1 || 1 < value) {
1215  int long_side;
1216  int short_side;
1217  if (value > 0) {
1218  long_side = bounding_box_.width();
1219  short_side = bounding_box_.height();
1220  blob_type_ = BRT_TEXT;
1221  } else {
1222  long_side = bounding_box_.height();
1223  short_side = bounding_box_.width();
1224  blob_type_ = BRT_VERT_TEXT;
1225  }
1226  // We will combine the old metrics using aspect ratio and blob counts
1227  // with the input value by allowing a strong indication to flip the
1228  // STRONG_CHAIN/CHAIN flow values.
1229  int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
1230  if (short_side > kHorzStrongTextlineHeight) ++strong_score;
1231  if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
1232  if (abs(value) >= kMinStrongTextValue)
1233  flow_ = BTFT_STRONG_CHAIN;
1234  else if (abs(value) >= kMinChainTextValue)
1235  flow_ = BTFT_CHAIN;
1236  else
1237  flow_ = BTFT_NEIGHBOURS;
1238  // Upgrade chain to strong chain if the other indicators are good
1239  if (flow_ == BTFT_CHAIN && strong_score == 3)
1240  flow_ = BTFT_STRONG_CHAIN;
1241  // Downgrade strong vertical text to chain if the indicators are bad.
1242  if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
1243  flow_ = BTFT_CHAIN;
1244  }
1245  if (flow_ == BTFT_NEIGHBOURS) {
1246  // Check for noisy neighbours.
1247  if (noisy_count >= blob_count) {
1248  flow_ = BTFT_NONTEXT;
1249  blob_type_= BRT_NOISE;
1250  }
1251  }
1252  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1253  bounding_box_.bottom())) {
1254  tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1255  blob_count, noisy_count, good_blob_score_);
1256  tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
1257  value, flow_, blob_type_);
1258  Print();
1259  }
1260  SetBlobTypes();
1261 }
1262 
1263 // Sets all blobs with the partition blob type and flow, but never overwrite
1264 // leader blobs, as we need to be able to identify them later.
1266  if (!owns_blobs())
1267  return;
1268  BLOBNBOX_C_IT it(&boxes_);
1269  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1270  BLOBNBOX* blob = it.data();
1271  if (blob->flow() != BTFT_LEADER)
1272  blob->set_flow(flow_);
1273  blob->set_region_type(blob_type_);
1274  ASSERT_HOST(blob->owner() == nullptr || blob->owner() == this);
1275  }
1276 }
1277 
1278 // Returns true if a decent baseline can be fitted through the blobs.
1279 // Works for both horizontal and vertical text.
1281  // Approximation of the baseline.
1282  DetLineFit linepoints;
1283  // Calculation of the mean height on this line segment. Note that these
1284  // variable names apply to the context of a horizontal line, and work
1285  // analogously, rather than literally in the case of a vertical line.
1286  int total_height = 0;
1287  int coverage = 0;
1288  int height_count = 0;
1289  int width = 0;
1290  BLOBNBOX_C_IT it(&boxes_);
1291  TBOX box(it.data()->bounding_box());
1292  // Accumulate points representing the baseline at the middle of each blob,
1293  // but add an additional point for each end of the line. This makes it
1294  // harder to fit a severe skew angle, as it is most likely not right.
1295  if (IsVerticalType()) {
1296  // For a vertical line, use the right side as the baseline.
1297  ICOORD first_pt(box.right(), box.bottom());
1298  // Use the bottom-right of the first (bottom) box, the top-right of the
1299  // last, and the middle-right of all others.
1300  linepoints.Add(first_pt);
1301  for (it.forward(); !it.at_last(); it.forward()) {
1302  BLOBNBOX* blob = it.data();
1303  box = blob->bounding_box();
1304  ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1305  linepoints.Add(box_pt);
1306  total_height += box.width();
1307  coverage += box.height();
1308  ++height_count;
1309  }
1310  box = it.data()->bounding_box();
1311  ICOORD last_pt(box.right(), box.top());
1312  linepoints.Add(last_pt);
1313  width = last_pt.y() - first_pt.y();
1314 
1315  } else {
1316  // Horizontal lines use the bottom as the baseline.
1317  TBOX box(it.data()->bounding_box());
1318  // Use the bottom-left of the first box, the the bottom-right of the last,
1319  // and the middle of all others.
1320  ICOORD first_pt(box.left(), box.bottom());
1321  linepoints.Add(first_pt);
1322  for (it.forward(); !it.at_last(); it.forward()) {
1323  BLOBNBOX* blob = it.data();
1324  box = blob->bounding_box();
1325  ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1326  linepoints.Add(box_pt);
1327  total_height += box.height();
1328  coverage += box.width();
1329  ++height_count;
1330  }
1331  box = it.data()->bounding_box();
1332  ICOORD last_pt(box.right(), box.bottom());
1333  linepoints.Add(last_pt);
1334  width = last_pt.x() - first_pt.x();
1335  }
1336  // Maximum median error allowed to be a good text line.
1337  if (height_count == 0)
1338  return false;
1339  double max_error = kMaxBaselineError * total_height / height_count;
1340  ICOORD start_pt, end_pt;
1341  double error = linepoints.Fit(&start_pt, &end_pt);
1342  return error < max_error && coverage >= kMinBaselineCoverage * width;
1343 }
1344 
1345 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
1346 // otherwise starts a new one in the appropriate column, ending the previous.
1347 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
1348  int resolution,
1349  ColPartition_LIST* used_parts,
1350  WorkingPartSet_LIST* working_sets) {
1351  if (block_owned_)
1352  return; // Done it already.
1353  block_owned_ = true;
1354  WorkingPartSet_IT it(working_sets);
1355  // If there is an upper partner use its working_set_ directly.
1356  ColPartition* partner = SingletonPartner(true);
1357  if (partner != nullptr && partner->working_set_ != nullptr) {
1358  working_set_ = partner->working_set_;
1359  working_set_->AddPartition(this);
1360  return;
1361  }
1362  if (partner != nullptr && textord_debug_bugs) {
1363  tprintf("Partition with partner has no working set!:");
1364  Print();
1365  partner->Print();
1366  }
1367  // Search for the column that the left edge fits in.
1368  WorkingPartSet* work_set = nullptr;
1369  it.move_to_first();
1370  int col_index = 0;
1371  for (it.mark_cycle_pt(); !it.cycled_list() &&
1372  col_index != first_column_;
1373  it.forward(), ++col_index);
1374  if (textord_debug_tabfind >= 2) {
1375  tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
1376  Print();
1377  }
1378  if (it.cycled_list() && textord_debug_bugs) {
1379  tprintf("Target column=%d, only had %d\n", first_column_, col_index);
1380  }
1381  ASSERT_HOST(!it.cycled_list());
1382  work_set = it.data();
1383  // If last_column_ != first_column, then we need to scoop up all blocks
1384  // between here and the last_column_ and put back in work_set.
1385  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
1386  // Find the column that the right edge falls in.
1387  BLOCK_LIST completed_blocks;
1388  TO_BLOCK_LIST to_blocks;
1389  for (; !it.cycled_list() && col_index <= last_column_;
1390  it.forward(), ++col_index) {
1391  WorkingPartSet* end_set = it.data();
1392  end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
1393  &completed_blocks, &to_blocks);
1394  }
1395  work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1396  }
1397  working_set_ = work_set;
1398  work_set->AddPartition(this);
1399 }
1400 
1401 // From the given block_parts list, builds one or more BLOCKs and
1402 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
1403 // Created blocks are appended to the end of completed_blocks and to_blocks.
1404 // The used partitions are put onto used_parts, as they may still be referred
1405 // to in the partition grid. bleft, tright and resolution are the bounds
1406 // and resolution of the original image.
1407 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
1408  int resolution,
1409  ColPartition_LIST* block_parts,
1410  ColPartition_LIST* used_parts,
1411  BLOCK_LIST* completed_blocks,
1412  TO_BLOCK_LIST* to_blocks) {
1413  int page_height = tright.y() - bleft.y();
1414  // Compute the initial spacing stats.
1415  ColPartition_IT it(block_parts);
1416  int part_count = 0;
1417  int max_line_height = 0;
1418 
1419  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
1420  // because their line spacing with their neighbors maybe smaller and their
1421  // height may be slightly larger.
1422 
1423  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1424  ColPartition* part = it.data();
1425  ASSERT_HOST(!part->boxes()->empty());
1426  STATS side_steps(0, part->bounding_box().height());
1427  if (part->bounding_box().height() > max_line_height)
1428  max_line_height = part->bounding_box().height();
1429  BLOBNBOX_C_IT blob_it(part->boxes());
1430  int prev_bottom = blob_it.data()->bounding_box().bottom();
1431  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1432  BLOBNBOX* blob = blob_it.data();
1433  int bottom = blob->bounding_box().bottom();
1434  int step = bottom - prev_bottom;
1435  if (step < 0)
1436  step = -step;
1437  side_steps.add(step, 1);
1438  prev_bottom = bottom;
1439  }
1440  part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
1441  if (!it.at_last()) {
1442  ColPartition* next_part = it.data_relative(1);
1443  part->set_bottom_spacing(part->median_bottom() -
1444  next_part->median_bottom());
1445  part->set_top_spacing(part->median_top() - next_part->median_top());
1446  } else {
1447  part->set_bottom_spacing(page_height);
1448  part->set_top_spacing(page_height);
1449  }
1450  if (textord_debug_tabfind) {
1451  part->Print();
1452  tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1453  side_steps.median(), part->top_spacing(), part->bottom_spacing());
1454  }
1455  ++part_count;
1456  }
1457  if (part_count == 0)
1458  return;
1459 
1460  SmoothSpacings(resolution, page_height, block_parts);
1461 
1462  // Move the partitions into individual block lists and make the blocks.
1463  BLOCK_IT block_it(completed_blocks);
1464  TO_BLOCK_IT to_block_it(to_blocks);
1465  ColPartition_LIST spacing_parts;
1466  ColPartition_IT sp_block_it(&spacing_parts);
1467  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
1468  for (it.mark_cycle_pt(); !it.empty();) {
1469  ColPartition* part = it.extract();
1470  sp_block_it.add_to_end(part);
1471  it.forward();
1472  if (it.empty() || part->bottom_spacing() > same_block_threshold ||
1473  !part->SpacingsEqual(*it.data(), resolution)) {
1474  // There is a spacing boundary. Check to see if it.data() belongs
1475  // better in the current block or the next one.
1476  if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
1477  ColPartition* next_part = it.data();
1478  // If there is a size match one-way, then the middle line goes with
1479  // its matched size, otherwise it goes with the smallest spacing.
1480  ColPartition* third_part = it.at_last() ? nullptr : it.data_relative(1);
1481  if (textord_debug_tabfind) {
1482  tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
1483  " sizes %d %d %d\n",
1484  part->top_spacing(), part->bottom_spacing(),
1485  next_part->top_spacing(), next_part->bottom_spacing(),
1486  part->median_height(), next_part->median_height(),
1487  third_part != nullptr ? third_part->median_height() : 0);
1488  }
1489  // We can only consider adding the next line to the block if the sizes
1490  // match and the lines are close enough for their size.
1491  if (part->SizesSimilar(*next_part) &&
1492  next_part->median_height() * kMaxSameBlockLineSpacing >
1493  part->bottom_spacing() &&
1495  part->top_spacing()) {
1496  // Even now, we can only add it as long as the third line doesn't
1497  // match in the same way and have a smaller bottom spacing.
1498  if (third_part == nullptr ||
1499  !next_part->SizesSimilar(*third_part) ||
1500  third_part->median_height() * kMaxSameBlockLineSpacing <=
1501  next_part->bottom_spacing() ||
1502  next_part->median_height() * kMaxSameBlockLineSpacing <=
1503  next_part->top_spacing() ||
1504  next_part->bottom_spacing() > part->bottom_spacing()) {
1505  // Add to the current block.
1506  sp_block_it.add_to_end(it.extract());
1507  it.forward();
1508  if (textord_debug_tabfind) {
1509  tprintf("Added line to current block.\n");
1510  }
1511  }
1512  }
1513  }
1514  TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
1515  if (to_block != nullptr) {
1516  to_block_it.add_to_end(to_block);
1517  block_it.add_to_end(to_block->block);
1518  }
1519  sp_block_it.set_to_list(&spacing_parts);
1520  } else {
1521  if (textord_debug_tabfind && !it.empty()) {
1522  ColPartition* next_part = it.data();
1523  tprintf("Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\n",
1524  part->top_spacing(), part->bottom_spacing(),
1525  next_part->top_spacing(), next_part->bottom_spacing(),
1526  part->median_height(), next_part->median_height());
1527  }
1528  }
1529  }
1530 }
1531 
1532 // Helper function to clip the input pos to the given bleft, tright bounds.
1533 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
1534  if (pos->x() < bleft.x())
1535  pos->set_x(bleft.x());
1536  if (pos->x() > tright.x())
1537  pos->set_x(tright.x());
1538  if (pos->y() < bleft.y())
1539  pos->set_y(bleft.y());
1540  if (pos->y() > tright.y())
1541  pos->set_y(tright.y());
1542 }
1543 
1544 // Helper moves the blobs from the given list of block_parts into the block
1545 // itself. Sets up the block for (old) textline formation correctly for
1546 // vertical and horizontal text. The partitions are moved to used_parts
1547 // afterwards, as they cannot be deleted yet.
1548 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
1549  BLOCK* block,
1550  ColPartition_LIST* block_parts,
1551  ColPartition_LIST* used_parts) {
1552  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
1553  // Move all the parts to a done list as they are no longer needed, except
1554  // that have have to continue to exist until the part grid is deleted.
1555  // Compute the median blob size as we go, as the block needs to know.
1556  TBOX block_box(block->pdblk.bounding_box());
1557  STATS sizes(0, std::max(block_box.width(), block_box.height()));
1558  bool text_type = block->pdblk.poly_block()->IsText();
1559  ColPartition_IT it(block_parts);
1560  auto* to_block = new TO_BLOCK(block);
1561  BLOBNBOX_IT blob_it(&to_block->blobs);
1562  ColPartition_IT used_it(used_parts);
1563  for (it.move_to_first(); !it.empty(); it.forward()) {
1564  ColPartition* part = it.extract();
1565  // Transfer blobs from all regions to the output blocks.
1566  // Blobs for non-text regions will be used to define the polygonal
1567  // bounds of the region.
1568  for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
1569  bb_it.forward()) {
1570  BLOBNBOX* bblob = bb_it.extract();
1571  if (bblob->owner() != part) {
1572  tprintf("Ownership incorrect for blob:");
1573  bblob->bounding_box().print();
1574  tprintf("Part=");
1575  part->Print();
1576  if (bblob->owner() == nullptr) {
1577  tprintf("Not owned\n");
1578  } else {
1579  tprintf("Owner part:");
1580  bblob->owner()->Print();
1581  }
1582  }
1583  ASSERT_HOST(bblob->owner() == part);
1584  // Assert failure here is caused by arbitrarily changing the partition
1585  // type without also changing the blob type, such as in
1586  // InsertSmallBlobsAsUnknowns.
1587  ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
1588  C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
1589  C_OUTLINE_IT ol_it(outlines);
1590  ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1591  if (vertical_text)
1592  sizes.add(bblob->bounding_box().width(), 1);
1593  else
1594  sizes.add(bblob->bounding_box().height(), 1);
1595  blob_it.add_after_then_move(bblob);
1596  }
1597  used_it.add_to_end(part);
1598  }
1599  if (text_type && blob_it.empty()) {
1600  delete block;
1601  delete to_block;
1602  return nullptr;
1603  }
1604  to_block->line_size = sizes.median();
1605  if (vertical_text) {
1606  int block_width = block->pdblk.bounding_box().width();
1607  if (block_width < line_spacing)
1608  line_spacing = block_width;
1609  to_block->line_spacing = static_cast<float>(line_spacing);
1610  to_block->max_blob_size = static_cast<float>(block_width + 1);
1611  } else {
1612  int block_height = block->pdblk.bounding_box().height();
1613  if (block_height < line_spacing)
1614  line_spacing = block_height;
1615  to_block->line_spacing = static_cast<float>(line_spacing);
1616  to_block->max_blob_size = static_cast<float>(block_height + 1);
1617  }
1618  return to_block;
1619 }
1620 
1621 // Constructs a block from the given list of partitions.
1622 // Arguments are as LineSpacingBlocks above.
1623 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
1624  ColPartition_LIST* block_parts,
1625  ColPartition_LIST* used_parts) {
1626  if (block_parts->empty())
1627  return nullptr; // Nothing to do.
1628  // If the block_parts are not in reading order, then it will make an invalid
1629  // block polygon and bounding_box, so sort by bounding box now just to make
1630  // sure.
1631  block_parts->sort(&ColPartition::SortByBBox);
1632  ColPartition_IT it(block_parts);
1633  ColPartition* part = it.data();
1634  PolyBlockType type = part->type();
1635  if (type == PT_VERTICAL_TEXT)
1636  return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
1637  // LineSpacingBlocks has handed us a collection of evenly spaced lines and
1638  // put the average spacing in each partition, so we can just take the
1639  // linespacing from the first partition.
1640  int line_spacing = part->bottom_spacing();
1641  if (line_spacing < part->median_height())
1642  line_spacing = part->bounding_box().height();
1643  ICOORDELT_LIST vertices;
1644  ICOORDELT_IT vert_it(&vertices);
1645  ICOORD start, end;
1646  int min_x = INT32_MAX;
1647  int max_x = -INT32_MAX;
1648  int min_y = INT32_MAX;
1649  int max_y = -INT32_MAX;
1650  int iteration = 0;
1651  do {
1652  if (iteration == 0)
1653  ColPartition::LeftEdgeRun(&it, &start, &end);
1654  else
1655  ColPartition::RightEdgeRun(&it, &start, &end);
1656  ClipCoord(bleft, tright, &start);
1657  ClipCoord(bleft, tright, &end);
1658  vert_it.add_after_then_move(new ICOORDELT(start));
1659  vert_it.add_after_then_move(new ICOORDELT(end));
1660  UpdateRange(start.x(), &min_x, &max_x);
1661  UpdateRange(end.x(), &min_x, &max_x);
1662  UpdateRange(start.y(), &min_y, &max_y);
1663  UpdateRange(end.y(), &min_y, &max_y);
1664  if ((iteration == 0 && it.at_first()) ||
1665  (iteration == 1 && it.at_last())) {
1666  ++iteration;
1667  it.move_to_last();
1668  }
1669  } while (iteration < 2);
1671  tprintf("Making block at (%d,%d)->(%d,%d)\n",
1672  min_x, min_y, max_x, max_y);
1673  auto* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
1674  block->pdblk.set_poly_block(new POLY_BLOCK(&vertices, type));
1675  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
1676 }
1677 
1678 // Constructs a block from the given list of vertical text partitions.
1679 // Currently only creates rectangular blocks.
1681  const ICOORD& tright,
1682  ColPartition_LIST* block_parts,
1683  ColPartition_LIST* used_parts) {
1684  if (block_parts->empty())
1685  return nullptr; // Nothing to do.
1686  ColPartition_IT it(block_parts);
1687  ColPartition* part = it.data();
1688  TBOX block_box = part->bounding_box();
1689  int line_spacing = block_box.width();
1690  PolyBlockType type = it.data()->type();
1691  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1692  block_box += it.data()->bounding_box();
1693  }
1694  if (textord_debug_tabfind) {
1695  tprintf("Making block at:");
1696  block_box.print();
1697  }
1698  auto* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
1699  block_box.right(), block_box.top());
1700  block->pdblk.set_poly_block(new POLY_BLOCK(block_box, type));
1701  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
1702 }
1703 
1704 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
1705 // ownership to to returned TO_ROW.
1707  BLOBNBOX_C_IT blob_it(&boxes_);
1708  TO_ROW* row = nullptr;
1709  int line_size = IsVerticalType() ? median_width_ : median_height_;
1710  // Add all the blobs to a single TO_ROW.
1711  for (; !blob_it.empty(); blob_it.forward()) {
1712  BLOBNBOX* blob = blob_it.extract();
1713 // blob->compute_bounding_box();
1714  int top = blob->bounding_box().top();
1715  int bottom = blob->bounding_box().bottom();
1716  if (row == nullptr) {
1717  row = new TO_ROW(blob, static_cast<float>(top),
1718  static_cast<float>(bottom),
1719  static_cast<float>(line_size));
1720  } else {
1721  row->add_blob(blob, static_cast<float>(top),
1722  static_cast<float>(bottom),
1723  static_cast<float>(line_size));
1724  }
1725  }
1726  return row;
1727 }
1728 
1729 // Returns a copy of everything except the list of boxes. The resulting
1730 // ColPartition is only suitable for keeping in a column candidate list.
1732  auto* part = new ColPartition(blob_type_, vertical_);
1733  part->left_margin_ = left_margin_;
1734  part->right_margin_ = right_margin_;
1735  part->bounding_box_ = bounding_box_;
1736  memcpy(part->special_blobs_densities_, special_blobs_densities_,
1737  sizeof(special_blobs_densities_));
1738  part->median_bottom_ = median_bottom_;
1739  part->median_top_ = median_top_;
1740  part->median_height_ = median_height_;
1741  part->median_left_ = median_left_;
1742  part->median_right_ = median_right_;
1743  part->median_width_ = median_width_;
1744  part->good_width_ = good_width_;
1745  part->good_column_ = good_column_;
1746  part->left_key_tab_ = left_key_tab_;
1747  part->right_key_tab_ = right_key_tab_;
1748  part->type_ = type_;
1749  part->flow_ = flow_;
1750  part->left_key_ = left_key_;
1751  part->right_key_ = right_key_;
1752  part->first_column_ = first_column_;
1753  part->last_column_ = last_column_;
1754  part->owns_blobs_ = false;
1755  return part;
1756 }
1757 
1759  ColPartition* copy = ShallowCopy();
1760  copy->set_owns_blobs(false);
1761  BLOBNBOX_C_IT inserter(copy->boxes());
1762  BLOBNBOX_C_IT traverser(boxes());
1763  for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
1764  inserter.add_after_then_move(traverser.data());
1765  return copy;
1766 }
1767 
1768 #ifndef GRAPHICS_DISABLED
1769 // Provides a color for BBGrid to draw the rectangle.
1770 // Must be kept in sync with PolyBlockType.
1772  if (type_ == PT_UNKNOWN)
1773  return BLOBNBOX::TextlineColor(blob_type_, flow_);
1774  return POLY_BLOCK::ColorForPolyBlockType(type_);
1775 }
1776 #endif // GRAPHICS_DISABLED
1777 
1778 // Keep in sync with BlobRegionType.
1779 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
1780 
1781 // Prints debug information on this.
1782 void ColPartition::Print() const {
1783  int y = MidY();
1784  tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1785  " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1786  " ts=%d bs=%d ls=%d rs=%d\n",
1787  boxes_.empty() ? 'E' : ' ',
1788  left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
1789  bounding_box_.left(), median_left_,
1790  bounding_box_.bottom(), median_bottom_,
1791  bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
1792  right_margin_, median_right_, bounding_box_.top(), median_top_,
1793  good_width_, good_column_, type_,
1794  kBlobTypes[blob_type_], flow_,
1795  first_column_, last_column_, boxes_.length(),
1796  space_above_, space_below_, space_to_left_, space_to_right_);
1797 }
1798 
1799 // Prints debug information on the colors.
1801  tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
1802  color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
1803  color1_[L_ALPHA_CHANNEL],
1804  color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1805 }
1806 
1807 // Sets the types of all partitions in the run to be the max of the types.
1808 void ColPartition::SmoothPartnerRun(int working_set_count) {
1809  STATS left_stats(0, working_set_count);
1810  STATS right_stats(0, working_set_count);
1811  PolyBlockType max_type = type_;
1812  ColPartition* partner;
1813  for (partner = SingletonPartner(false); partner != nullptr;
1814  partner = partner->SingletonPartner(false)) {
1815  if (partner->type_ > max_type)
1816  max_type = partner->type_;
1817  if (column_set_ == partner->column_set_) {
1818  left_stats.add(partner->first_column_, 1);
1819  right_stats.add(partner->last_column_, 1);
1820  }
1821  }
1822  type_ = max_type;
1823  // TODO(rays) Either establish that it isn't necessary to set the columns,
1824  // or find a way to do it that does not cause an assert failure in
1825  // AddToWorkingSet.
1826 #if 0
1827  first_column_ = left_stats.mode();
1828  last_column_ = right_stats.mode();
1829  if (last_column_ < first_column_)
1830  last_column_ = first_column_;
1831 #endif
1832 
1833  for (partner = SingletonPartner(false); partner != nullptr;
1834  partner = partner->SingletonPartner(false)) {
1835  partner->type_ = max_type;
1836 #if 0 // See TODO above
1837  if (column_set_ == partner->column_set_) {
1838  partner->first_column_ = first_column_;
1839  partner->last_column_ = last_column_;
1840  }
1841 #endif
1842  }
1843 }
1844 
1845 // ======= Scenario common to all Refine*Partners* functions =======
1846 // ColPartitions are aiming to represent textlines, or horizontal slices
1847 // of images, and we are trying to form bi-directional (upper/lower) chains
1848 // of UNIQUE partner ColPartitions that can be made into blocks.
1849 // The ColPartitions have previously been typed (see SetPartitionType)
1850 // according to a combination of the content type and
1851 // how they lie on the columns. We want to chain text into
1852 // groups of a single type, but image ColPartitions may have been typed
1853 // differently in different parts of the image, due to being non-rectangular.
1854 //
1855 // We previously ran a search for upper and lower partners, but there may
1856 // be more than one, and they may be of mixed types, so now we wish to
1857 // refine the partners down to at most one.
1858 // A heading may have multiple partners:
1859 // ===============================
1860 // ======== ========== =========
1861 // ======== ========== =========
1862 // but it should be a different type.
1863 // A regular flowing text line may have multiple partners:
1864 // ================== ===================
1865 // ======= ================= ===========
1866 // This could be the start of a pull-out, or it might all be in a single
1867 // column and might be caused by tightly spaced text, bold words, bullets,
1868 // funny punctuation etc, all of which can cause textlines to be split into
1869 // multiple ColPartitions. Pullouts and figure captions should now be different
1870 // types so we can more aggressively merge groups of partners that all sit
1871 // in a single column.
1872 //
1873 // Cleans up the partners of the given type so that there is at most
1874 // one partner. This makes block creation simpler.
1875 // If get_desperate is true, goes to more desperate merge methods
1876 // to merge flowing text before breaking partnerships.
1878  ColPartitionGrid* grid) {
1879  if (TypesSimilar(type_, type)) {
1880  RefinePartnersInternal(true, get_desperate, grid);
1881  RefinePartnersInternal(false, get_desperate, grid);
1882  } else if (type == PT_COUNT) {
1883  // This is the final pass. Make sure only the correctly typed
1884  // partners surivive, however many there are.
1885  RefinePartnersByType(true, &upper_partners_);
1886  RefinePartnersByType(false, &lower_partners_);
1887  // It is possible for a merge to have given a partition multiple
1888  // partners again, so the last resort is to use overlap which is
1889  // guaranteed to leave at most one partner left.
1890  if (!upper_partners_.empty() && !upper_partners_.singleton())
1891  RefinePartnersByOverlap(true, &upper_partners_);
1892  if (!lower_partners_.empty() && !lower_partners_.singleton())
1893  RefinePartnersByOverlap(false, &lower_partners_);
1894  }
1895 }
1896 
1898 
1899 // Cleans up the partners above if upper is true, else below.
1900 // If get_desperate is true, goes to more desperate merge methods
1901 // to merge flowing text before breaking partnerships.
1902 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
1903  ColPartitionGrid* grid) {
1904  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
1905  if (!partners->empty() && !partners->singleton()) {
1906  RefinePartnersByType(upper, partners);
1907  if (!partners->empty() && !partners->singleton()) {
1908  // Check for transitive partnerships and break the cycle.
1909  RefinePartnerShortcuts(upper, partners);
1910  if (!partners->empty() && !partners->singleton()) {
1911  // Types didn't fix it. Flowing text keeps the one with the longest
1912  // sequence of singleton matching partners. All others max overlap.
1913  if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
1914  RefineTextPartnersByMerge(upper, false, partners, grid);
1915  if (!partners->empty() && !partners->singleton())
1916  RefineTextPartnersByMerge(upper, true, partners, grid);
1917  }
1918  // The last resort is to use overlap.
1919  if (!partners->empty() && !partners->singleton())
1920  RefinePartnersByOverlap(upper, partners);
1921  }
1922  }
1923  }
1924 }
1925 
1926 // Cleans up the partners above if upper is true, else below.
1927 // Restricts the partners to only desirable types. For text and BRT_HLINE this
1928 // means the same type_ , and for image types it means any image type.
1929 void ColPartition::RefinePartnersByType(bool upper,
1930  ColPartition_CLIST* partners) {
1931  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
1932  bounding_box_.bottom());
1933  if (debug) {
1934  tprintf("Refining %d %s partners by type for:\n",
1935  partners->length(), upper ? "Upper" : "Lower");
1936  Print();
1937  }
1938  ColPartition_C_IT it(partners);
1939  // Purify text by type.
1940  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
1941  // Keep only partners matching type_.
1942  // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
1943  // text types if it is the only partner.
1944  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1945  ColPartition* partner = it.data();
1946  if (!TypesSimilar(type_, partner->type_)) {
1947  if (debug) {
1948  tprintf("Removing partner:");
1949  partner->Print();
1950  }
1951  partner->RemovePartner(!upper, this);
1952  it.extract();
1953  } else if (debug) {
1954  tprintf("Keeping partner:");
1955  partner->Print();
1956  }
1957  }
1958  } else {
1959  // Only polyimages are allowed to have partners of any kind!
1960  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1961  ColPartition* partner = it.data();
1962  if (partner->blob_type() != BRT_POLYIMAGE ||
1963  blob_type() != BRT_POLYIMAGE) {
1964  if (debug) {
1965  tprintf("Removing partner:");
1966  partner->Print();
1967  }
1968  partner->RemovePartner(!upper, this);
1969  it.extract();
1970  } else if (debug) {
1971  tprintf("Keeping partner:");
1972  partner->Print();
1973  }
1974  }
1975  }
1976 }
1977 
1978 // Cleans up the partners above if upper is true, else below.
1979 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
1980 // Gets rid of this<->b, leaving a clean chain.
1981 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
1982 // this has multiple partners.
1983 void ColPartition::RefinePartnerShortcuts(bool upper,
1984  ColPartition_CLIST* partners) {
1985  bool done_any = false;
1986  do {
1987  done_any = false;
1988  ColPartition_C_IT it(partners);
1989  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1990  ColPartition* a = it.data();
1991  // Check for a match between all of a's partners (it1/b1) and all
1992  // of this's partners (it2/b2).
1993  ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
1994  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
1995  ColPartition* b1 = it1.data();
1996  if (b1 == this) {
1997  done_any = true;
1998  it.extract();
1999  a->RemovePartner(!upper, this);
2000  break;
2001  }
2002  ColPartition_C_IT it2(partners);
2003  for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2004  ColPartition* b2 = it2.data();
2005  if (b1 == b2) {
2006  // Jackpot! b2 should not be a partner of this.
2007  it2.extract();
2008  b2->RemovePartner(!upper, this);
2009  done_any = true;
2010  // That potentially invalidated all the iterators, so break out
2011  // and start again.
2012  break;
2013  }
2014  }
2015  if (done_any)
2016  break;
2017  }
2018  if (done_any)
2019  break;
2020  }
2021  } while (done_any && !partners->empty() && !partners->singleton());
2022 }
2023 
2024 // Cleans up the partners above if upper is true, else below.
2025 // If multiple text partners can be merged, (with each other, NOT with this),
2026 // then do so.
2027 // If desperate is true, then an increase in overlap with the merge is
2028 // allowed. If the overlap increases, then the desperately_merged_ flag
2029 // is set, indicating that the textlines probably need to be regenerated
2030 // by aggressive line fitting/splitting, as there are probably vertically
2031 // joined blobs that cross textlines.
2032 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
2033  ColPartition_CLIST* partners,
2034  ColPartitionGrid* grid) {
2035  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2036  bounding_box_.bottom());
2037  if (debug) {
2038  tprintf("Refining %d %s partners by merge for:\n",
2039  partners->length(), upper ? "Upper" : "Lower");
2040  Print();
2041  }
2042  while (!partners->empty() && !partners->singleton()) {
2043  // Absorb will mess up the iterators, so we have to merge one partition
2044  // at a time and rebuild the iterators each time.
2045  ColPartition_C_IT it(partners);
2046  ColPartition* part = it.data();
2047  // Gather a list of merge candidates, from the list of partners, that
2048  // are all in the same single column. See general scenario comment above.
2049  ColPartition_CLIST candidates;
2050  ColPartition_C_IT cand_it(&candidates);
2051  for (it.forward(); !it.at_first(); it.forward()) {
2052  ColPartition* candidate = it.data();
2053  if (part->first_column_ == candidate->last_column_ &&
2054  part->last_column_ == candidate->first_column_)
2055  cand_it.add_after_then_move(it.data());
2056  }
2057  int overlap_increase;
2058  ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
2059  nullptr, &overlap_increase);
2060  if (candidate != nullptr && (overlap_increase <= 0 || desperate)) {
2061  if (debug) {
2062  tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2063  part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2064  overlap_increase);
2065  }
2066  // Remove before merge and re-insert to keep the integrity of the grid.
2067  grid->RemoveBBox(candidate);
2068  grid->RemoveBBox(part);
2069  part->Absorb(candidate, nullptr);
2070  // We modified the box of part, so re-insert it into the grid.
2071  grid->InsertBBox(true, true, part);
2072  if (overlap_increase > 0)
2073  part->desperately_merged_ = true;
2074  } else {
2075  break; // Can't merge.
2076  }
2077  }
2078 }
2079 
2080 // Cleans up the partners above if upper is true, else below.
2081 // Keep the partner with the biggest overlap.
2082 void ColPartition::RefinePartnersByOverlap(bool upper,
2083  ColPartition_CLIST* partners) {
2084  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2085  bounding_box_.bottom());
2086  if (debug) {
2087  tprintf("Refining %d %s partners by overlap for:\n",
2088  partners->length(), upper ? "Upper" : "Lower");
2089  Print();
2090  }
2091  ColPartition_C_IT it(partners);
2092  ColPartition* best_partner = it.data();
2093  // Find the partner with the best overlap.
2094  int best_overlap = 0;
2095  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2096  ColPartition* partner = it.data();
2097  int overlap = std::min(bounding_box_.right(), partner->bounding_box_.right())
2098  - std::max(bounding_box_.left(), partner->bounding_box_.left());
2099  if (overlap > best_overlap) {
2100  best_overlap = overlap;
2101  best_partner = partner;
2102  }
2103  }
2104  // Keep only the best partner.
2105  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2106  ColPartition* partner = it.data();
2107  if (partner != best_partner) {
2108  if (debug) {
2109  tprintf("Removing partner:");
2110  partner->Print();
2111  }
2112  partner->RemovePartner(!upper, this);
2113  it.extract();
2114  }
2115  }
2116 }
2117 
2118 // Return true if bbox belongs better in this than other.
2119 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
2120  const ColPartition& other) {
2121  const TBOX& box = bbox->bounding_box();
2122  // Margins take priority.
2123  int left = box.left();
2124  int right = box.right();
2125  if (left < left_margin_ || right > right_margin_)
2126  return false;
2127  if (left < other.left_margin_ || right > other.right_margin_)
2128  return true;
2129  int top = box.top();
2130  int bottom = box.bottom();
2131  int this_overlap = std::min(top, median_top_) - std::max(bottom, median_bottom_);
2132  int other_overlap = std::min(top, other.median_top_) -
2133  std::max(bottom, other.median_bottom_);
2134  int this_miss = median_top_ - median_bottom_ - this_overlap;
2135  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2136  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
2137  tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2138  box.left(), box.bottom(), box.right(), box.top(),
2139  this_overlap, other_overlap, this_miss, other_miss,
2140  median_top_, other.median_top_);
2141  }
2142  if (this_miss < other_miss)
2143  return true;
2144  if (this_miss > other_miss)
2145  return false;
2146  if (this_overlap > other_overlap)
2147  return true;
2148  if (this_overlap < other_overlap)
2149  return false;
2150  return median_top_ >= other.median_top_;
2151 }
2152 
2153 // Returns the median line-spacing between the current position and the end
2154 // of the list.
2155 // The iterator is passed by value so the iteration does not modify the
2156 // caller's iterator.
2157 static int MedianSpacing(int page_height, ColPartition_IT it) {
2158  STATS stats(0, page_height);
2159  while (!it.cycled_list()) {
2160  ColPartition* part = it.data();
2161  it.forward();
2162  stats.add(part->bottom_spacing(), 1);
2163  stats.add(part->top_spacing(), 1);
2164  }
2165  return static_cast<int>(stats.median() + 0.5);
2166 }
2167 
2168 // Returns true if this column partition is in the same column as
2169 // part. This function will only work after the SetPartitionType function
2170 // has been called on both column partitions. This is useful for
2171 // doing a SideSearch when you want things in the same page column.
2172 //
2173 // Currently called by the table detection code to identify if potential table
2174 // partitions exist in the same column.
2176  // Overlap does not occur when last < part.first or first > part.last.
2177  // In other words, one is completely to the side of the other.
2178  // This is just DeMorgan's law applied to that so the function returns true.
2179  return (last_column_ >= part.first_column_) &&
2180  (first_column_ <= part.last_column_);
2181 }
2182 
2183 // Smoothes the spacings in the list into groups of equal linespacing.
2184 // resolution is the resolution of the original image, used as a basis
2185 // for thresholds in change of spacing. page_height is in pixels.
2186 void ColPartition::SmoothSpacings(int resolution, int page_height,
2187  ColPartition_LIST* parts) {
2188  // The task would be trivial if we didn't have to allow for blips -
2189  // occasional offsets in spacing caused by anomalous text, such as all
2190  // caps, groups of descenders, joined words, Arabic etc.
2191  // The neighbourhood stores a consecutive group of partitions so that
2192  // blips can be detected correctly, yet conservatively enough to not
2193  // mistake genuine spacing changes for blips. See example below.
2194  ColPartition* neighbourhood[PN_COUNT];
2195  ColPartition_IT it(parts);
2196  it.mark_cycle_pt();
2197  // Although we know nothing about the spacings is this list, the median is
2198  // used as an approximation to allow blips.
2199  // If parts of this block aren't spaced to the median, then we can't
2200  // accept blips in those parts, but we'll recalculate it each time we
2201  // split the block, so the median becomes more likely to match all the text.
2202  int median_space = MedianSpacing(page_height, it);
2203  ColPartition_IT start_it(it);
2204  ColPartition_IT end_it(it);
2205  for (int i = 0; i < PN_COUNT; ++i) {
2206  if (i < PN_UPPER || it.cycled_list()) {
2207  neighbourhood[i] = nullptr;
2208  } else {
2209  if (i == PN_LOWER)
2210  end_it = it;
2211  neighbourhood[i] = it.data();
2212  it.forward();
2213  }
2214  }
2215  while (neighbourhood[PN_UPPER] != nullptr) {
2216  // Test for end of a group. Normally SpacingsEqual is true within a group,
2217  // but in the case of a blip, it will be false. Here is an example:
2218  // Line enum Spacing below (spacing between tops of lines)
2219  // 1 ABOVE2 20
2220  // 2 ABOVE1 20
2221  // 3 UPPER 15
2222  // 4 LOWER 25
2223  // 5 BELOW1 20
2224  // 6 BELOW2 20
2225  // Line 4 is all in caps (regular caps), so the spacing between line 3
2226  // and line 4 (looking at the tops) is smaller than normal, and the
2227  // spacing between line 4 and line 5 is larger than normal, but the
2228  // two of them add to twice the normal spacing.
2229  // The following if has to accept unequal spacings 3 times to pass the
2230  // blip (20/15, 15/25 and 25/20)
2231  // When the blip is in the middle, OKSpacingBlip tests that one of
2232  // ABOVE1 and BELOW1 matches the median.
2233  // The first time, everything is shifted down 1, so we present
2234  // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
2235  // The last time, everything is shifted up 1, so we present OKSpacingBlip
2236  // with neighbourhood-1 and check that PN_LOWER matches the median.
2237  if (neighbourhood[PN_LOWER] == nullptr ||
2238  (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2239  resolution) &&
2240  !OKSpacingBlip(resolution, median_space, neighbourhood) &&
2241  (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
2242  !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2243  (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
2244  !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2245  // The group has ended. PN_UPPER is the last member.
2246  // Compute the mean spacing over the group.
2247  ColPartition_IT sum_it(start_it);
2248  ColPartition* last_part = neighbourhood[PN_UPPER];
2249  double total_bottom = 0.0;
2250  double total_top = 0.0;
2251  int total_count = 0;
2252  ColPartition* upper = sum_it.data();
2253  // We do not process last_part, as its spacing is different.
2254  while (upper != last_part) {
2255  total_bottom += upper->bottom_spacing();
2256  total_top += upper->top_spacing();
2257  ++total_count;
2258  sum_it.forward();
2259  upper = sum_it.data();
2260  }
2261  if (total_count > 0) {
2262  // There were at least 2 lines, so set them all to the mean.
2263  int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2264  int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2265  if (textord_debug_tabfind) {
2266  tprintf("Spacing run ended. Cause:");
2267  if (neighbourhood[PN_LOWER] == nullptr) {
2268  tprintf("No more lines\n");
2269  } else {
2270  tprintf("Spacing change. Spacings:\n");
2271  for (int i = 0; i < PN_COUNT; ++i) {
2272  if (neighbourhood[i] == nullptr) {
2273  tprintf("NULL");
2274  if (i > 0 && neighbourhood[i - 1] != nullptr) {
2275  if (neighbourhood[i - 1]->SingletonPartner(false) != nullptr) {
2276  tprintf(" Lower partner:");
2277  neighbourhood[i - 1]->SingletonPartner(false)->Print();
2278  } else {
2279  tprintf(" nullptr lower partner:\n");
2280  }
2281  } else {
2282  tprintf("\n");
2283  }
2284  } else {
2285  tprintf("Top = %d, bottom = %d\n",
2286  neighbourhood[i]->top_spacing(),
2287  neighbourhood[i]->bottom_spacing());
2288  }
2289  }
2290  }
2291  tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2292  }
2293  sum_it = start_it;
2294  upper = sum_it.data();
2295  while (upper != last_part) {
2296  upper->set_top_spacing(top_spacing);
2297  upper->set_bottom_spacing(bottom_spacing);
2298  if (textord_debug_tabfind) {
2299  tprintf("Setting mean on:");
2300  upper->Print();
2301  }
2302  sum_it.forward();
2303  upper = sum_it.data();
2304  }
2305  }
2306  // PN_LOWER starts the next group and end_it is the next start_it.
2307  start_it = end_it;
2308  // Recalculate the median spacing to maximize the chances of detecting
2309  // spacing blips.
2310  median_space = MedianSpacing(page_height, end_it);
2311  }
2312  // Shuffle pointers.
2313  for (int j = 1; j < PN_COUNT; ++j) {
2314  neighbourhood[j - 1] = neighbourhood[j];
2315  }
2316  if (it.cycled_list()) {
2317  neighbourhood[PN_COUNT - 1] = nullptr;
2318  } else {
2319  neighbourhood[PN_COUNT - 1] = it.data();
2320  it.forward();
2321  }
2322  end_it.forward();
2323  }
2324 }
2325 
2326 // Returns true if the parts array of pointers to partitions matches the
2327 // condition for a spacing blip. See SmoothSpacings for what this means
2328 // and how it is used.
2329 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
2330  ColPartition** parts) {
2331  if (parts[PN_UPPER] == nullptr || parts[PN_LOWER] == nullptr)
2332  return false;
2333  // The blip is OK if upper and lower sum to an OK value and at least
2334  // one of above1 and below1 is equal to the median.
2335  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
2336  median_spacing, resolution) &&
2337  ((parts[PN_ABOVE1] != nullptr &&
2338  parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2339  (parts[PN_BELOW1] != nullptr &&
2340  parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2341 }
2342 
2343 // Returns true if both the top and bottom spacings of this match the given
2344 // spacing to within suitable margins dictated by the image resolution.
2345 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
2346  int bottom_error = BottomSpacingMargin(resolution);
2347  int top_error = TopSpacingMargin(resolution);
2348  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2349  NearlyEqual(top_spacing_, spacing, top_error);
2350 }
2351 
2352 // Returns true if both the top and bottom spacings of this and other
2353 // match to within suitable margins dictated by the image resolution.
2354 bool ColPartition::SpacingsEqual(const ColPartition& other,
2355  int resolution) const {
2356  int bottom_error = std::max(BottomSpacingMargin(resolution),
2357  other.BottomSpacingMargin(resolution));
2358  int top_error = std::max(TopSpacingMargin(resolution),
2359  other.TopSpacingMargin(resolution));
2360  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2361  (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2362  NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2363  bottom_error));
2364 }
2365 
2366 // Returns true if the sum spacing of this and other match the given
2367 // spacing (or twice the given spacing) to within a suitable margin dictated
2368 // by the image resolution.
2369 bool ColPartition::SummedSpacingOK(const ColPartition& other,
2370  int spacing, int resolution) const {
2371  int bottom_error = std::max(BottomSpacingMargin(resolution),
2372  other.BottomSpacingMargin(resolution));
2373  int top_error = std::max(TopSpacingMargin(resolution),
2374  other.TopSpacingMargin(resolution));
2375  int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2376  int top_total = top_spacing_ + other.top_spacing_;
2377  return (NearlyEqual(spacing, bottom_total, bottom_error) &&
2378  NearlyEqual(spacing, top_total, top_error)) ||
2379  (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2380  NearlyEqual(spacing * 2, top_total, top_error));
2381 }
2382 
2383 // Returns a suitable spacing margin that can be applied to bottoms of
2384 // text lines, based on the resolution and the stored side_step_.
2385 int ColPartition::BottomSpacingMargin(int resolution) const {
2386  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
2387 }
2388 
2389 // Returns a suitable spacing margin that can be applied to tops of
2390 // text lines, based on the resolution and the stored side_step_.
2391 int ColPartition::TopSpacingMargin(int resolution) const {
2392  return static_cast<int>(kMaxTopSpacingFraction * median_height_ + 0.5) +
2393  BottomSpacingMargin(resolution);
2394 }
2395 
2396 // Returns true if the median text sizes of this and other agree to within
2397 // a reasonable multiplicative factor.
2398 bool ColPartition::SizesSimilar(const ColPartition& other) const {
2399  return median_height_ <= other.median_height_ * kMaxSizeRatio &&
2400  other.median_height_ <= median_height_ * kMaxSizeRatio;
2401 }
2402 
2403 // Helper updates margin_left and margin_right, being the bounds of the left
2404 // margin of part of a block. Returns false and does not update the bounds if
2405 // this partition has a disjoint margin with the established margin.
2406 static bool UpdateLeftMargin(const ColPartition& part,
2407  int* margin_left, int* margin_right) {
2408  const TBOX& part_box = part.bounding_box();
2409  int top = part_box.top();
2410  int bottom = part_box.bottom();
2411  int tl_key = part.SortKey(part.left_margin(), top);
2412  int tr_key = part.SortKey(part_box.left(), top);
2413  int bl_key = part.SortKey(part.left_margin(), bottom);
2414  int br_key = part.SortKey(part_box.left(), bottom);
2415  int left_key = std::max(tl_key, bl_key);
2416  int right_key = std::min(tr_key, br_key);
2417  if (left_key <= *margin_right && right_key >= *margin_left) {
2418  // This part is good - let's keep it.
2419  *margin_right = std::min(*margin_right, right_key);
2420  *margin_left = std::max(*margin_left, left_key);
2421  return true;
2422  }
2423  return false;
2424 }
2425 
2426 // Computes and returns in start, end a line segment formed from a
2427 // forwards-iterated group of left edges of partitions that satisfy the
2428 // condition that the intersection of the left margins is non-empty, ie the
2429 // rightmost left margin is to the left of the leftmost left bounding box edge.
2430 // On return the iterator is set to the start of the next run.
2431 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
2432  ICOORD* start, ICOORD* end) {
2433  ColPartition* part = part_it->data();
2434  ColPartition* start_part = part;
2435  int start_y = part->bounding_box_.top();
2436  if (!part_it->at_first()) {
2437  int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2438  if (prev_bottom < start_y)
2439  start_y = prev_bottom;
2440  else if (prev_bottom > start_y)
2441  start_y = (start_y + prev_bottom) / 2;
2442  }
2443  int end_y = part->bounding_box_.bottom();
2444  int margin_right = INT32_MAX;
2445  int margin_left = -INT32_MAX;
2446  UpdateLeftMargin(*part, &margin_left, &margin_right);
2447  do {
2448  part_it->forward();
2449  part = part_it->data();
2450  } while (!part_it->at_first() &&
2451  UpdateLeftMargin(*part, &margin_left, &margin_right));
2452  // The run ended. If we were pushed inwards, compute the next run and
2453  // extend it backwards into the run we just calculated to find the end of
2454  // this run that provides a tight box.
2455  int next_margin_right = INT32_MAX;
2456  int next_margin_left = -INT32_MAX;
2457  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2458  if (next_margin_left > margin_right) {
2459  ColPartition_IT next_it(*part_it);
2460  do {
2461  next_it.forward();
2462  part = next_it.data();
2463  } while (!next_it.at_first() &&
2464  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2465  // Now extend the next run backwards into the original run to get the
2466  // tightest fit.
2467  do {
2468  part_it->backward();
2469  part = part_it->data();
2470  } while (part != start_part &&
2471  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2472  part_it->forward();
2473  }
2474  // Now calculate the end_y.
2475  part = part_it->data_relative(-1);
2476  end_y = part->bounding_box_.bottom();
2477  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
2478  end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2479  start->set_y(start_y);
2480  start->set_x(part->XAtY(margin_right, start_y));
2481  end->set_y(end_y);
2482  end->set_x(part->XAtY(margin_right, end_y));
2483  if (textord_debug_tabfind && !part_it->at_first())
2484  tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2485  start_y, end_y, part->XAtY(margin_left, end_y),
2486  end->x(), part->left_margin_, part->bounding_box_.left());
2487 }
2488 
2489 // Helper updates margin_left and margin_right, being the bounds of the right
2490 // margin of part of a block. Returns false and does not update the bounds if
2491 // this partition has a disjoint margin with the established margin.
2492 static bool UpdateRightMargin(const ColPartition& part,
2493  int* margin_left, int* margin_right) {
2494  const TBOX& part_box = part.bounding_box();
2495  int top = part_box.top();
2496  int bottom = part_box.bottom();
2497  int tl_key = part.SortKey(part_box.right(), top);
2498  int tr_key = part.SortKey(part.right_margin(), top);
2499  int bl_key = part.SortKey(part_box.right(), bottom);
2500  int br_key = part.SortKey(part.right_margin(), bottom);
2501  int left_key = std::max(tl_key, bl_key);
2502  int right_key = std::min(tr_key, br_key);
2503  if (left_key <= *margin_right && right_key >= *margin_left) {
2504  // This part is good - let's keep it.
2505  *margin_right = std::min(*margin_right, right_key);
2506  *margin_left = std::max(*margin_left, left_key);
2507  return true;
2508  }
2509  return false;
2510 }
2511 
2512 // Computes and returns in start, end a line segment formed from a
2513 // backwards-iterated group of right edges of partitions that satisfy the
2514 // condition that the intersection of the right margins is non-empty, ie the
2515 // leftmost right margin is to the right of the rightmost right bounding box
2516 // edge.
2517 // On return the iterator is set to the start of the next run.
2518 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
2519  ICOORD* start, ICOORD* end) {
2520  ColPartition* part = part_it->data();
2521  ColPartition* start_part = part;
2522  int start_y = part->bounding_box_.bottom();
2523  if (!part_it->at_last()) {
2524  int next_y = part_it->data_relative(1)->bounding_box_.top();
2525  if (next_y > start_y)
2526  start_y = next_y;
2527  else if (next_y < start_y)
2528  start_y = (start_y + next_y) / 2;
2529  }
2530  int end_y = part->bounding_box_.top();
2531  int margin_right = INT32_MAX;
2532  int margin_left = -INT32_MAX;
2533  UpdateRightMargin(*part, &margin_left, &margin_right);
2534  do {
2535  part_it->backward();
2536  part = part_it->data();
2537  } while (!part_it->at_last() &&
2538  UpdateRightMargin(*part, &margin_left, &margin_right));
2539  // The run ended. If we were pushed inwards, compute the next run and
2540  // extend it backwards to find the end of this run for a tight box.
2541  int next_margin_right = INT32_MAX;
2542  int next_margin_left = -INT32_MAX;
2543  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2544  if (next_margin_right < margin_left) {
2545  ColPartition_IT next_it(*part_it);
2546  do {
2547  next_it.backward();
2548  part = next_it.data();
2549  } while (!next_it.at_last() &&
2550  UpdateRightMargin(*part, &next_margin_left,
2551  &next_margin_right));
2552  // Now extend the next run forwards into the original run to get the
2553  // tightest fit.
2554  do {
2555  part_it->forward();
2556  part = part_it->data();
2557  } while (part != start_part &&
2558  UpdateRightMargin(*part, &next_margin_left,
2559  &next_margin_right));
2560  part_it->backward();
2561  }
2562  // Now calculate the end_y.
2563  part = part_it->data_relative(1);
2564  end_y = part->bounding_box().top();
2565  if (!part_it->at_last() &&
2566  part_it->data()->bounding_box_.bottom() > end_y)
2567  end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2568  start->set_y(start_y);
2569  start->set_x(part->XAtY(margin_left, start_y));
2570  end->set_y(end_y);
2571  end->set_x(part->XAtY(margin_left, end_y));
2572  if (textord_debug_tabfind && !part_it->at_last())
2573  tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2574  start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
2575  part->bounding_box_.right(), part->right_margin_);
2576 }
2577 
2578 } // namespace tesseract.
ELIST2IZE
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:928
TBOX
Definition: cleanapi_test.cc:19
tesseract::ColPartition::set_right_margin
void set_right_margin(int margin)
Definition: colpartition.h:121
tesseract::ColPartition::bottom_spacing
int bottom_spacing() const
Definition: colpartition.h:220
tesseract::ColPartition::set_block_owned
void set_block_owned(bool owned)
Definition: colpartition.h:208
BlobTextFlowType
BlobTextFlowType
Definition: blobbox.h:113
ICOORD::set_x
void set_x(int16_t xin)
rewrite function
Definition: points.h:60
C_BLOB::FakeBlob
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:236
tesseract::CST_FLOWING
Definition: colpartition.h:49
tesseract::ColPartition::SetRegionAndFlowTypesFromProjectionValue
void SetRegionAndFlowTypesFromProjectionValue(int value)
Definition: colpartition.cpp:1191
tesseract::CST_PULLOUT
Definition: colpartition.h:51
tesseract::ColPartition::Print
void Print() const
Definition: colpartition.cpp:1782
tesseract::TabVector
Definition: tabvector.h:111
BTFT_NONE
Definition: blobbox.h:114
DominatesInMerge
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:128
tesseract::ColPartition::PrintColors
void PrintColors()
Definition: colpartition.cpp:1800
BLOBNBOX::NoisyNeighbours
int NoisyNeighbours() const
Definition: blobbox.cpp:235
tesseract::ColPartition::IsLineType
bool IsLineType() const
Definition: colpartition.h:425
C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:69
BTFT_STRONG_CHAIN
Definition: blobbox.h:118
PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:58
host.h
dppoint.h
tesseract::ColPartition::OverlapSplitBlob
BLOBNBOX * OverlapSplitBlob(const TBOX &box)
Definition: colpartition.cpp:769
tesseract::ColPartition::BoxLeftKey
int BoxLeftKey() const
Definition: colpartition.h:332
tesseract::kMaxLeaderGapFractionOfMin
const double kMaxLeaderGapFractionOfMin
Definition: colpartition.cpp:55
BRT_COUNT
Definition: blobbox.h:81
tesseract::ColPartition::set_bottom_spacing
void set_bottom_spacing(int spacing)
Definition: colpartition.h:223
tesseract::ColPartition::top_spacing
int top_spacing() const
Definition: colpartition.h:226
tesseract::ColPartition::TypesSimilar
static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2)
Definition: colpartition.h:418
BRT_NOISE
Definition: blobbox.h:72
tesseract::kMinBaselineCoverage
const double kMinBaselineCoverage
Definition: colpartition.cpp:72
tesseract::ColPartition::LineSpacingBlocks
static void LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts, BLOCK_LIST *completed_blocks, TO_BLOCK_LIST *to_blocks)
Definition: colpartition.cpp:1407
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
tesseract::kMinChainTextValue
const int kMinChainTextValue
Definition: colpartition.cpp:61
tesseract::kMinLeaderCount
const int kMinLeaderCount
Definition: colpartition.cpp:57
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::ColPartition::median_width
int median_width() const
Definition: colpartition.h:142
tesseract::ColPartition::flow
BlobTextFlowType flow() const
Definition: colpartition.h:154
tesseract::ColPartition::HasGoodBaseline
bool HasGoodBaseline()
Definition: colpartition.cpp:1280
NearlyEqual
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:36
tesseract::ColPartition::ColumnRange
void ColumnRange(int resolution, ColPartitionSet *columns, int *first_col, int *last_col)
Definition: colpartition.cpp:1056
tesseract::WorkingPartSet::AddPartition
void AddPartition(ColPartition *part)
Definition: workingpartset.cpp:31
BLOBNBOX::base_char_top
int base_char_top() const
Definition: blobbox.h:382
tesseract::ColPartition::MatchingSizes
bool MatchingSizes(const ColPartition &other) const
Definition: colpartition.cpp:405
tesseract::ColPartition::ReleaseNonLeaderBoxes
bool ReleaseNonLeaderBoxes()
Definition: colpartition.cpp:289
TBOX::overlap
bool overlap(const TBOX &box) const
Definition: rect.h:350
tesseract::kColumnWidthFactor
const int kColumnWidthFactor
Definition: tabfind.h:41
BRT_UNKNOWN
Definition: blobbox.h:77
tesseract::ColPartition::IsLegal
bool IsLegal()
Definition: colpartition.cpp:342
tesseract::ColPartition::IsPulloutType
bool IsPulloutType() const
Definition: colpartition.h:437
ICOORD
integer coordinate
Definition: points.h:30
tesseract::WidthCallback
std::function< bool(int)> WidthCallback
Definition: tabfind.h:35
BLOBNBOX::set_flow
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:297
BlobSpecialTextType
BlobSpecialTextType
Definition: blobbox.h:95
TBOX::print
void print() const
Definition: rect.h:277
tesseract::ColPartition::SpecialBlobsDensity
float SpecialBlobsDensity(const BlobSpecialTextType type) const
Definition: colpartition.cpp:556
PT_HEADING_IMAGE
Definition: capi.h:118
tesseract::kMaxLeaderGapFractionOfMax
const double kMaxLeaderGapFractionOfMax
Definition: colpartition.cpp:53
tesseract::ColPartition::median_height
int median_height() const
Definition: colpartition.h:136
TBOX::top
int16_t top() const
Definition: rect.h:57
tesseract::ColPartition::type
PolyBlockType type() const
Definition: colpartition.h:181
PT_NOISE
Definition: capi.h:122
TBOX::area
int32_t area() const
Definition: rect.h:121
tesseract::ColPartition::RemovePartner
void RemovePartner(bool upper, ColPartition *partner)
Definition: colpartition.cpp:618
TO_BLOCK
Definition: blobbox.h:691
BRT_VERT_TEXT
Definition: blobbox.h:78
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
tesseract::WorkingPartSet::ExtractCompletedBlocks
void ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: workingpartset.cpp:55
tesseract::ColPartitionSet::SpanningType
ColumnSpanningType SpanningType(int resolution, int left, int right, int height, int y, int left_margin, int right_margin, int *first_col, int *last_col, int *first_spanned_col)
Definition: colpartitionset.cpp:404
PT_FLOWING_IMAGE
Definition: capi.h:117
colpartitionset.h
tesseract::DPPoint
Definition: dppoint.h:59
tesseract::ColPartition::median_top
int median_top() const
Definition: colpartition.h:124
detlinefit.h
colpartition.h
tesseract::ColPartitionSet
Definition: colpartitionset.h:39
tesseract::ColPartition::median_bottom
int median_bottom() const
Definition: colpartition.h:127
PT_TABLE
Definition: capi.h:114
tesseract::CST_HEADING
Definition: colpartition.h:50
tesseract::DPPoint::CostWithVariance
int64_t CostWithVariance(const DPPoint *prev)
Definition: dppoint.cpp:85
BRT_RECTIMAGE
Definition: blobbox.h:75
tesseract::ColPartition::boxes
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
ICOORD::x
int16_t x() const
access function
Definition: points.h:51
tesseract::kMinStrongTextValue
const int kMinStrongTextValue
Definition: colpartition.cpp:59
BLOBNBOX
Definition: blobbox.h:142
tesseract::ColPartition::set_top_spacing
void set_top_spacing(int spacing)
Definition: colpartition.h:229
tesseract::ColPartition::CopyLeftTab
void CopyLeftTab(const ColPartition &src, bool take_box)
Definition: colpartition.cpp:519
BTFT_CHAIN
Definition: blobbox.h:117
tesseract::DetLineFit
Definition: detlinefit.h:56
BRT_HLINE
Definition: blobbox.h:73
BTFT_LEADER
Definition: blobbox.h:120
ICOORD::set_y
void set_y(int16_t yin)
rewrite function
Definition: points.h:64
BRT_POLYIMAGE
Definition: blobbox.h:76
PT_VERTICAL_TEXT
Definition: capi.h:115
PT_COUNT
Definition: capi.h:123
tesseract::ColPartition
Definition: colpartition.h:67
TBOX::height
int16_t height() const
Definition: rect.h:107
BLOBNBOX::TextlineColor
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:442
BTFT_NONTEXT
Definition: blobbox.h:115
BLOBNBOX::GoodTextBlob
int GoodTextBlob() const
Definition: blobbox.cpp:224
tesseract::ColPartition::BoxRightKey
int BoxRightKey() const
Definition: colpartition.h:336
tesseract::kMaxColorDistance
const int kMaxColorDistance
Definition: colpartition.cpp:77
tesseract::ColPartition::SplitAt
ColPartition * SplitAt(int split_x)
Definition: colpartition.cpp:823
tesseract::DPPoint::Solve
static DPPoint * Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size, DPPoint *points)
Definition: dppoint.cpp:47
BLOBNBOX::special_text_type
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:288
textord_debug_bugs
int textord_debug_bugs
Definition: alignedblob.cpp:28
tesseract::ColPartition::MarkAsLeaderIfMonospaced
bool MarkAsLeaderIfMonospaced()
Definition: colpartition.cpp:1083
tesseract::TabVector::sort_key
int sort_key() const
Definition: tabvector.h:157
TBOX::set_right
void set_right(int x)
Definition: rect.h:81
tesseract::ColPartition::SingletonPartner
ColPartition * SingletonPartner(bool upper)
Definition: colpartition.cpp:629
tesseract::ColumnSpanningType
ColumnSpanningType
Definition: colpartition.h:47
BLOCK
Definition: ocrblock.h:28
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
tesseract::kHorzStrongTextlineAspect
const int kHorzStrongTextlineAspect
Definition: colpartition.cpp:67
BLOBNBOX::base_char_bottom
int base_char_bottom() const
Definition: blobbox.h:385
PDBLK::set_poly_block
void set_poly_block(POLY_BLOCK *blk)
set the poly block
Definition: pdblock.h:56
tesseract::ColPartition::set_owns_blobs
void set_owns_blobs(bool owns_blobs)
Definition: colpartition.h:294
BlobRegionType
BlobRegionType
Definition: blobbox.h:71
tesseract::ColPartition::IsImageType
bool IsImageType() const
Definition: colpartition.h:429
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
TO_BLOCK::block
BLOCK * block
Definition: blobbox.h:776
tesseract::ColPartition::MakeBigPartition
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
Definition: colpartition.cpp:116
tesseract::AlignedBlob::WithinTestRegion
static bool WithinTestRegion(int detail_level, int x, int y)
Definition: alignedblob.cpp:150
PT_HEADING_TEXT
Definition: capi.h:110
tesseract::ColPartition::AddToWorkingSet
void AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, WorkingPartSet_LIST *working_set)
Definition: colpartition.cpp:1347
BRT_TEXT
Definition: blobbox.h:79
tesseract::kHorzStrongTextlineCount
const int kHorzStrongTextlineCount
Definition: colpartition.cpp:63
tesseract::ColPartition::MakeToRow
TO_ROW * MakeToRow()
Definition: colpartition.cpp:1706
tesseract::kHorzStrongTextlineHeight
const int kHorzStrongTextlineHeight
Definition: colpartition.cpp:65
tesseract::ColPartition::blob_type
BlobRegionType blob_type() const
Definition: colpartition.h:148
tesseract::ColPartition::MidY
int MidY() const
Definition: colpartition.h:304
tesseract::ColPartition::OKDiacriticMerge
bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const
Definition: colpartition.cpp:458
tesseract::ColPartition::set_side_step
void set_side_step(int step)
Definition: colpartition.h:217
TBOX::width
int16_t width() const
Definition: rect.h:114
tesseract::kMaxSpacingDrift
const double kMaxSpacingDrift
Definition: colpartition.cpp:43
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::ColPartition::MatchingColumns
bool MatchingColumns(const ColPartition &other) const
Definition: colpartition.cpp:370
BLOBNBOX::set_owner
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:354
tesseract::TabFind::DifferentSizes
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
BLOBNBOX::IsDiacritic
bool IsDiacritic() const
Definition: blobbox.h:379
tesseract::ColPartition::SetSpecialBlobsDensity
void SetSpecialBlobsDensity(const BlobSpecialTextType type, const float density)
Definition: colpartition.cpp:576
tesseract::ColPartition::owns_blobs
bool owns_blobs() const
Definition: colpartition.h:291
CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
tesseract
Definition: baseapi.h:65
BLOBNBOX::set_region_type
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:285
tesseract::ColPartition::ReflectInYAxis
void ReflectInYAxis()
Definition: colpartition.cpp:320
STATS::median
double median() const
Definition: statistc.cpp:218
tesseract::ColPartition::~ColPartition
~ColPartition()
Definition: colpartition.cpp:133
tesseract::kMaxRMSColorNoise
const int kMaxRMSColorNoise
Definition: colpartition.cpp:74
PT_VERT_LINE
Definition: capi.h:121
tesseract::ColPartition::FakePartition
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
Definition: colpartition.cpp:95
TO_ROW::add_blob
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:723
PT_UNKNOWN
Definition: capi.h:108
STATS
Definition: statistc.h:30
BLOBNBOX::bounding_box
const TBOX & bounding_box() const
Definition: blobbox.h:229
tesseract::ColPartition::set_flow
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
tesseract::ColPartition::SpecialBlobsCount
int SpecialBlobsCount(const BlobSpecialTextType type)
Definition: colpartition.cpp:561
tesseract::ColPartition::AddPartner
void AddPartner(bool upper, ColPartition *partner)
Definition: colpartition.cpp:603
tesseract::ColPartition::IsInSameColumnAs
bool IsInSameColumnAs(const ColPartition &part) const
Definition: colpartition.cpp:2175
tesseract::kMaxTopSpacingFraction
const double kMaxTopSpacingFraction
Definition: colpartition.cpp:46
STATS::mode
int32_t mode() const
Definition: statistc.cpp:100
tesseract::ColPartition::MakeBlock
static TO_BLOCK * MakeBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
Definition: colpartition.cpp:1623
tesseract::ColPartition::XAtY
int XAtY(int sort_key, int y) const
Definition: colpartition.h:320
tesseract::WorkingPartSet
Definition: workingpartset.h:32
tesseract::ColPartition::SetBlobTypes
void SetBlobTypes()
Definition: colpartition.cpp:1265
tesseract::ColPartition::SplitAtBlob
ColPartition * SplitAtBlob(BLOBNBOX *split_blob)
Definition: colpartition.cpp:787
tesseract::ColPartition::MatchingTextColor
bool MatchingTextColor(const ColPartition &other) const
Definition: colpartition.cpp:382
PT_PULLOUT_IMAGE
Definition: capi.h:119
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
tesseract::ColPartition::set_type
void set_type(PolyBlockType t)
Definition: colpartition.h:184
tesseract::ColPartition::IsEmpty
bool IsEmpty() const
Definition: colpartition.h:357
tesseract::ColPartition::SetColumnGoodness
void SetColumnGoodness(WidthCallback cb)
Definition: colpartition.cpp:1070
tesseract::ColPartition::SmoothPartnerRun
void SmoothPartnerRun(int working_set_count)
Definition: colpartition.cpp:1808
tesseract::ColPartition::BoundsWithoutBox
TBOX BoundsWithoutBox(BLOBNBOX *box)
Definition: colpartition.cpp:234
tesseract::ColPartition::bounding_box
const TBOX & bounding_box() const
Definition: colpartition.h:109
tesseract::ColPartition::RightBlobRule
int RightBlobRule() const
Definition: colpartition.cpp:550
tesseract::ColPartition::PartitionType
PolyBlockType PartitionType(ColumnSpanningType flow) const
Definition: colpartition.cpp:1006
tesseract::ColPartition::ComputeSpecialBlobsDensity
void ComputeSpecialBlobsDensity()
Definition: colpartition.cpp:582
count
int count(LIST var_list)
Definition: oldlist.cpp:79
tesseract::ColPartition::AddBox
void AddBox(BLOBNBOX *box)
Definition: colpartition.cpp:169
BLOBNBOX::flow
BlobTextFlowType flow() const
Definition: blobbox.h:294
tesseract::ColPartition::RightAtY
int RightAtY(int y) const
Definition: colpartition.h:344
tesseract::ColPartition::ComputeLimits
void ComputeLimits()
Definition: colpartition.cpp:861
tesseract::ColPartition::SetPartitionType
void SetPartitionType(int resolution, ColPartitionSet *columns)
Definition: colpartition.cpp:973
tesseract::ColPartition::RefinePartners
void RefinePartners(PolyBlockType type, bool get_desperate, ColPartitionGrid *grid)
Definition: colpartition.cpp:1877
imagefind.h
BRT_VLINE
Definition: blobbox.h:74
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::ColPartitionGrid
Definition: colpartitiongrid.h:32
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
tesseract::ColPartition::IsVerticalType
bool IsVerticalType() const
Definition: colpartition.h:441
tesseract::kMaxSizeRatio
const double kMaxSizeRatio
Definition: colpartition.cpp:51
PT_FLOWING_TEXT
Definition: capi.h:109
BLOBNBOX::region_type
BlobRegionType region_type() const
Definition: blobbox.h:282
tesseract::DetLineFit::Fit
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
TBOX::right
int16_t right() const
Definition: rect.h:78
tesseract::DetLineFit::Add
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
tesseract::ColPartition::DeleteBoxes
void DeleteBoxes()
Definition: colpartition.cpp:305
POLY_BLOCK::ColorForPolyBlockType
static ScrollView::Color ColorForPolyBlockType(PolyBlockType type)
Returns a color to draw the given type.
Definition: polyblk.cpp:392
tesseract::ColPartition::Absorb
void Absorb(ColPartition *other, WidthCallback cb)
Definition: colpartition.cpp:638
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ColPartition::CountOverlappingBoxes
int CountOverlappingBoxes(const TBOX &box)
Definition: colpartition.cpp:960
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
POLY_BLOCK
Definition: polyblk.h:26
tesseract::ColPartition::ConfirmNoTabViolation
bool ConfirmNoTabViolation(const ColPartition &other) const
Definition: colpartition.cpp:413
TO_ROW
Definition: blobbox.h:543
tesseract::ColPartition::ClaimBoxes
void ClaimBoxes()
Definition: colpartition.cpp:247
tesseract::ImageFind::ColorDistanceFromLine
static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, const uint8_t *point)
Definition: imagefind.cpp:355
tesseract::ColPartition::SortByBBox
static int SortByBBox(const void *p1, const void *p2)
Definition: colpartition.h:714
tesseract::ColPartition::SetLeftTab
void SetLeftTab(const TabVector *tab_vector)
Definition: colpartition.cpp:494
tesseract::ColPartition::set_left_margin
void set_left_margin(int margin)
Definition: colpartition.h:115
tesseract::ColPartition::VSignificantCoreOverlap
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:390
PT_PULLOUT_TEXT
Definition: capi.h:111
workingpartset.h
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::ColPartition::RemoveBox
void RemoveBox(BLOBNBOX *box)
Definition: colpartition.cpp:202
tesseract::ColPartition::MatchingStrokeWidth
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
Definition: colpartition.cpp:430
ScrollView::Color
Color
Definition: scrollview.h:100
UpdateRange
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:118
BLOBNBOX::cblob
C_BLOB * cblob() const
Definition: blobbox.h:267
BLOBNBOX::owner
tesseract::ColPartition * owner() const
Definition: blobbox.h:351
PolyBlockType
PolyBlockType
Definition: publictypes.h:52
tesseract::kMaxBaselineError
const double kMaxBaselineError
Definition: colpartition.cpp:70
tesseract::ColPartition::ColPartition
ColPartition()=default
tesseract::kMaxSameBlockLineSpacing
const double kMaxSameBlockLineSpacing
Definition: colpartition.cpp:49
ICOORDELT
Definition: points.h:160
tesseract::ColPartition::DisownBoxes
void DisownBoxes()
Definition: colpartition.cpp:263
tesseract::ColPartition::CopyButDontOwnBlobs
ColPartition * CopyButDontOwnBlobs()
Definition: colpartition.cpp:1758
tesseract::ColPartition::LeftAtY
int LeftAtY(int y) const
Definition: colpartition.h:340
tesseract::DPPoint::total_cost
int total_cost() const
Definition: dppoint.h:103
PT_HORZ_LINE
Definition: capi.h:120
tesseract::ColPartition::VCoreOverlap
int VCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:375
textord_debug_tabfind
int textord_debug_tabfind
Definition: alignedblob.cpp:27
tesseract::ColPartition::BiggestBox
BLOBNBOX * BiggestBox()
Definition: colpartition.cpp:215
tesseract::ColPartition::MakeVerticalTextBlock
static TO_BLOCK * MakeVerticalTextBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
Definition: colpartition.cpp:1680
colpartitiongrid.h
tesseract::ColPartition::BoxColor
ScrollView::Color BoxColor() const
Definition: colpartition.cpp:1771
BTFT_NEIGHBOURS
Definition: blobbox.h:116
tesseract::ColPartition::OKMergeOverlap
bool OKMergeOverlap(const ColPartition &merge1, const ColPartition &merge2, int ok_box_overlap, bool debug)
Definition: colpartition.cpp:736
tesseract::ColPartition::ShallowCopy
ColPartition * ShallowCopy() const
Definition: colpartition.cpp:1731
tesseract::ColPartition::MakeLinePartition
static ColPartition * MakeLinePartition(BlobRegionType blob_type, const ICOORD &vertical, int left, int bottom, int right, int top)
Definition: colpartition.cpp:148
tesseract::ColPartition::CopyRightTab
void CopyRightTab(const ColPartition &src, bool take_box)
Definition: colpartition.cpp:532
tesseract::ColPartition::LeftBlobRule
int LeftBlobRule() const
Definition: colpartition.cpp:545
TBOX::set_left
void set_left(int x)
Definition: rect.h:74
tesseract::ColPartition::DisownBoxesNoAssert
void DisownBoxesNoAssert()
Definition: colpartition.cpp:276
ICOORD::y
int16_t y() const
access_function
Definition: points.h:55
tesseract::CST_NOISE
Definition: colpartition.h:48
BSTT_COUNT
Definition: blobbox.h:102
TBOX
Definition: rect.h:33
tesseract::ColPartition::SetRightTab
void SetRightTab(const TabVector *tab_vector)
Definition: colpartition.cpp:506