tesseract  4.0.0-1-g2a2b
strokewidth.cpp
Go to the documentation of this file.
1 // File: strokewidth.cpp
3 // Description: Subclass of BBGrid to find uniformity of strokewidth.
4 // Author: Ray Smith
5 // Created: Mon Mar 31 16:17:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "strokewidth.h"
25 
26 #include <algorithm>
27 #include <cmath>
28 
29 #include "blobbox.h"
30 #include "colpartition.h"
31 #include "colpartitiongrid.h"
32 #include "imagefind.h"
33 #include "linlsq.h"
34 #include "statistc.h"
35 #include "tabfind.h"
36 #include "textlineprojection.h"
37 #include "tordmain.h" // For SetBlobStrokeWidth.
38 
39 namespace tesseract {
40 
41 INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
42 BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
43 
// Fractional stroke-width tolerance.
// NOTE(review): presumably the allowed proportional difference for two blobs
// to count as having the same stroke width; it is not used in the visible
// part of this file -- confirm at its call site.
45 const double kStrokeWidthFractionTolerance = 0.125;
// Constant stroke-width tolerance.
// NOTE(review): presumably an absolute allowance that complements the
// fractional one above; units are not visible here -- TODO confirm.
50 const double kStrokeWidthTolerance = 1.5;
51 // Same but for CJK we are a bit more generous.
52 const double kStrokeWidthFractionCJK = 0.25;
53 const double kStrokeWidthCJK = 2.0;
54 // Radius in grid cells of search for broken CJK. Doesn't need to be very
55 // large as the grid size should be about the size of a character anyway.
56 const int kCJKRadius = 2;
57 // Max distance fraction of size to join close but broken CJK characters.
58 const double kCJKBrokenDistanceFraction = 0.25;
59 // Max number of components in a broken CJK character.
60 const int kCJKMaxComponents = 8;
61 // Max aspect ratio of CJK broken characters when put back together.
62 const double kCJKAspectRatio = 1.25;
63 // Max increase in aspect ratio of CJK broken characters when merged.
64 const double kCJKAspectRatioIncrease = 1.0625;
65 // Max multiple of the grid size that will be used in computing median CJKsize.
66 const int kMaxCJKSizeRatio = 5;
67 // Min fraction of blobs broken CJK to iterate and run it again.
68 const double kBrokenCJKIterationFraction = 0.125;
69 // Multiple of gridsize as x-padding for a search box for diacritic base
70 // characters.
71 const double kDiacriticXPadRatio = 7.0;
72 // Multiple of gridsize as y-padding for a search box for diacritic base
73 // characters.
74 const double kDiacriticYPadRatio = 1.75;
75 // Min multiple of diacritic height that a neighbour must be to be a
76 // convincing base character.
77 const double kMinDiacriticSizeRatio = 1.0625;
78 // Max multiple of a textline's median height as a threshold for the sum of
79 // a diacritic's farthest x and y distances (gap + size).
80 const double kMaxDiacriticDistanceRatio = 1.25;
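81 // Max x-gap between a diacritic and its base char as a fraction of the height
82 // of the base char (allowing other blobs to fill the gap.)
// NOTE(review): the constant documented by the two lines above is not present
// in this listing (listing line 83 is missing). The comment that follows
// belongs to kLineTrapLongest, not to the diacritic gap.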
84 // Ratio between longest side of a line and longest side of a character.
85 // (neighbor_min > blob_min * kLineTrapShortest &&
86 // neighbor_max < blob_max / kLineTrapLongest)
87 // => neighbor is a grapheme and blob is a line.
88 const int kLineTrapLongest = 4;
89 // Ratio between shortest side of a line and shortest side of a character.
90 const int kLineTrapShortest = 2;
91 // Max aspect ratio of the total box before CountNeighbourGaps
92 // decides immediately based on the aspect ratio.
93 const int kMostlyOneDirRatio = 3;
94 // Aspect ratio for a blob to be considered as line residue.
95 const double kLineResidueAspectRatio = 8.0;
96 // Padding ratio for line residue search box.
97 const int kLineResiduePadRatio = 3;
98 // Min multiple of neighbour size for a line residue to be genuine.
99 const double kLineResidueSizeRatio = 1.75;
100 // Aspect ratio filter for OSD.
101 const float kSizeRatioToReject = 2.0;
102 // Expansion factor for search box for good neighbours.
103 const double kNeighbourSearchFactor = 2.5;
104 // Factor of increase of overlap when adding diacritics to make an image noisy.
105 const double kNoiseOverlapGrowthFactor = 4.0;
106 // Fraction of the image size to add overlap when adding diacritics for an
107 // image to qualify as noisy.
108 const double kNoiseOverlapAreaFactor = 1.0 / 512;
109 
111  const ICOORD& bleft, const ICOORD& tright)
112  : BlobGrid(gridsize, bleft, tright), nontext_map_(nullptr), projection_(nullptr),
113  denorm_(nullptr), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
114  leaders_win_ = nullptr;
115  widths_win_ = nullptr;
116  initial_widths_win_ = nullptr;
117  chains_win_ = nullptr;
118  diacritics_win_ = nullptr;
119  textlines_win_ = nullptr;
120  smoothed_win_ = nullptr;
121 }
122 
124  if (widths_win_ != nullptr) {
125  #ifndef GRAPHICS_DISABLED
126  delete widths_win_->AwaitEvent(SVET_DESTROY);
127  #endif // GRAPHICS_DISABLED
129  exit(0);
130  delete widths_win_;
131  }
132  delete leaders_win_;
133  delete initial_widths_win_;
134  delete chains_win_;
135  delete textlines_win_;
136  delete smoothed_win_;
137  delete diacritics_win_;
138 }
139 
140 // Sets the neighbours member of the medium-sized blobs in the block.
141 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
142 // blobs and sets pointers to the good neighbours.
144  // Run a preliminary strokewidth neighbour detection on the medium blobs.
145  InsertBlobList(&block->blobs);
146  BLOBNBOX_IT blob_it(&block->blobs);
147  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
148  SetNeighbours(false, false, blob_it.data());
149  }
150  Clear();
151 }
152 
153 // Sets the neighbour/textline writing direction members of the medium
154 // and large blobs with optional repair of broken CJK characters first.
155 // Repair of broken CJK is needed here because broken CJK characters
156 // can fool the textline direction detection algorithm.
158  bool cjk_merge,
159  TO_BLOCK* input_block) {
160  // Setup the grid with the remaining (non-noise) blobs.
161  InsertBlobs(input_block);
162  // Repair broken CJK characters if needed.
163  while (cjk_merge && FixBrokenCJK(input_block));
164  // Grade blobs by inspection of neighbours.
165  FindTextlineFlowDirection(pageseg_mode, false);
166  // Clear the grid ready for rotation or leader finding.
167  Clear();
168 }
169 
170 // Helper to collect and count horizontal and vertical blobs from a list.
171 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
172  int* num_vertical_blobs,
173  int* num_horizontal_blobs,
174  BLOBNBOX_CLIST* vertical_blobs,
175  BLOBNBOX_CLIST* horizontal_blobs,
176  BLOBNBOX_CLIST* nondescript_blobs) {
177  BLOBNBOX_C_IT v_it(vertical_blobs);
178  BLOBNBOX_C_IT h_it(horizontal_blobs);
179  BLOBNBOX_C_IT n_it(nondescript_blobs);
180  BLOBNBOX_IT blob_it(input_blobs);
181  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
182  BLOBNBOX* blob = blob_it.data();
183  const TBOX& box = blob->bounding_box();
184  float y_x = static_cast<float>(box.height()) / box.width();
185  float x_y = 1.0f / y_x;
186  // Select a >= 1.0 ratio
187  float ratio = x_y > y_x ? x_y : y_x;
188  // If the aspect ratio is small and we want them for osd, save the blob.
189  bool ok_blob = ratio <= kSizeRatioToReject;
190  if (blob->UniquelyVertical()) {
191  ++*num_vertical_blobs;
192  if (ok_blob) v_it.add_after_then_move(blob);
193  } else if (blob->UniquelyHorizontal()) {
194  ++*num_horizontal_blobs;
195  if (ok_blob) h_it.add_after_then_move(blob);
196  } else if (ok_blob) {
197  n_it.add_after_then_move(blob);
198  }
199  }
200 }
201 
202 
203 // Types all the blobs as vertical or horizontal text or unknown and
204 // returns true if the majority are vertical.
205 // If the blobs are rotated, it is necessary to call CorrectForRotation
206 // after rotating everything, otherwise the work done here will be enough.
207 // If osd_blobs is not null, a list of blobs from the dominant textline
208 // direction are returned for use in orientation and script detection.
// Counts the uniquely-vertical and uniquely-horizontal blobs in block->blobs
// and block->large_blobs and returns true when the vertical count reaches
// find_vertical_text_ratio of their sum (truncated to int).
// If osd_blobs is non-null it receives the collected blobs of the winning
// direction, or the nondescript ones when no blob has a unique direction.
209 bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
210  TO_BLOCK* block,
211  BLOBNBOX_CLIST* osd_blobs) {
212  int vertical_boxes = 0;
213  int horizontal_boxes = 0;
214  // Count vertical normal and large blobs.
215  BLOBNBOX_CLIST vertical_blobs;
216  BLOBNBOX_CLIST horizontal_blobs;
217  BLOBNBOX_CLIST nondescript_blobs;
  // Both calls accumulate into the same counters and the same three lists.
218  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
219  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
220  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
221  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
  // NOTE(review): as shown, this print is unconditional. Listing line 222 is
  // absent, so a guarding debug condition may have been lost -- confirm
  // against the upstream source.
223  tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
224  horizontal_boxes, vertical_boxes,
225  horizontal_blobs.length(), vertical_blobs.length(),
226  nondescript_blobs.length());
227  if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
228  // Only nondescript blobs available, so return those.
229  BLOBNBOX_C_IT osd_it(osd_blobs);
230  osd_it.add_list_after(&nondescript_blobs);
231  return false;
232  }
  // The cast truncates, so the threshold is rounded towards zero.
233  int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
234  find_vertical_text_ratio);
  // The comparison is >=, so when both counts are zero (only reachable here
  // with osd_blobs == nullptr) the threshold is 0 and the result is true.
235  if (vertical_boxes >= min_vert_boxes) {
236  if (osd_blobs != nullptr) {
237  BLOBNBOX_C_IT osd_it(osd_blobs);
238  osd_it.add_list_after(&vertical_blobs);
239  }
240  return true;
241  } else {
242  if (osd_blobs != nullptr) {
243  BLOBNBOX_C_IT osd_it(osd_blobs);
244  osd_it.add_list_after(&horizontal_blobs);
245  }
246  return false;
247  }
248 }
249 
250 // Corrects the data structures for the given rotation.
252  ColPartitionGrid* part_grid) {
253  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
254  grid_box_ = TBOX(bleft(), tright());
255  rerotation_.set_x(rotation.x());
256  rerotation_.set_y(-rotation.y());
257 }
258 
259 // Finds leader partitions and inserts them into the given part_grid.
261  ColPartitionGrid* part_grid) {
262  Clear();
263  // Find and isolate leaders in the noise list.
264  ColPartition_LIST leader_parts;
265  FindLeadersAndMarkNoise(block, &leader_parts);
266  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
267  InsertBlobList(&block->blobs);
268  // Mark blobs that have leader neighbours.
269  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
270  ColPartition* part = it.extract();
271  part->ClaimBoxes();
272  MarkLeaderNeighbours(part, LR_LEFT);
273  MarkLeaderNeighbours(part, LR_RIGHT);
274  part_grid->InsertBBox(true, true, part);
275  }
276 }
277 
278 // Finds and marks noise those blobs that look like bits of vertical lines
279 // that would otherwise screw up layout analysis.
280 void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
281  BlobGridSearch gsearch(this);
282  BLOBNBOX* bbox;
283  // For every vertical line-like bbox in the grid, search its neighbours
284  // to find the tallest, and if the original box is taller by sufficient
285  // margin, then call it line residue and delete it.
286  gsearch.StartFullSearch();
287  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
288  TBOX box = bbox->bounding_box();
289  if (box.height() < box.width() * kLineResidueAspectRatio)
290  continue;
291  // Set up a rectangle search around the blob to find the size of its
292  // neighbours.
293  int padding = box.height() * kLineResiduePadRatio;
294  TBOX search_box = box;
295  search_box.pad(padding, padding);
296  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
297  box.bottom());
298  // Find the largest object in the search box not equal to bbox.
299  BlobGridSearch rsearch(this);
300  int max_height = 0;
301  BLOBNBOX* n;
302  rsearch.StartRectSearch(search_box);
303  while ((n = rsearch.NextRectSearch()) != nullptr) {
304  if (n == bbox) continue;
305  TBOX nbox = n->bounding_box();
306  if (nbox.height() > max_height) {
307  max_height = nbox.height();
308  }
309  }
310  if (debug) {
311  tprintf("Max neighbour size=%d for candidate line box at:", max_height);
312  box.print();
313  }
314  if (max_height * kLineResidueSizeRatio < box.height()) {
315  #ifndef GRAPHICS_DISABLED
316  if (leaders_win_ != nullptr) {
317  // We are debugging, so display deleted in pink blobs in the same
318  // window that we use to display leader detection.
319  leaders_win_->Pen(ScrollView::PINK);
320  leaders_win_->Rectangle(box.left(), box.bottom(),
321  box.right(), box.top());
322  }
323  #endif // GRAPHICS_DISABLED
324  ColPartition::MakeBigPartition(bbox, big_part_list);
325  }
326  }
327 }
328 
329 // Types all the blobs as vertical text or horizontal text or unknown and
330 // puts them into initial ColPartitions in the supplied part_grid.
331 // rerotation determines how to get back to the image coordinates from the
332 // blob coordinates (since they may have been rotated for vertical text).
333 // block is the single block for the whole page or rectangle to be OCRed.
334 // nontext_pix (full-size), is a binary mask used to prevent merges across
335 // photo/text boundaries. It is not kept beyond this function.
336 // denorm provides a mapping back to the image from the current blob
337 // coordinate space.
338 // projection provides a measure of textline density over the image and
339 // provides functions to assist with diacritic detection. It should be a
340 // pointer to a new TextlineProjection, and will be setup here.
341 // part_grid is the output grid of textline partitions.
342 // Large blobs that cause overlap are put in separate partitions and added
343 // to the big_parts list.
345  PageSegMode pageseg_mode, const FCOORD& rerotation, TO_BLOCK* block,
346  Pix* nontext_pix, const DENORM* denorm, bool cjk_script,
347  TextlineProjection* projection, BLOBNBOX_LIST* diacritic_blobs,
348  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) {
349  nontext_map_ = nontext_pix;
350  projection_ = projection;
351  denorm_ = denorm;
352  // Clear and re Insert to take advantage of the tab stops in the blobs.
353  Clear();
354  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
355  InsertBlobs(block);
356 
357  // Run FixBrokenCJK() again if the page is CJK.
358  if (cjk_script) {
359  FixBrokenCJK(block);
360  }
361  FindTextlineFlowDirection(pageseg_mode, false);
362  projection_->ConstructProjection(block, rerotation, nontext_map_);
364  ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
365  projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
366  projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
367  }
368  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
369  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
370  // Clear and re Insert to take advantage of the removed diacritics.
371  Clear();
372  InsertBlobs(block);
373  FCOORD skew;
374  FindTextlineFlowDirection(pageseg_mode, true);
376  FindInitialPartitions(pageseg_mode, rerotation, true, block,
377  diacritic_blobs, part_grid, big_parts, &skew);
378  if (r == PFR_NOISE) {
379  tprintf("Detected %d diacritics\n", diacritic_blobs->length());
380  // Noise was found, and removed.
381  Clear();
382  InsertBlobs(block);
383  FindTextlineFlowDirection(pageseg_mode, true);
384  r = FindInitialPartitions(pageseg_mode, rerotation, false, block,
385  diacritic_blobs, part_grid, big_parts, &skew);
386  }
387  nontext_map_ = nullptr;
388  projection_ = nullptr;
389  denorm_ = nullptr;
390 }
391 
392 static void PrintBoxWidths(BLOBNBOX* neighbour) {
393  const TBOX& nbox = neighbour->bounding_box();
394  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
395  nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
396  neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
397  2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
398 }
399 
// Debug click handler: finds the first blob near (x, y) whose bounding box
// contains the clicked point and which has a cblob, then prints the stroke
// widths of that blob and of its four neighbours (where set), followed by its
// neighbour gaps, direction flags and good-stroke-neighbour flags.
401 void StrokeWidth::HandleClick(int x, int y) {
  // NOTE(review): listing line 402 is absent here; confirm against the
  // upstream source whether a statement was dropped at this point.
403  // Run a radial search for blobs that overlap.
404  BlobGridSearch radsearch(this);
405  radsearch.StartRadSearch(x, y, 1);
406  BLOBNBOX* neighbour;
407  FCOORD click(static_cast<float>(x), static_cast<float>(y));
408  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
409  TBOX nbox = neighbour->bounding_box();
  // PrintBoxWidths dereferences cblob(), hence the null check on the hit.
410  if (nbox.contains(click) && neighbour->cblob() != nullptr) {
411  PrintBoxWidths(neighbour);
  // NOTE(review): the neighbours below are printed without checking their
  // cblob() for null -- verify that this cannot happen.
412  if (neighbour->neighbour(BND_LEFT) != nullptr)
413  PrintBoxWidths(neighbour->neighbour(BND_LEFT));
414  if (neighbour->neighbour(BND_RIGHT) != nullptr)
415  PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
416  if (neighbour->neighbour(BND_ABOVE) != nullptr)
417  PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
418  if (neighbour->neighbour(BND_BELOW) != nullptr)
419  PrintBoxWidths(neighbour->neighbour(BND_BELOW));
420  int gaps[BND_COUNT];
421  neighbour->NeighbourGaps(gaps);
422  tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
423  "Good= %d %d %d %d\n",
424  gaps[BND_LEFT], gaps[BND_RIGHT],
425  gaps[BND_ABOVE], gaps[BND_BELOW],
426  neighbour->horz_possible(),
427  neighbour->vert_possible(),
428  neighbour->good_stroke_neighbour(BND_LEFT),
429  neighbour->good_stroke_neighbour(BND_RIGHT),
430  neighbour->good_stroke_neighbour(BND_ABOVE),
431  neighbour->good_stroke_neighbour(BND_BELOW));
  // Only the first matching blob is reported.
432  break;
433  }
434  }
435 }
436 
437 // Detects and marks leader dots/dashes.
438 // Leaders are horizontal chains of small or noise blobs that look
439 // monospace according to ColPartition::MarkAsLeaderIfMonospaced().
440 // Detected leaders become the only occupants of the block->small_blobs list.
441 // Non-leader small blobs get moved to the blobs list.
442 // Non-leader noise blobs remain singletons in the noise list.
443 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
444 // block is the single block for the whole page or rectangle to be OCRed.
445 // leader_parts is the output.
// Inserts the small and noise blobs of block into the grid, links them to
// their neighbours, builds horizontal chains of BTFT_NONE blobs into
// candidate partitions and keeps those accepted by
// ColPartition::MarkAsLeaderIfMonospaced() in leader_parts. Afterwards the
// blob lists of block are reshuffled as described in the steps below and the
// grid is cleared.
446 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
447  ColPartition_LIST* leader_parts) {
448  InsertBlobList(&block->small_blobs);
449  InsertBlobList(&block->noise_blobs);
450  BlobGridSearch gsearch(this);
451  BLOBNBOX* bbox;
452  // For every bbox in the grid, set its neighbours.
453  gsearch.StartFullSearch();
454  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
455  SetNeighbours(true, false, bbox);
456  }
457  ColPartition_IT part_it(leader_parts);
  // Second pass over the grid: grow a chain from each still-unclassified blob.
458  gsearch.StartFullSearch();
459  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
460  if (bbox->flow() == BTFT_NONE) {
  // A blob with neither a left nor a right neighbour cannot start a chain.
461  if (bbox->neighbour(BND_RIGHT) == nullptr &&
462  bbox->neighbour(BND_LEFT) == nullptr)
463  continue;
464  // Put all the linked blobs into a ColPartition.
465  ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
466  BLOBNBOX* blob;
  // Walk rightwards starting at bbox itself, then leftwards starting at its
  // left neighbour; each walk stops at the first blob whose flow is no
  // longer BTFT_NONE.
467  for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
468  blob = blob->neighbour(BND_RIGHT))
469  part->AddBox(blob);
470  for (blob = bbox->neighbour(BND_LEFT); blob != nullptr &&
471  blob->flow() == BTFT_NONE;
472  blob = blob->neighbour(BND_LEFT))
473  part->AddBox(blob);
  // Accepted partitions go to the output list; rejected ones are freed here.
474  if (part->MarkAsLeaderIfMonospaced())
475  part_it.add_after_then_move(part);
476  else
477  delete part;
478  }
479  }
  // NOTE(review): the closing brace on listing line 482 has no visible
  // opening. Listing line 480 is absent and presumably held the condition
  // guarding this debug display -- confirm against the upstream source.
481  leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
482  }
483  // Move any non-leaders from the small to the blobs list, as they are
484  // most likely dashes or broken characters.
485  BLOBNBOX_IT blob_it(&block->blobs);
486  BLOBNBOX_IT small_it(&block->small_blobs);
487  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
488  BLOBNBOX* blob = small_it.data();
489  if (blob->flow() != BTFT_LEADER) {
  // Reset the temporary neighbour state before the blob changes list.
490  if (blob->flow() == BTFT_NEIGHBOURS)
491  blob->set_flow(BTFT_NONE);
492  blob->ClearNeighbours();
493  blob_it.add_to_end(small_it.extract());
494  }
495  }
496  // Move leaders from the noise list to the small list, leaving the small
497  // list exclusively leaders, so they don't get processed further,
498  // and the remaining small blobs all in the noise list.
499  BLOBNBOX_IT noise_it(&block->noise_blobs);
500  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
501  BLOBNBOX* blob = noise_it.data();
  // Blobs flagged joined_to_prev() are moved along with the leaders.
502  if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
503  small_it.add_to_end(noise_it.extract());
504  } else if (blob->flow() == BTFT_NEIGHBOURS) {
505  blob->set_flow(BTFT_NONE);
506  blob->ClearNeighbours();
507  }
508  }
509  // Clear the grid as we don't want the small stuff hanging around in it.
510  Clear();
511 }
512 
515 void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
516  InsertBlobList(&block->blobs);
517  InsertBlobList(&block->large_blobs);
518 }
519 
520 // Checks the left or right side of the given leader partition and sets the
521 // (opposite) leader_on_right or leader_on_left flags for blobs
522 // that are next to the given side of the given leader partition.
523 void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
524  LeftOrRight side) {
525  const TBOX& part_box = part->bounding_box();
526  BlobGridSearch blobsearch(this);
527  // Search to the side of the leader for the nearest neighbour.
528  BLOBNBOX* best_blob = nullptr;
529  int best_gap = 0;
530  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
531  : part_box.right(),
532  part_box.bottom(), part_box.top());
533  BLOBNBOX* blob;
534  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
535  const TBOX& blob_box = blob->bounding_box();
536  if (!blob_box.y_overlap(part_box))
537  continue;
538  int x_gap = blob_box.x_gap(part_box);
539  if (x_gap > 2 * gridsize()) {
540  break;
541  } else if (best_blob == nullptr || x_gap < best_gap) {
542  best_blob = blob;
543  best_gap = x_gap;
544  }
545  }
546  if (best_blob != nullptr) {
547  if (side == LR_LEFT)
548  best_blob->set_leader_on_right(true);
549  else
550  best_blob->set_leader_on_left(true);
551  #ifndef GRAPHICS_DISABLED
552  if (leaders_win_ != nullptr) {
553  leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
554  const TBOX& blob_box = best_blob->bounding_box();
555  leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
556  blob_box.right(), blob_box.top());
557  }
558  #endif // GRAPHICS_DISABLED
559  }
560 }
561 
562 // Helper to compute the UQ of the square-ish CJK characters.
563 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
564  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
565  BLOBNBOX_IT it(blobs);
566  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
567  BLOBNBOX* blob = it.data();
568  int width = blob->bounding_box().width();
569  int height = blob->bounding_box().height();
570  if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
571  sizes.add(height, 1);
572  }
573  return static_cast<int>(sizes.ile(0.75f) + 0.5);
574 }
575 
576 // Fix broken CJK characters, using the fake joined blobs mechanism.
577 // Blobs are really merged, ie the master takes all the outlines and the
578 // others are deleted.
579 // Returns true if sufficient blobs are merged that it may be worth running
580 // again, due to a better estimate of character size.
// For every blob in block->blobs that has outlines, gathers nearby merge
// candidates with AccumulateOverlaps(), rejects the group if the combined box
// is not roughly square, if there are kCJKMaxComponents or more candidates,
// or if a stroke-width check fails, and otherwise merges all candidates into
// the blob. Returns true when the number of merges exceeds
// kBrokenCJKIterationFraction of the blobs that still have outlines.
581 bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
582  BLOBNBOX_LIST* blobs = &block->blobs;
  // Despite the name, this is the value returned by UpperQuartileCJKSize().
583  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
584  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
585  int max_height = static_cast<int>(median_height * kCJKAspectRatio);
586  int num_fixed = 0;
587  BLOBNBOX_IT blob_it(blobs);
588 
589  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
590  BLOBNBOX* blob = blob_it.data();
  // Skip blobs with no cblob or no outlines; blobs merged away earlier in
  // this pass are in that state.
591  if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty())
592  continue;
593  TBOX bbox = blob->bounding_box();
594  bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
595  bbox.bottom());
596  if (debug) {
597  tprintf("Checking for Broken CJK (max size=%d):", max_height);
598  bbox.print();
599  }
600  // Generate a list of blobs that overlap or are near enough to merge.
601  BLOBNBOX_CLIST overlapped_blobs;
  // bbox is a local copy; AccumulateOverlaps() grows it to the union of the
  // blob and every accepted candidate.
602  AccumulateOverlaps(blob, debug, max_height, max_dist,
603  &bbox, &overlapped_blobs);
604  if (!overlapped_blobs.empty()) {
605  // There are overlapping blobs, so qualify them as being satisfactory
606  // before removing them from the grid and replacing them with the union.
607  // The final box must be roughly square.
608  if (bbox.width() > bbox.height() * kCJKAspectRatio ||
609  bbox.height() > bbox.width() * kCJKAspectRatio) {
610  if (debug) {
611  tprintf("Bad final aspectratio:");
612  bbox.print();
613  }
614  continue;
615  }
616  // There can't be too many blobs to merge.
617  if (overlapped_blobs.length() >= kCJKMaxComponents) {
618  if (debug)
619  tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
620  continue;
621  }
622  // The strokewidths must match amongst the join candidates.
623  BLOBNBOX_C_IT n_it(&overlapped_blobs);
624  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
625  BLOBNBOX* neighbour = nullptr;
626  neighbour = n_it.data();
  // NOTE(review): the call below is truncated. Listing line 628, which
  // should hold its remaining argument(s) and closing parentheses, is
  // absent -- restore it from the upstream source.
627  if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
629  break;
630  }
  // Not having cycled the list means the loop above was left via break,
  // i.e. some candidate failed the stroke-width check.
631  if (!n_it.cycled_list()) {
632  if (debug) {
633  tprintf("Bad stroke widths:");
634  PrintBoxWidths(blob);
635  }
636  continue; // Not good enough.
637  }
638 
639  // Merge all the candidates into blob.
640  // We must remove blob from the grid and reinsert it after merging
641  // to maintain the integrity of the grid.
642  RemoveBBox(blob);
643  // Everything else will be calculated later.
644  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
645  BLOBNBOX* neighbour = n_it.data();
646  RemoveBBox(neighbour);
647  // Mark empty blob for deletion.
648  neighbour->set_region_type(BRT_NOISE);
649  blob->really_merge(neighbour);
  // Only applied when rerotation_ differs from (1, 0).
650  if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
651  blob->rotate_box(rerotation_);
652  }
653  }
654  InsertBBox(true, true, blob);
655  ++num_fixed;
656  if (debug) {
  // Prints the accumulated union box, not blob->bounding_box().
657  tprintf("Done! Final box:");
658  bbox.print();
659  }
660  }
661  }
662  // Count remaining blobs.
663  int num_remaining = 0;
664  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
665  BLOBNBOX* blob = blob_it.data();
666  if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
667  ++num_remaining;
668  }
669  }
670  // Permanently delete all the marked blobs after first removing all
671  // references in the neighbour members.
672  block->DeleteUnownedNoise();
673  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
674 }
675 
676 // Helper function to determine whether it is reasonable to merge the
677 // bbox and the nbox for repairing broken CJK.
678 // The distance apart must not exceed max_dist, the combined size must
679 // not exceed max_size, and the aspect ratio must either improve or at
680 // least not get worse by much.
681 static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
682  bool debug, int max_size, int max_dist,
683  int* x_gap, int* y_gap) {
684  *x_gap = bbox.x_gap(nbox);
685  *y_gap = bbox.y_gap(nbox);
686  TBOX merged(nbox);
687  merged += bbox;
688  if (debug) {
689  tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
690  merged.print();
691  }
692  if (*x_gap <= max_dist && *y_gap <= max_dist &&
693  merged.width() <= max_size && merged.height() <= max_size) {
694  // Close enough to call overlapping. Check aspect ratios.
695  double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
696  if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
697  double new_ratio = static_cast<double>(merged.width()) / merged.height();
698  if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
699  if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
700  return true;
701  }
702  return false;
703 }
704 
705 // Collect blobs that overlap or are within max_dist of the input bbox.
706 // Return them in the list of blobs and expand the bbox to be the union
707 // of all the boxes. not_this is excluded from the search, as are blobs
708 // that cause the merged box to exceed max_size in either dimension.
// Radially searches around the centre of *bbox for blobs that
// AcceptableCJKMerge() accepts, adding each to blobs and growing *bbox to
// include it. Rejected blobs are remembered as the nearest failure per
// direction; if the final *bbox overlaps any of them, blobs is emptied.
709 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
710  int max_size, int max_dist,
711  TBOX* bbox, BLOBNBOX_CLIST* blobs) {
712  // While searching, nearests holds the nearest failed blob in each
713  // direction. When we have a nearest in each of the 4 directions, then
714  // the search is over, and at this point the final bbox must not overlap
715  // any of the nearests.
716  BLOBNBOX* nearests[BND_COUNT];
717  for (int i = 0; i < BND_COUNT; ++i) {
718  nearests[i] = nullptr;
719  }
  // The search is centred on the box as it is on entry; the centre is not
  // updated as the box grows.
720  int x = (bbox->left() + bbox->right()) / 2;
721  int y = (bbox->bottom() + bbox->top()) / 2;
722  // Run a radial search for blobs that overlap or are sufficiently close.
723  BlobGridSearch radsearch(this);
724  radsearch.StartRadSearch(x, y, kCJKRadius);
725  BLOBNBOX* neighbour;
726  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
727  if (neighbour == not_this) continue;
728  TBOX nbox = neighbour->bounding_box();
  // AcceptableCJKMerge() writes both gaps before it decides, so they are
  // valid in the else-if branches below even when it returns false.
729  int x_gap, y_gap;
730  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
731  &x_gap, &y_gap)) {
732  // Close enough to call overlapping. Merge boxes.
733  *bbox += nbox;
734  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
735  if (debug) {
736  tprintf("Added:");
737  nbox.print();
738  }
739  // Since we merged, search the nearests, as some might now be mergeable.
740  for (int dir = 0; dir < BND_COUNT; ++dir) {
741  if (nearests[dir] == nullptr) continue;
742  nbox = nearests[dir]->bounding_box();
743  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
744  max_dist, &x_gap, &y_gap)) {
745  // Close enough to call overlapping. Merge boxes.
746  *bbox += nbox;
747  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
748  if (debug) {
749  tprintf("Added:");
750  nbox.print();
751  }
752  nearests[dir] = nullptr;
  // The loop's ++dir brings this back to 0, so every remaining nearest
  // is re-tested against the enlarged box.
753  dir = -1; // Restart the search.
754  }
755  }
756  } else if (x_gap < 0 && x_gap <= y_gap) {
757  // A vertical neighbour. Record the nearest.
758  BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
759  if (nearests[dir] == nullptr ||
760  y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
761  nearests[dir] = neighbour;
762  }
763  } else if (y_gap < 0 && y_gap <= x_gap) {
764  // A horizontal neighbour. Record the nearest.
765  BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
766  if (nearests[dir] == nullptr ||
767  x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
768  nearests[dir] = neighbour;
769  }
770  }
771  // If all nearests are non-null, then we have finished.
772  if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
773  nearests[BND_ABOVE] && nearests[BND_BELOW])
774  break;
775  }
776  // Final overlap with a nearest is not allowed.
777  for (int dir = 0; dir < BND_COUNT; ++dir) {
778  if (nearests[dir] == nullptr) continue;
779  const TBOX& nbox = nearests[dir]->bounding_box();
780  if (debug) {
781  tprintf("Testing for overlap with:");
782  nbox.print();
783  }
784  if (bbox->overlap(nbox)) {
  // Empty the output list so the caller sees no merge candidates; *bbox
  // is left as expanded.
785  blobs->shallow_clear();
786  if (debug)
787  tprintf("Final box overlaps nearest\n");
788  return;
789  }
790  }
791 }
792 
793 // For each blob in this grid, Finds the textline direction to be horizontal
794 // or vertical according to distance to neighbours and 1st and 2nd order
795 // neighbours. Non-text tends to end up without a definite direction.
796 // Result is setting of the neighbours and vert_possible/horz_possible
797 // flags in the BLOBNBOXes currently in this grid.
798 // This function is called more than once if page orientation is uncertain,
799 // so display_if_debugging is true on the final call to display the results.
800 void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode,
801  bool display_if_debugging) {
802  BlobGridSearch gsearch(this);
803  BLOBNBOX* bbox;
804  // For every bbox in the grid, set its neighbours.
805  gsearch.StartFullSearch();
806  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
807  SetNeighbours(false, display_if_debugging, bbox);
808  }
809  // Where vertical or horizontal wins by a big margin, clarify it.
810  gsearch.StartFullSearch();
811  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
812  SimplifyObviousNeighbours(bbox);
813  }
814  // Now try to make the blobs only vertical or horizontal using neighbours.
815  gsearch.StartFullSearch();
816  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
817  if (FindingVerticalOnly(pageseg_mode)) {
818  bbox->set_vert_possible(true);
819  bbox->set_horz_possible(false);
820  } else if (FindingHorizontalOnly(pageseg_mode)) {
821  bbox->set_vert_possible(false);
822  bbox->set_horz_possible(true);
823  } else {
824  SetNeighbourFlows(bbox);
825  }
826  }
827  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
829  initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
830  }
831  // Improve flow direction with neighbours.
832  gsearch.StartFullSearch();
833  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
834  SmoothNeighbourTypes(pageseg_mode, false, bbox);
835  }
836  // Now allow reset of firm values to fix renegades.
837  gsearch.StartFullSearch();
838  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
839  SmoothNeighbourTypes(pageseg_mode, true, bbox);
840  }
841  // Repeat.
842  gsearch.StartFullSearch();
843  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
844  SmoothNeighbourTypes(pageseg_mode, true, bbox);
845  }
846  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
848  widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
849  }
850 }
851 
852 // Sets the neighbours and good_stroke_neighbours members of the blob by
853 // searching close on all 4 sides.
854 // When finding leader dots/dashes, there is a slightly different rule for
855 // what makes a good neighbour.
856 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
857  BLOBNBOX* blob) {
858  int line_trap_count = 0;
859  for (int dir = 0; dir < BND_COUNT; ++dir) {
860  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
861  line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
862  }
863  if (line_trap_count > 0 && activate_line_trap) {
864  // It looks like a line so isolate it by clearing its neighbours.
865  blob->ClearNeighbours();
866  const TBOX& box = blob->bounding_box();
867  blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
868  }
869 }
870 
871 
872 // Sets the good_stroke_neighbours member of the blob if it has a
873 // GoodNeighbour on the given side.
874 // Also sets the neighbour in the blob, whether or not a good one is found.
875 // Returns the number of blobs in the nearby search area that would lead us to
876 // believe that this blob is a line separator.
877 // Leaders get extra special lenient treatment.
878 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
879  BLOBNBOX* blob) {
880  // Search for neighbours that overlap vertically.
881  TBOX blob_box = blob->bounding_box();
882  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
883  blob_box.bottom());
884  if (debug) {
885  tprintf("FGN in dir %d for blob:", dir);
886  blob_box.print();
887  }
888  int top = blob_box.top();
889  int bottom = blob_box.bottom();
890  int left = blob_box.left();
891  int right = blob_box.right();
892  int width = right - left;
893  int height = top - bottom;
894 
895  // A trap to detect lines tests for the min dimension of neighbours
896  // being larger than a multiple of the min dimension of the line
897  // and the larger dimension being smaller than a fraction of the max
898  // dimension of the line.
899  int line_trap_max = std::max(width, height) / kLineTrapLongest;
900  int line_trap_min = std::min(width, height) * kLineTrapShortest;
901  int line_trap_count = 0;
902 
903  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
904  ? height / 2 : width / 2;
905  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
906  ? height / 3 : width / 3;
907  if (leaders)
908  min_good_overlap = min_decent_overlap = 1;
909 
910  int search_pad = static_cast<int>(
911  sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
912  if (gridsize() > search_pad)
913  search_pad = gridsize();
914  TBOX search_box = blob_box;
915  // Pad the search in the appropriate direction.
916  switch (dir) {
917  case BND_LEFT:
918  search_box.set_left(search_box.left() - search_pad);
919  break;
920  case BND_RIGHT:
921  search_box.set_right(search_box.right() + search_pad);
922  break;
923  case BND_BELOW:
924  search_box.set_bottom(search_box.bottom() - search_pad);
925  break;
926  case BND_ABOVE:
927  search_box.set_top(search_box.top() + search_pad);
928  break;
929  case BND_COUNT:
930  return 0;
931  }
932 
933  BlobGridSearch rectsearch(this);
934  rectsearch.StartRectSearch(search_box);
935  BLOBNBOX* best_neighbour = nullptr;
936  double best_goodness = 0.0;
937  bool best_is_good = false;
938  BLOBNBOX* neighbour;
939  while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {
940  TBOX nbox = neighbour->bounding_box();
941  if (neighbour == blob)
942  continue;
943  int mid_x = (nbox.left() + nbox.right()) / 2;
944  if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
945  continue; // In a different column.
946  if (debug) {
947  tprintf("Neighbour at:");
948  nbox.print();
949  }
950 
951  // Last-minute line detector. There is a small upper limit to the line
952  // width accepted by the morphological line detector.
953  int n_width = nbox.width();
954  int n_height = nbox.height();
955  if (std::min(n_width, n_height) > line_trap_min &&
956  std::max(n_width, n_height) < line_trap_max)
957  ++line_trap_count;
958  // Heavily joined text, such as Arabic may have very different sizes when
959  // looking at the maxes, but the heights may be almost identical, so check
960  // for a difference in height if looking sideways or width vertically.
961  if (TabFind::VeryDifferentSizes(std::max(n_width, n_height),
962  std::max(width, height)) &&
963  (((dir == BND_LEFT || dir ==BND_RIGHT) &&
964  TabFind::DifferentSizes(n_height, height)) ||
965  ((dir == BND_BELOW || dir ==BND_ABOVE) &&
966  TabFind::DifferentSizes(n_width, width)))) {
967  if (debug) tprintf("Bad size\n");
968  continue; // Could be a different font size or non-text.
969  }
970  // Amount of vertical overlap between the blobs.
971  int overlap;
972  // If the overlap is along the short side of the neighbour, and it
973  // is fully overlapped, then perp_overlap holds the length of the long
974  // side of the neighbour. A measure to include hyphens and dashes as
975  // legitimate neighbours.
976  int perp_overlap;
977  int gap;
978  if (dir == BND_LEFT || dir == BND_RIGHT) {
979  overlap = std::min(static_cast<int>(nbox.top()), top) - std::max(static_cast<int>(nbox.bottom()), bottom);
980  if (overlap == nbox.height() && nbox.width() > nbox.height())
981  perp_overlap = nbox.width();
982  else
983  perp_overlap = overlap;
984  gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
985  if (gap <= 0) {
986  if (debug) tprintf("On wrong side\n");
987  continue; // On the wrong side.
988  }
989  gap -= n_width;
990  } else {
991  overlap = std::min(static_cast<int>(nbox.right()), right) - std::max(static_cast<int>(nbox.left()), left);
992  if (overlap == nbox.width() && nbox.height() > nbox.width())
993  perp_overlap = nbox.height();
994  else
995  perp_overlap = overlap;
996  gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
997  if (gap <= 0) {
998  if (debug) tprintf("On wrong side\n");
999  continue; // On the wrong side.
1000  }
1001  gap -= n_height;
1002  }
1003  if (-gap > overlap) {
1004  if (debug) tprintf("Overlaps wrong way\n");
1005  continue; // Overlaps the wrong way.
1006  }
1007  if (perp_overlap < min_decent_overlap) {
1008  if (debug) tprintf("Doesn't overlap enough\n");
1009  continue; // Doesn't overlap enough.
1010  }
1011  bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1012  TabFind::DifferentSizes(width, n_width);
1013  bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1014  blob->MatchingStrokeWidth(*neighbour,
1017  // Best is a fuzzy combination of gap, overlap and is good.
1018  // Basically if you make one thing twice as good without making
1019  // anything else twice as bad, then it is better.
1020  if (gap < 1) gap = 1;
1021  double goodness = (1.0 + is_good) * overlap / gap;
1022  if (debug) {
1023  tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1024  goodness, best_goodness, is_good, overlap, gap);
1025  }
1026  if (goodness > best_goodness) {
1027  best_neighbour = neighbour;
1028  best_goodness = goodness;
1029  best_is_good = is_good;
1030  }
1031  }
1032  blob->set_neighbour(dir, best_neighbour, best_is_good);
1033  return line_trap_count;
1034 }
1035 
1036 // Helper to get a list of 1st-order neighbours.
1037 static void ListNeighbours(const BLOBNBOX* blob,
1038  BLOBNBOX_CLIST* neighbours) {
1039  for (int dir = 0; dir < BND_COUNT; ++dir) {
1040  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1041  BLOBNBOX* neighbour = blob->neighbour(bnd);
1042  if (neighbour != nullptr) {
1043  neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1044  }
1045  }
1046 }
1047 
1048 // Helper to get a list of 1st and 2nd order neighbours.
1049 static void List2ndNeighbours(const BLOBNBOX* blob,
1050  BLOBNBOX_CLIST* neighbours) {
1051  ListNeighbours(blob, neighbours);
1052  for (int dir = 0; dir < BND_COUNT; ++dir) {
1053  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1054  BLOBNBOX* neighbour = blob->neighbour(bnd);
1055  if (neighbour != nullptr) {
1056  ListNeighbours(neighbour, neighbours);
1057  }
1058  }
1059 }
1060 
1061 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
1062 static void List3rdNeighbours(const BLOBNBOX* blob,
1063  BLOBNBOX_CLIST* neighbours) {
1064  List2ndNeighbours(blob, neighbours);
1065  for (int dir = 0; dir < BND_COUNT; ++dir) {
1066  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1067  BLOBNBOX* neighbour = blob->neighbour(bnd);
1068  if (neighbour != nullptr) {
1069  List2ndNeighbours(neighbour, neighbours);
1070  }
1071  }
1072 }
1073 
1074 // Helper to count the evidence for verticalness or horizontalness
1075 // in a list of neighbours.
1076 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1077  int* pure_h_count, int* pure_v_count) {
1078  if (neighbours->length() <= kMostlyOneDirRatio)
1079  return;
1080  BLOBNBOX_C_IT it(neighbours);
1081  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1082  BLOBNBOX* blob = it.data();
1083  int h_min, h_max, v_min, v_max;
1084  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1085  if (debug)
1086  tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1087  if (h_max < v_min ||
1088  blob->leader_on_left() || blob->leader_on_right()) {
1089  // Horizontal gaps are clear winners. Count a pure horizontal.
1090  ++*pure_h_count;
1091  if (debug) tprintf("Horz at:");
1092  } else if (v_max < h_min) {
1093  // Vertical gaps are clear winners. Clear a pure vertical.
1094  ++*pure_v_count;
1095  if (debug) tprintf("Vert at:");
1096  } else {
1097  if (debug) tprintf("Neither at:");
1098  }
1099  if (debug)
1100  blob->bounding_box().print();
1101  }
1102 }
1103 
1104 // Makes the blob to be only horizontal or vertical where evidence
1105 // is clear based on gaps of 2nd order neighbours, or definite individual
1106 // blobs.
1107 void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1108  if (blob->DefiniteIndividualFlow())
1109  return;
1110  bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1111  blob->bounding_box().bottom());
1112  if (debug) {
1113  tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1114  blob->flow(), blob->region_type());
1115  blob->bounding_box().print();
1116  }
1117  BLOBNBOX_CLIST neighbours;
1118  List3rdNeighbours(blob, &neighbours);
1119  // The number of pure horizontal and vertical neighbours.
1120  int pure_h_count = 0;
1121  int pure_v_count = 0;
1122  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1123  if (debug) {
1124  HandleClick(blob->bounding_box().left() + 1,
1125  blob->bounding_box().bottom() + 1);
1126  tprintf("SetFlows: h_count=%d, v_count=%d\n",
1127  pure_h_count, pure_v_count);
1128  }
1129  if (!neighbours.empty()) {
1130  blob->set_vert_possible(true);
1131  blob->set_horz_possible(true);
1132  if (pure_h_count > 2 * pure_v_count) {
1133  // Horizontal gaps are clear winners. Clear vertical neighbours.
1134  blob->set_vert_possible(false);
1135  } else if (pure_v_count > 2 * pure_h_count) {
1136  // Vertical gaps are clear winners. Clear horizontal neighbours.
1137  blob->set_horz_possible(false);
1138  }
1139  } else {
1140  // Lonely blob. Can't tell its flow direction.
1141  blob->set_vert_possible(false);
1142  blob->set_horz_possible(false);
1143  }
1144 }
1145 
1146 
1147 // Helper to count the number of horizontal and vertical blobs in a list.
1148 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1149  int* pure_h_count, int* pure_v_count) {
1150  BLOBNBOX_C_IT it(neighbours);
1151  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1152  BLOBNBOX* blob = it.data();
1153  if (blob->UniquelyHorizontal())
1154  ++*pure_h_count;
1155  if (blob->UniquelyVertical())
1156  ++*pure_v_count;
1157  }
1158 }
1159 
1160 // Nullify the neighbours in the wrong directions where the direction
1161 // is clear-cut based on a distance margin. Good for isolating vertical
1162 // text from neighbouring horizontal text.
1163 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
1164  // Case 1: We have text that is likely several characters, blurry and joined
1165  // together.
1166  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1167  blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1168  // The blob is complex (not stick-like).
1169  if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1170  // Horizontal conjoined text.
1171  blob->set_neighbour(BND_ABOVE, nullptr, false);
1172  blob->set_neighbour(BND_BELOW, nullptr, false);
1173  return;
1174  }
1175  if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1176  // Vertical conjoined text.
1177  blob->set_neighbour(BND_LEFT, nullptr, false);
1178  blob->set_neighbour(BND_RIGHT, nullptr, false);
1179  return;
1180  }
1181  }
1182 
1183  // Case 2: This blob is likely a single character.
1184  int margin = gridsize() / 2;
1185  int h_min, h_max, v_min, v_max;
1186  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1187  if ((h_max + margin < v_min && h_max < margin / 2) ||
1188  blob->leader_on_left() || blob->leader_on_right()) {
1189  // Horizontal gaps are clear winners. Clear vertical neighbours.
1190  blob->set_neighbour(BND_ABOVE, nullptr, false);
1191  blob->set_neighbour(BND_BELOW, nullptr, false);
1192  } else if (v_max + margin < h_min && v_max < margin / 2) {
1193  // Vertical gaps are clear winners. Clear horizontal neighbours.
1194  blob->set_neighbour(BND_LEFT, nullptr, false);
1195  blob->set_neighbour(BND_RIGHT, nullptr, false);
1196  }
1197 }
1198 
1199 // Smoothes the vertical/horizontal type of the blob based on the
1200 // 2nd-order neighbours. If reset_all is true, then all blobs are
1201 // changed. Otherwise, only ambiguous blobs are processed.
1202 void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all,
1203  BLOBNBOX* blob) {
1204  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1205  // There are both horizontal and vertical so try to fix it.
1206  BLOBNBOX_CLIST neighbours;
1207  List2ndNeighbours(blob, &neighbours);
1208  // The number of pure horizontal and vertical neighbours.
1209  int pure_h_count = 0;
1210  int pure_v_count = 0;
1211  CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1213  blob->bounding_box().bottom())) {
1214  HandleClick(blob->bounding_box().left() + 1,
1215  blob->bounding_box().bottom() + 1);
1216  tprintf("pure_h=%d, pure_v=%d\n",
1217  pure_h_count, pure_v_count);
1218  }
1219  if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1220  // Horizontal gaps are clear winners. Clear vertical neighbours.
1221  blob->set_vert_possible(false);
1222  blob->set_horz_possible(true);
1223  } else if (pure_v_count > pure_h_count &&
1224  !FindingHorizontalOnly(pageseg_mode)) {
1225  // Vertical gaps are clear winners. Clear horizontal neighbours.
1226  blob->set_horz_possible(false);
1227  blob->set_vert_possible(true);
1228  }
1229  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1230  blob->bounding_box().bottom())) {
1231  HandleClick(blob->bounding_box().left() + 1,
1232  blob->bounding_box().bottom() + 1);
1233  tprintf("Clean on pass 3!\n");
1234  }
1235 }
1236 
1237 // Partition creation. Accumulates vertical and horizontal text chains,
1238 // puts the remaining blobs in as unknowns, and then merges/splits to
1239 // minimize overlap and smoothes the types with neighbours and the color
1240 // image if provided. rerotation is used to rotate the coordinate space
1241 // back to the nontext_map_ image.
1242 // If find_problems is true, detects possible noise pollution by the amount
1243 // of partition overlap that is created by the diacritics. If excessive, the
1244 // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1245 // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1246 // the components, saves the skew_angle and returns PFR_SKEW.] If the return
1247 // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1248 // called again after cleaning up the partly done work.
1249 PartitionFindResult StrokeWidth::FindInitialPartitions(
1250  PageSegMode pageseg_mode, const FCOORD& rerotation, bool find_problems,
1251  TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1252  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1253  FCOORD* skew_angle) {
1254  if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1255  if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1257  chains_win_ = MakeWindow(0, 400, "Initial text chains");
1258  part_grid->DisplayBoxes(chains_win_);
1259  projection_->DisplayProjection();
1260  }
1261  if (find_problems) {
1262  // TODO(rays) Do something to find skew, set skew_angle and return if there
1263  // is some.
1264  }
1265  part_grid->SplitOverlappingPartitions(big_parts);
1266  EasyMerges(part_grid);
1267  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1268  TBOX grid_box(bleft(), tright());
1269  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1270  rerotation));
1271  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1272  grid_box, rerotation));
1273  int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);
1274  TestDiacritics(part_grid, block);
1275  MergeDiacritics(block, part_grid);
1276  if (find_problems && diacritic_blobs != nullptr &&
1277  DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1278  diacritic_blobs)) {
1279  return PFR_NOISE;
1280  }
1282  textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1283  part_grid->DisplayBoxes(textlines_win_);
1284  diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1285  }
1286  PartitionRemainingBlobs(pageseg_mode, part_grid);
1287  part_grid->SplitOverlappingPartitions(big_parts);
1288  EasyMerges(part_grid);
1289  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1290  rerotation));
1291  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1292  grid_box, rerotation));
1293  // Now eliminate strong stuff in a sea of the opposite.
1294  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1295  grid_box, rerotation));
1297  smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1298  part_grid->DisplayBoxes(smoothed_win_);
1299  }
1300  return PFR_OK;
1301 }
1302 
1303 // Detects noise by a significant increase in partition overlap from
1304 // pre_overlap to now, and removes noise from the union of all the overlapping
1305 // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1306 // was found and removed.
1307 bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
1308  TO_BLOCK* block,
1309  ColPartitionGrid* part_grid,
1310  BLOBNBOX_LIST* diacritic_blobs) {
1311  ColPartitionGrid* noise_grid = nullptr;
1312  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1313  if (pre_overlap == 0) pre_overlap = 1;
1314  BLOBNBOX_IT diacritic_it(diacritic_blobs);
1315  if (noise_grid != nullptr) {
1316  if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1317  post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1318  // This is noisy enough to fix.
1320  ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
1321  noise_grid->DisplayBoxes(noise_win);
1322  }
1323  part_grid->DeleteNonLeaderParts();
1324  BLOBNBOX_IT blob_it(&block->noise_blobs);
1325  ColPartitionGridSearch rsearch(noise_grid);
1326  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1327  BLOBNBOX* blob = blob_it.data();
1328  blob->ClearNeighbours();
1329  if (!blob->IsDiacritic() || blob->owner() != nullptr)
1330  continue; // Not a noise candidate.
1331  TBOX blob_box(blob->bounding_box());
1332  TBOX search_box(blob->bounding_box());
1333  search_box.pad(gridsize(), gridsize());
1334  rsearch.StartRectSearch(search_box);
1335  ColPartition* part = rsearch.NextRectSearch();
1336  if (part != nullptr) {
1337  // Consider blob as possible noise.
1338  blob->set_owns_cblob(true);
1339  blob->compute_bounding_box();
1340  diacritic_it.add_after_then_move(blob_it.extract());
1341  }
1342  }
1343  noise_grid->DeleteParts();
1344  delete noise_grid;
1345  return true;
1346  }
1347  noise_grid->DeleteParts();
1348  delete noise_grid;
1349  }
1350  return false;
1351 }
1352 
1353 // Helper verifies that blob's neighbour in direction dir is good to add to a
1354 // vertical text chain by returning the neighbour if it is not null, not owned,
1355 // and not uniquely horizontal, as well as its neighbour in the opposite
1356 // direction is blob.
1357 static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1358  BlobNeighbourDir dir) {
1359  BLOBNBOX* next_blob = blob->neighbour(dir);
1360  if (next_blob == nullptr || next_blob->owner() != nullptr ||
1361  next_blob->UniquelyHorizontal())
1362  return nullptr;
1363  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1364  return next_blob;
1365  return nullptr;
1366 }
1367 
1368 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
1369 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1370  // A PageSegMode that forces vertical textlines with the current rotation.
1371  PageSegMode pageseg_mode =
1372  rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1373  BlobGridSearch gsearch(this);
1374  BLOBNBOX* bbox;
1375  gsearch.StartFullSearch();
1376  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1377  // Only process boxes that have no horizontal hope and have not yet
1378  // been included in a chain.
1379  BLOBNBOX* blob;
1380  if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&
1381  (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {
1382  // Put all the linked blobs into a ColPartition.
1383  ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1384  part->AddBox(bbox);
1385  while (blob != nullptr) {
1386  part->AddBox(blob);
1387  blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1388  }
1389  blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1390  while (blob != nullptr) {
1391  part->AddBox(blob);
1392  blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1393  }
1394  CompletePartition(pageseg_mode, part, part_grid);
1395  }
1396  }
1397 }
1398 
1399 // Helper verifies that blob's neighbour in direction dir is good to add to a
1400 // horizontal text chain by returning the neighbour if it is not null, not
1401 // owned, and not uniquely vertical, as well as its neighbour in the opposite
1402 // direction is blob.
1403 static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1404  BlobNeighbourDir dir) {
1405  BLOBNBOX* next_blob = blob->neighbour(dir);
1406  if (next_blob == nullptr || next_blob->owner() != nullptr ||
1407  next_blob->UniquelyVertical())
1408  return nullptr;
1409  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1410  return next_blob;
1411  return nullptr;
1412 }
1413 
1414 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1415 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1416  // A PageSegMode that forces horizontal textlines with the current rotation.
1417  PageSegMode pageseg_mode =
1418  rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1419  BlobGridSearch gsearch(this);
1420  BLOBNBOX* bbox;
1421  gsearch.StartFullSearch();
1422  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1423  BLOBNBOX* blob;
1424  if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&
1425  (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {
1426  // Put all the linked blobs into a ColPartition.
1427  ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1428  part->AddBox(bbox);
1429  while (blob != nullptr) {
1430  part->AddBox(blob);
1431  blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1432  }
1433  blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1434  while (blob != nullptr) {
1435  part->AddBox(blob);
1436  blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1437  }
1438  CompletePartition(pageseg_mode, part, part_grid);
1439  }
1440  }
1441 }
1442 
1443 // Finds diacritics and saves their base character in the blob.
1444 // The objective is to move all diacritics to the noise_blobs list, so
1445 // they don't mess up early textline finding/merging, or force splits
1446 // on textlines that overlap a bit. Blobs that become diacritics must be
1447 // either part of no ColPartition (nullptr owner) or in a small partition in
1448 // which ALL the blobs are diacritics, in which case the partition is
1449 // exploded (deleted) back to its blobs.
1450 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1451  BlobGrid small_grid(gridsize(), bleft(), tright());
1452  small_grid.InsertBlobList(&block->noise_blobs);
1453  small_grid.InsertBlobList(&block->blobs);
1454  int medium_diacritics = 0;
1455  int small_diacritics = 0;
1456  BLOBNBOX_IT small_it(&block->noise_blobs);
1457  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1458  BLOBNBOX* blob = small_it.data();
1459  if (blob->owner() == nullptr && !blob->IsDiacritic() &&
1460  DiacriticBlob(&small_grid, blob)) {
1461  ++small_diacritics;
1462  }
1463  }
1464  BLOBNBOX_IT blob_it(&block->blobs);
1465  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1466  BLOBNBOX* blob = blob_it.data();
1467  if (blob->IsDiacritic()) {
1468  small_it.add_to_end(blob_it.extract());
1469  continue; // Already a diacritic.
1470  }
1471  ColPartition* part = blob->owner();
1472  if (part == nullptr && DiacriticBlob(&small_grid, blob)) {
1473  ++medium_diacritics;
1474  RemoveBBox(blob);
1475  small_it.add_to_end(blob_it.extract());
1476  } else if (part != nullptr && !part->block_owned() &&
1477  part->boxes_count() < 3) {
1478  // We allow blobs in small partitions to become diacritics if ALL the
1479  // blobs in the partition qualify as we can then cleanly delete the
1480  // partition, turn all the blobs in it to diacritics and they can be
1481  // merged into the base character partition more easily than merging
1482  // the partitions.
1483  BLOBNBOX_C_IT box_it(part->boxes());
1484  for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1485  DiacriticBlob(&small_grid, box_it.data());
1486  box_it.forward());
1487  if (box_it.cycled_list()) {
1488  // They are all good.
1489  while (!box_it.empty()) {
1490  // Liberate the blob from its partition so it can be treated
1491  // as a diacritic and merged explicitly with the base part.
1492  // The blob is really owned by the block. The partition "owner"
1493  // is nulled to allow the blob to get merged with its base character
1494  // partition.
1495  BLOBNBOX* box = box_it.extract();
1496  box->set_owner(nullptr);
1497  box_it.forward();
1498  ++medium_diacritics;
1499  // We remove the blob from the grid so it isn't found by subsequent
1500  // searches where we might not want to include diacritics.
1501  RemoveBBox(box);
1502  }
1503  // We only move the one blob to the small list here, but the others
1504  // all get moved by the test at the top of the loop.
1505  small_it.add_to_end(blob_it.extract());
1506  part_grid->RemoveBBox(part);
1507  delete part;
1508  }
1509  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1510  blob->bounding_box().bottom())) {
1511  tprintf("Blob not available to be a diacritic at:");
1512  blob->bounding_box().print();
1513  }
1514  }
1516  tprintf("Found %d small diacritics, %d medium\n",
1517  small_diacritics, medium_diacritics);
1518  }
1519 }
1520 
1521 // Searches this grid for an appropriately close and sized neighbour of the
1522 // given [small] blob. If such a blob is found, the diacritic base is saved
1523 // in the blob and true is returned.
1524 // The small_grid is a secondary grid that contains the small/noise objects
1525 // that are not in this grid, but may be useful for determining a connection
1526 // between blob and its potential base character. (See DiacriticXGapFilled.)
1527 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1528  if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
1529  blob->region_type() == BRT_VERT_TEXT)
1530  return false;
1531  TBOX small_box(blob->bounding_box());
1532  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1533  small_box.bottom());
1534  if (debug) {
1535  tprintf("Testing blob for diacriticness at:");
1536  small_box.print();
1537  }
1538  int x = (small_box.left() + small_box.right()) / 2;
1539  int y = (small_box.bottom() + small_box.top()) / 2;
1540  int grid_x, grid_y;
1541  GridCoords(x, y, &grid_x, &grid_y);
1542  int height = small_box.height();
1543  // Setup a rectangle search to find its nearest base-character neighbour.
1544  // We keep 2 different best candidates:
1545  // best_x_overlap is a category of base characters that have an overlap in x
1546  // (like a acute) in which we look for the least y-gap, computed using the
1547  // projection to favor base characters in the same textline.
1548  // best_y_overlap is a category of base characters that have no x overlap,
1549  // (nominally a y-overlap is preferrecd but not essential) in which we
1550  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1551  // a lower weight to catch quotes at the end of a textline.
1552  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1553  // character to the FARTHEST side of the diacritic to allow small diacritics
1554  // to be a reasonable distance away, but not big diacritics.
1555  BLOBNBOX* best_x_overlap = nullptr;
1556  BLOBNBOX* best_y_overlap = nullptr;
1557  int best_total_dist = 0;
1558  int best_y_gap = 0;
1559  TBOX best_xbox;
1560  // TODO(rays) the search box could be setup using the projection as a guide.
1561  TBOX search_box(small_box);
1562  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1563  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1564  search_box.pad(x_pad, y_pad);
1565  BlobGridSearch rsearch(this);
1566  rsearch.SetUniqueMode(true);
1567  int min_height = height * kMinDiacriticSizeRatio;
1568  rsearch.StartRectSearch(search_box);
1569  BLOBNBOX* neighbour;
1570  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1571  if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1572  neighbour == blob || neighbour->owner() == blob->owner())
1573  continue;
1574  TBOX nbox = neighbour->bounding_box();
1575  if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||
1576  (neighbour->flow() != BTFT_CHAIN &&
1577  neighbour->flow() != BTFT_STRONG_CHAIN)) {
1578  if (debug) {
1579  tprintf("Neighbour not strong enough:");
1580  nbox.print();
1581  }
1582  continue; // Diacritics must be attached to strong text.
1583  }
1584  if (nbox.height() < min_height) {
1585  if (debug) {
1586  tprintf("Neighbour not big enough:");
1587  nbox.print();
1588  }
1589  continue; // Too small to be the base character.
1590  }
1591  int x_gap = small_box.x_gap(nbox);
1592  int y_gap = small_box.y_gap(nbox);
1593  int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1594  true, denorm_,
1595  debug);
1596  if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1597  x_gap, y_gap, total_distance);
1598  if (total_distance >
1599  neighbour->owner()->median_height() * kMaxDiacriticDistanceRatio) {
1600  if (debug) {
1601  tprintf("Neighbour with median size %d too far away:",
1602  neighbour->owner()->median_height());
1603  neighbour->bounding_box().print();
1604  }
1605  continue; // Diacritics must not be too distant.
1606  }
1607  if (x_gap <= 0) {
1608  if (debug) {
1609  tprintf("Computing reduced box for :");
1610  nbox.print();
1611  }
1612  int left = small_box.left() - small_box.width();
1613  int right = small_box.right() + small_box.width();
1614  nbox = neighbour->BoundsWithinLimits(left, right);
1615  y_gap = small_box.y_gap(nbox);
1616  if (best_x_overlap == nullptr || y_gap < best_y_gap) {
1617  best_x_overlap = neighbour;
1618  best_xbox = nbox;
1619  best_y_gap = y_gap;
1620  if (debug) {
1621  tprintf("New best:");
1622  nbox.print();
1623  }
1624  } else if (debug) {
1625  tprintf("Shrunken box doesn't win:");
1626  nbox.print();
1627  }
1628  } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1629  if (best_y_overlap == nullptr || total_distance < best_total_dist) {
1630  if (debug) {
1631  tprintf("New best y overlap:");
1632  nbox.print();
1633  }
1634  best_y_overlap = neighbour;
1635  best_total_dist = total_distance;
1636  } else if (debug) {
1637  tprintf("New y overlap box doesn't win:");
1638  nbox.print();
1639  }
1640  } else if (debug) {
1641  tprintf("Neighbour wrong side of a tab:");
1642  nbox.print();
1643  }
1644  }
1645  if (best_x_overlap != nullptr &&
1646  (best_y_overlap == nullptr ||
1647  best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1648  blob->set_diacritic_box(best_xbox);
1649  blob->set_base_char_blob(best_x_overlap);
1650  if (debug) {
1651  tprintf("DiacriticBlob OK! (x-overlap:");
1652  small_box.print();
1653  best_xbox.print();
1654  }
1655  return true;
1656  }
1657  if (best_y_overlap != nullptr &&
1658  DiacriticXGapFilled(small_grid, small_box,
1659  best_y_overlap->bounding_box()) &&
1660  NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1661  blob->set_diacritic_box(best_y_overlap->bounding_box());
1662  blob->set_base_char_blob(best_y_overlap);
1663  if (debug) {
1664  tprintf("DiacriticBlob OK! (y-overlap:");
1665  small_box.print();
1666  best_y_overlap->bounding_box().print();
1667  }
1668  return true;
1669  }
1670  if (debug) {
1671  tprintf("DiacriticBlob fails:");
1672  small_box.print();
1673  tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1674  if (best_y_overlap != nullptr) {
1675  tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1676  DiacriticXGapFilled(small_grid, small_box,
1677  best_y_overlap->bounding_box()),
1678  NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1679  }
1680  }
1681  return false;
1682 }
1683 
1684 // Returns true if there is no gap between the base char and the diacritic
1685 // bigger than a fraction of the height of the base char:
1686 // Eg: line end.....'
1687 // The quote is a long way from the end of the line, yet it needs to be a
1688 // diacritic. To determine that the quote is not part of an image, or
1689 // a different text block, we check for other marks in the gap between
1690 // the base char and the diacritic.
1691 // '<--Diacritic
1692 // |---------|
1693 // | |<-toobig-gap->
1694 // | Base |<ok gap>
1695 // |---------| x<-----Dot occupying gap
1696 // The grid is const really.
1697 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1698  const TBOX& diacritic_box,
1699  const TBOX& base_box) {
1700  // Since most gaps are small, use an iterative algorithm to search the gap.
1701  int max_gap = IntCastRounded(base_box.height() *
1703  TBOX occupied_box(base_box);
1704  int diacritic_gap;
1705  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1706  TBOX search_box(occupied_box);
1707  if (diacritic_box.left() > search_box.right()) {
1708  // We are looking right.
1709  search_box.set_left(search_box.right());
1710  search_box.set_right(search_box.left() + max_gap);
1711  } else {
1712  // We are looking left.
1713  search_box.set_right(search_box.left());
1714  search_box.set_left(search_box.left() - max_gap);
1715  }
1716  BlobGridSearch rsearch(grid);
1717  rsearch.StartRectSearch(search_box);
1718  BLOBNBOX* neighbour;
1719  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1720  const TBOX& nbox = neighbour->bounding_box();
1721  if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1722  if (nbox.left() < occupied_box.left())
1723  occupied_box.set_left(nbox.left());
1724  if (nbox.right() > occupied_box.right())
1725  occupied_box.set_right(nbox.right());
1726  break;
1727  }
1728  }
1729  if (neighbour == nullptr)
1730  return false; // Found a big gap.
1731  }
1732  return true; // The gap was filled.
1733 }
1734 
1735 // Merges diacritics with the ColPartition of the base character blob.
1736 void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1737  ColPartitionGrid* part_grid) {
1738  BLOBNBOX_IT small_it(&block->noise_blobs);
1739  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1740  BLOBNBOX* blob = small_it.data();
1741  if (blob->base_char_blob() != nullptr) {
1742  ColPartition* part = blob->base_char_blob()->owner();
1743  // The base character must be owned by a partition and that partition
1744  // must not be on the big_parts list (not block owned).
1745  if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&
1746  blob->IsDiacritic()) {
1747  // The partition has to be removed from the grid and reinserted
1748  // because its bounding box may change.
1749  part_grid->RemoveBBox(part);
1750  part->AddBox(blob);
1751  blob->set_region_type(part->blob_type());
1752  blob->set_flow(part->flow());
1753  blob->set_owner(part);
1754  part_grid->InsertBBox(true, true, part);
1755  }
1756  // Set all base chars to nullptr before any blobs get deleted.
1757  blob->set_base_char_blob(nullptr);
1758  }
1759  }
1760 }
1761 
1762 // Any blobs on the large_blobs list of block that are still unowned by a
1763 // ColPartition, are probably drop-cap or vertically touching so the blobs
1764 // are removed to the big_parts list and treated separately.
1765 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1766  ColPartitionGrid* part_grid,
1767  ColPartition_LIST* big_parts) {
1768  BLOBNBOX_IT large_it(&block->large_blobs);
1769  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1770  BLOBNBOX* blob = large_it.data();
1771  ColPartition* big_part = blob->owner();
1772  if (big_part == nullptr) {
1773  // Large blobs should have gone into partitions by now if they are
1774  // genuine characters, so move any unowned ones out to the big parts
1775  // list. This will include drop caps and vertically touching characters.
1776  ColPartition::MakeBigPartition(blob, big_parts);
1777  }
1778  }
1779 }
1780 
1781 // All remaining unused blobs are put in individual ColPartitions.
1782 void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode,
1783  ColPartitionGrid* part_grid) {
1784  BlobGridSearch gsearch(this);
1785  BLOBNBOX* bbox;
1786  int prev_grid_x = -1;
1787  int prev_grid_y = -1;
1788  BLOBNBOX_CLIST cell_list;
1789  BLOBNBOX_C_IT cell_it(&cell_list);
1790  bool cell_all_noise = true;
1791  gsearch.StartFullSearch();
1792  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1793  int grid_x = gsearch.GridX();
1794  int grid_y = gsearch.GridY();
1795  if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1796  // New cell. Process old cell.
1797  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1798  &cell_list);
1799  cell_it.set_to_list(&cell_list);
1800  prev_grid_x = grid_x;
1801  prev_grid_y = grid_y;
1802  cell_all_noise = true;
1803  }
1804  if (bbox->owner() == nullptr) {
1805  cell_it.add_to_end(bbox);
1806  if (bbox->flow() != BTFT_NONTEXT)
1807  cell_all_noise = false;
1808  } else {
1809  cell_all_noise = false;
1810  }
1811  }
1812  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1813  &cell_list);
1814 }
1815 
1816 // If combine, put all blobs in the cell_list into a single partition, otherwise
1817 // put each one into its own partition.
1818 void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode,
1819  bool combine,
1820  ColPartitionGrid* part_grid,
1821  BLOBNBOX_CLIST* cell_list) {
1822  if (cell_list->empty())
1823  return;
1824  BLOBNBOX_C_IT cell_it(cell_list);
1825  if (combine) {
1826  BLOBNBOX* bbox = cell_it.extract();
1827  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1828  part->AddBox(bbox);
1829  part->set_flow(bbox->flow());
1830  for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1831  part->AddBox(cell_it.extract());
1832  }
1833  CompletePartition(pageseg_mode, part, part_grid);
1834  } else {
1835  for (; !cell_it.empty(); cell_it.forward()) {
1836  BLOBNBOX* bbox = cell_it.extract();
1837  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1838  part->set_flow(bbox->flow());
1839  part->AddBox(bbox);
1840  CompletePartition(pageseg_mode, part, part_grid);
1841  }
1842  }
1843 }
1844 
1845 // Helper function to finish setting up a ColPartition and insert into
1846 // part_grid.
1847 void StrokeWidth::CompletePartition(PageSegMode pageseg_mode,
1848  ColPartition* part,
1849  ColPartitionGrid* part_grid) {
1850  part->ComputeLimits();
1851  TBOX box = part->bounding_box();
1852  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1853  box.bottom());
1854  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1855  // Override value if pageseg_mode disagrees.
1856  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1857  value = part->boxes_count() == 1 ? 0 : -2;
1858  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1859  value = part->boxes_count() == 1 ? 0 : 2;
1860  }
1861  part->SetRegionAndFlowTypesFromProjectionValue(value);
1862  part->ClaimBoxes();
1863  part_grid->InsertBBox(true, true, part);
1864 }
1865 
1866 // Merge partitions where the merge appears harmless.
1867 // As this
1868 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1869  part_grid->Merges(
1870  NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
1871  NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
1872 }
1873 
1874 // Compute a search box based on the orientation of the partition.
1875 // Returns true if a suitable box can be calculated.
1876 // Callback for EasyMerges.
1877 bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1878  if (part->IsVerticalType()) {
1879  box->set_top(box->top() + box->width());
1880  box->set_bottom(box->bottom() - box->width());
1881  } else {
1882  box->set_left(box->left() - box->height());
1883  box->set_right(box->right() + box->height());
1884  }
1885  return true;
1886 }
1887 
1888 // Merge confirmation callback for EasyMerges.
1889 bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1890  const ColPartition* p2) {
1891  ASSERT_HOST(p1 != nullptr && p2 != nullptr);
1892  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1893  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1894  (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1895  return false; // Don't merge confirmed image with text.
1896  if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1897  p1->HCoreOverlap(*p2) <= 0 &&
1898  ((!p1->IsSingleton() &&
1899  !p2->IsSingleton()) ||
1900  !p1->bounding_box().major_overlap(p2->bounding_box())))
1901  return false; // Overlap must be in the text line.
1902  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1903  p1->VCoreOverlap(*p2) <= 0 &&
1904  ((!p1->IsSingleton() &&
1905  !p2->IsSingleton()) ||
1906  (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1907  !p1->OKDiacriticMerge(*p2, false) &&
1908  !p2->OKDiacriticMerge(*p1, false))))
1909  return false; // Overlap must be in the text line.
1910  if (!p1->ConfirmNoTabViolation(*p2))
1911  return false;
1912  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1913  return true;
1914  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1915 }
1916 
1917 // Returns true if there is no significant noise in between the boxes.
1918 bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1919  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1920  nontext_map_);
1921 }
1922 
1926 ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
1927  int x, int y) {
1928  ScrollView* window = nullptr;
1929 #ifndef GRAPHICS_DISABLED
1930  window = MakeWindow(x, y, window_name);
1931  // For every blob in the grid, display it.
1932  window->Brush(ScrollView::NONE);
1933 
1934  // For every bbox in the grid, display it.
1935  BlobGridSearch gsearch(this);
1936  gsearch.StartFullSearch();
1937  BLOBNBOX* bbox;
1938  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1939  const TBOX& box = bbox->bounding_box();
1940  int left_x = box.left();
1941  int right_x = box.right();
1942  int top_y = box.top();
1943  int bottom_y = box.bottom();
1944  int goodness = bbox->GoodTextBlob();
1945  BlobRegionType blob_type = bbox->region_type();
1946  if (bbox->UniquelyVertical())
1947  blob_type = BRT_VERT_TEXT;
1948  if (bbox->UniquelyHorizontal())
1949  blob_type = BRT_TEXT;
1950  BlobTextFlowType flow = bbox->flow();
1951  if (flow == BTFT_NONE) {
1952  if (goodness == 0)
1953  flow = BTFT_NEIGHBOURS;
1954  else if (goodness == 1)
1955  flow = BTFT_CHAIN;
1956  else
1957  flow = BTFT_STRONG_CHAIN;
1958  }
1959  window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1960  window->Rectangle(left_x, bottom_y, right_x, top_y);
1961  }
1962  window->Update();
1963 #endif
1964  return window;
1965 }
1966 
1967 static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
1968 #ifndef GRAPHICS_DISABLED
1969  const TBOX& blob_box(blob->bounding_box());
1970  int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());
1971  int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());
1972  int x = (blob_box.left() + blob_box.right()) / 2;
1973  window->Line(x, top, x, bottom);
1974 #endif // GRAPHICS_DISABLED
1975 }
1976 
1977 // Displays blobs colored according to whether or not they are diacritics.
1978 ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1979  int x, int y, TO_BLOCK* block) {
1980  ScrollView* window = nullptr;
1981 #ifndef GRAPHICS_DISABLED
1982  window = MakeWindow(x, y, window_name);
1983  // For every blob in the grid, display it.
1984  window->Brush(ScrollView::NONE);
1985 
1986  BLOBNBOX_IT it(&block->blobs);
1987  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1988  BLOBNBOX* blob = it.data();
1989  if (blob->IsDiacritic()) {
1990  window->Pen(ScrollView::GREEN);
1991  DrawDiacriticJoiner(blob, window);
1992  } else {
1993  window->Pen(blob->BoxColor());
1994  }
1995  const TBOX& box = blob->bounding_box();
1996  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
1997  }
1998  it.set_to_list(&block->noise_blobs);
1999  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2000  BLOBNBOX* blob = it.data();
2001  if (blob->IsDiacritic()) {
2002  window->Pen(ScrollView::GREEN);
2003  DrawDiacriticJoiner(blob, window);
2004  } else {
2005  window->Pen(ScrollView::WHITE);
2006  }
2007  const TBOX& box = blob->bounding_box();
2008  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2009  }
2010  window->Update();
2011 #endif
2012  return window;
2013 }
2014 
2015 } // namespace tesseract.
float area_stroke_width() const
Definition: blobbox.h:350
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
BlobNeighbourDir
Definition: blobbox.h:88
int textord_tabfind_show_strokewidths
Definition: strokewidth.cpp:41
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:36
int GoodTextBlob() const
Definition: blobbox.cpp:227
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:408
void set_vert_possible(bool value)
Definition: blobbox.h:305
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:253
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:591
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:66
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
float horz_stroke_width() const
Definition: blobbox.h:338
int gridsize() const
Definition: bbgrid.h:64
void set_top(int y)
Definition: rect.h:61
void print() const
Definition: rect.h:278
BBC * NextFullSearch()
Definition: bbgrid.h:677
int y_gap(const TBOX &box) const
Definition: rect.h:233
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:399
void set_leader_on_right(bool flag)
Definition: blobbox.h:368
void set_bottom(int y)
Definition: rect.h:68
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:52
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:334
const ICOORD & bleft() const
Definition: bbgrid.h:73
bool IsDiacritic() const
Definition: blobbox.h:381
BlobRegionType
Definition: blobbox.h:73
Definition: rect.h:34
void set_owns_cblob(bool value)
Definition: blobbox.h:409
BlobTextFlowType flow() const
Definition: blobbox.h:296
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:406
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:431
static bool WithinTestRegion(int detail_level, int x, int y)
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:64
const double kNoiseOverlapGrowthFactor
int x_gap(const TBOX &box) const
Definition: rect.h:225
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int base_char_top() const
Definition: blobbox.h:384
BlobTextFlowType
Definition: blobbox.h:115
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:201
void set_x(float xin)
rewrite function
Definition: points.h:215
void RemoveLineResidue(ColPartition_LIST *big_part_list)
Definition: statistc.h:33
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
static void Update()
Definition: scrollview.cpp:711
const double kLineResidueSizeRatio
Definition: strokewidth.cpp:99
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:657
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:58
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:93
const float kSizeRatioToReject
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:377
BBC * NextRadSearch()
Definition: bbgrid.h:715
int32_t perimeter()
Definition: stepblob.cpp:294
int base_char_bottom() const
Definition: blobbox.h:387
void set_right(int x)
Definition: rect.h:82
const int kLineTrapLongest
Definition: strokewidth.cpp:88
int16_t width() const
Definition: rect.h:115
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
void set_horz_possible(bool value)
Definition: blobbox.h:311
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
const double kNeighbourSearchFactor
int16_t left() const
Definition: rect.h:72
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:72
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:414
int16_t top() const
Definition: rect.h:58
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:700
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:287
bool leader_on_right() const
Definition: blobbox.h:365
bool UniquelyHorizontal() const
Definition: blobbox.h:414
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445
Assume a single column of text of variable sizes.
Definition: publictypes.h:169
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:80
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Pix *nontext_map)
virtual void HandleClick(int x, int y)
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:356
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:374
integer coordinate
Definition: points.h:32
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:95
bool horz_possible() const
Definition: blobbox.h:308
BlobRegionType region_type() const
Definition: blobbox.h:284
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
int textord_debug_tabfind
Definition: alignedblob.cpp:28
bool joined_to_prev() const
Definition: blobbox.h:257
int IntCastRounded(double x)
Definition: helpers.h:168
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:182
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:403
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:31
PartitionFindResult
Definition: strokewidth.h:46
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:45
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
float vert_stroke_width() const
Definition: blobbox.h:344
void ClearNeighbours()
Definition: blobbox.h:511
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:74
int32_t area() const
Definition: rect.h:122
const int kCJKMaxComponents
Definition: strokewidth.cpp:60
void DeleteUnownedNoise()
Definition: blobbox.cpp:1038
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:77
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Pix *nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
const int kCJKRadius
Definition: strokewidth.cpp:56
void AddBox(BLOBNBOX *box)
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:71
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:83
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:299
void set_left(int x)
Definition: rect.h:75
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:107
void set_leader_on_left(bool flag)
Definition: blobbox.h:362
const double kNoiseOverlapAreaFactor
bool vert_possible() const
Definition: blobbox.h:302
bool overlap(const TBOX &box) const
Definition: rect.h:355
bool textord_tabfind_only_strokewidths
Definition: strokewidth.cpp:42
bool y_overlap(const TBOX &box) const
Definition: rect.h:428
bool contains(const FCOORD pt) const
Definition: rect.h:333
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:447
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
int32_t area()
Definition: stepblob.cpp:275
Definition: points.h:189
const TBOX & bounding_box() const
Definition: blobbox.h:231
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:68
bool IsVerticalType() const
Definition: colpartition.h:442
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:293
bool leader_on_left() const
Definition: blobbox.h:359
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:577
int16_t right() const
Definition: rect.h:79
float x() const
Definition: points.h:208
BLOBNBOX_LIST blobs
Definition: blobbox.h:785
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:482
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
tesseract::ColPartition * owner() const
Definition: blobbox.h:353
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
const double kStrokeWidthCJK
Definition: strokewidth.cpp:53
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:789
bool UniquelyVertical() const
Definition: blobbox.h:411
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:50
void StartFullSearch()
Definition: bbgrid.h:667
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:371
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void set_y(float yin)
rewrite function
Definition: points.h:219
int right_rule() const
Definition: blobbox.h:320
const int kLineResiduePadRatio
Definition: strokewidth.cpp:97
const double kCJKAspectRatio
Definition: strokewidth.cpp:62
int16_t height() const
Definition: rect.h:108
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:104
C_BLOB * cblob() const
Definition: blobbox.h:269
void pad(int xpad, int ypad)
Definition: rect.h:131
#define INT_VAR(name, val, comment)
Definition: params.h:276
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:534
const ICOORD & tright() const
Definition: bbgrid.h:76
float y() const
Definition: points.h:211
void compute_bounding_box()
Definition: blobbox.h:241
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:788
const int kLineTrapShortest
Definition: strokewidth.cpp:90
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:306
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:787
#define ASSERT_HOST(x)
Definition: errcode.h:84
void Brush(Color color)
Definition: scrollview.cpp:728
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:445