tesseract  5.0.0-alpha-619-ge9db
strokewidth.cpp
Go to the documentation of this file.
1 // File: strokewidth.cpp
3 // Description: Subclass of BBGrid to find uniformity of strokewidth.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config_auto.h"
21 #endif
22 
23 #include "strokewidth.h"
24 
25 #include <algorithm>
26 #include <cmath>
27 
28 #include "blobbox.h"
29 #include "colpartition.h"
30 #include "colpartitiongrid.h"
31 #include "imagefind.h"
32 #include "linlsq.h"
33 #include "statistc.h"
34 #include "tabfind.h"
35 #include "textlineprojection.h"
36 #include "tordmain.h" // For SetBlobStrokeWidth.
37 
38 namespace tesseract {
39 
40 static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
41 static BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
42 
// NOTE(review): the descriptions of the next two constants are not present in
// this view. Judging by the "Same but for CJK" remark below, they are
// presumably the default (non-CJK) fractional and absolute stroke-width
// matching tolerances -- confirm against the original file before relying on
// this.
44 const double kStrokeWidthFractionTolerance = 0.125;
49 const double kStrokeWidthTolerance = 1.5;
50 // Same but for CJK we are a bit more generous.
51 const double kStrokeWidthFractionCJK = 0.25;
52 const double kStrokeWidthCJK = 2.0;
53 // Radius in grid cells of search for broken CJK. Doesn't need to be very
54 // large as the grid size should be about the size of a character anyway.
55 const int kCJKRadius = 2;
56 // Max distance fraction of size to join close but broken CJK characters.
57 const double kCJKBrokenDistanceFraction = 0.25;
58 // Max number of components in a broken CJK character.
59 const int kCJKMaxComponents = 8;
60 // Max aspect ratio of CJK broken characters when put back together.
61 const double kCJKAspectRatio = 1.25;
62 // Max increase in aspect ratio of CJK broken characters when merged.
63 const double kCJKAspectRatioIncrease = 1.0625;
64 // Max multiple of the grid size that will be used in computing median CJKsize.
65 const int kMaxCJKSizeRatio = 5;
66 // Min fraction of blobs broken CJK to iterate and run it again.
67 const double kBrokenCJKIterationFraction = 0.125;
68 // Multiple of gridsize as x-padding for a search box for diacritic base
69 // characters.
70 const double kDiacriticXPadRatio = 7.0;
71 // Multiple of gridsize as y-padding for a search box for diacritic base
72 // characters.
73 const double kDiacriticYPadRatio = 1.75;
74 // Min multiple of diacritic height that a neighbour must be to be a
75 // convincing base character.
76 const double kMinDiacriticSizeRatio = 1.0625;
77 // Max multiple of a textline's median height as a threshold for the sum of
78 // a diacritic's farthest x and y distances (gap + size).
79 const double kMaxDiacriticDistanceRatio = 1.25;
80 // Max x-gap between a diacritic and its base char as a fraction of the height
81 // of the base char (allowing other blobs to fill the gap.)
83 // Ratio between longest side of a line and longest side of a character.
84 // (neighbor_min > blob_min * kLineTrapShortest &&
85 // neighbor_max < blob_max / kLineTrapLongest)
86 // => neighbor is a grapheme and blob is a line.
87 const int kLineTrapLongest = 4;
88 // Ratio between shortest side of a line and shortest side of a character.
89 const int kLineTrapShortest = 2;
90 // Max aspect ratio of the total box before CountNeighbourGaps
91 // decides immediately based on the aspect ratio.
92 const int kMostlyOneDirRatio = 3;
93 // Aspect ratio for a blob to be considered as line residue.
94 const double kLineResidueAspectRatio = 8.0;
95 // Padding ratio for line residue search box.
96 const int kLineResiduePadRatio = 3;
97 // Min multiple of neighbour size for a line residue to be genuine.
98 const double kLineResidueSizeRatio = 1.75;
99 // Aspect ratio filter for OSD.
100 const float kSizeRatioToReject = 2.0;
101 // Expansion factor for search box for good neighbours.
102 const double kNeighbourSearchFactor = 2.5;
103 // Factor of increase of overlap when adding diacritics to make an image noisy.
104 const double kNoiseOverlapGrowthFactor = 4.0;
105 // Fraction of the image size to add overlap when adding diacritics for an
106 // image to qualify as noisy.
107 const double kNoiseOverlapAreaFactor = 1.0 / 512;
108 
110  const ICOORD& bleft, const ICOORD& tright)
111  : BlobGrid(gridsize, bleft, tright), nontext_map_(nullptr), projection_(nullptr),
112  denorm_(nullptr), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
113  leaders_win_ = nullptr;
114  widths_win_ = nullptr;
115  initial_widths_win_ = nullptr;
116  chains_win_ = nullptr;
117  diacritics_win_ = nullptr;
118  textlines_win_ = nullptr;
119  smoothed_win_ = nullptr;
120 }
121 
123  if (widths_win_ != nullptr) {
124  #ifndef GRAPHICS_DISABLED
125  delete widths_win_->AwaitEvent(SVET_DESTROY);
126  #endif // GRAPHICS_DISABLED
127  if (textord_tabfind_only_strokewidths)
128  exit(0);
129  delete widths_win_;
130  }
131  delete leaders_win_;
132  delete initial_widths_win_;
133  delete chains_win_;
134  delete textlines_win_;
135  delete smoothed_win_;
136  delete diacritics_win_;
137 }
138 
139 // Sets the neighbours member of the medium-sized blobs in the block.
140 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
141 // blobs and sets pointers to the good neighbours.
143  // Run a preliminary strokewidth neighbour detection on the medium blobs.
144  InsertBlobList(&block->blobs);
145  BLOBNBOX_IT blob_it(&block->blobs);
146  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
147  SetNeighbours(false, false, blob_it.data());
148  }
149  Clear();
150 }
151 
152 // Sets the neighbour/textline writing direction members of the medium
153 // and large blobs with optional repair of broken CJK characters first.
154 // Repair of broken CJK is needed here because broken CJK characters
155 // can fool the textline direction detection algorithm.
157  bool cjk_merge,
158  TO_BLOCK* input_block) {
159  // Setup the grid with the remaining (non-noise) blobs.
160  InsertBlobs(input_block);
161  // Repair broken CJK characters if needed.
162  while (cjk_merge && FixBrokenCJK(input_block));
163  // Grade blobs by inspection of neighbours.
164  FindTextlineFlowDirection(pageseg_mode, false);
165  // Clear the grid ready for rotation or leader finding.
166  Clear();
167 }
168 
169 // Helper to collect and count horizontal and vertical blobs from a list.
170 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
171  int* num_vertical_blobs,
172  int* num_horizontal_blobs,
173  BLOBNBOX_CLIST* vertical_blobs,
174  BLOBNBOX_CLIST* horizontal_blobs,
175  BLOBNBOX_CLIST* nondescript_blobs) {
176  BLOBNBOX_C_IT v_it(vertical_blobs);
177  BLOBNBOX_C_IT h_it(horizontal_blobs);
178  BLOBNBOX_C_IT n_it(nondescript_blobs);
179  BLOBNBOX_IT blob_it(input_blobs);
180  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
181  BLOBNBOX* blob = blob_it.data();
182  const TBOX& box = blob->bounding_box();
183  float y_x = static_cast<float>(box.height()) / box.width();
184  float x_y = 1.0f / y_x;
185  // Select a >= 1.0 ratio
186  float ratio = x_y > y_x ? x_y : y_x;
187  // If the aspect ratio is small and we want them for osd, save the blob.
188  bool ok_blob = ratio <= kSizeRatioToReject;
189  if (blob->UniquelyVertical()) {
190  ++*num_vertical_blobs;
191  if (ok_blob) v_it.add_after_then_move(blob);
192  } else if (blob->UniquelyHorizontal()) {
193  ++*num_horizontal_blobs;
194  if (ok_blob) h_it.add_after_then_move(blob);
195  } else if (ok_blob) {
196  n_it.add_after_then_move(blob);
197  }
198  }
199 }
200 
201 
202 // Types all the blobs as vertical or horizontal text or unknown and
203 // returns true if the majority are vertical.
204 // If the blobs are rotated, it is necessary to call CorrectForRotation
205 // after rotating everything, otherwise the work done here will be enough.
206 // If osd_blobs is not null, a list of blobs from the dominant textline
207 // direction is returned for use in orientation and script detection.
208 bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
209  TO_BLOCK* block,
210  BLOBNBOX_CLIST* osd_blobs) {
     // Running totals of uniquely-vertical / uniquely-horizontal blobs. Both
     // CollectHorizVertBlobs calls below add to the same two counters.
211  int vertical_boxes = 0;
212  int horizontal_boxes = 0;
213  // Count vertical normal and large blobs.
     // Candidate output lists. CollectHorizVertBlobs only stores blobs whose
     // aspect ratio is within kSizeRatioToReject, so these lists can be shorter
     // than the counters above.
214  BLOBNBOX_CLIST vertical_blobs;
215  BLOBNBOX_CLIST horizontal_blobs;
216  BLOBNBOX_CLIST nondescript_blobs;
217  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
218  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
219  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
220  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
     // NOTE(review): the embedded listing numbering jumps from 220 to 222 here,
     // so one source line (possibly a debug guard for this tprintf) may be
     // missing from this view -- confirm against the original file.
222  tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
223  horizontal_boxes, vertical_boxes,
224  horizontal_blobs.length(), vertical_blobs.length(),
225  nondescript_blobs.length());
226  if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
227  // Only nondescript blobs available, so return those.
228  BLOBNBOX_C_IT osd_it(osd_blobs);
229  osd_it.add_list_after(&nondescript_blobs);
230  return false;
231  }
     // Vertical wins when its count reaches find_vertical_text_ratio of all
     // direction-typed blobs; the cast truncates the threshold.
     // Because the comparison is >=, a call with osd_blobs == nullptr and no
     // direction-typed blobs at all (threshold 0) returns true.
232  int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
233  find_vertical_text_ratio);
234  if (vertical_boxes >= min_vert_boxes) {
235  if (osd_blobs != nullptr) {
236  BLOBNBOX_C_IT osd_it(osd_blobs);
237  osd_it.add_list_after(&vertical_blobs);
238  }
239  return true;
240  } else {
241  if (osd_blobs != nullptr) {
242  BLOBNBOX_C_IT osd_it(osd_blobs);
243  osd_it.add_list_after(&horizontal_blobs);
244  }
245  return false;
246  }
247 }
248 
249 // Corrects the data structures for the given rotation.
251  ColPartitionGrid* part_grid) {
252  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
253  grid_box_ = TBOX(bleft(), tright());
254  rerotation_.set_x(rotation.x());
255  rerotation_.set_y(-rotation.y());
256 }
257 
258 // Finds leader partitions and inserts them into the given part_grid.
260  ColPartitionGrid* part_grid) {
261  Clear();
262  // Find and isolate leaders in the noise list.
263  ColPartition_LIST leader_parts;
264  FindLeadersAndMarkNoise(block, &leader_parts);
265  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
266  InsertBlobList(&block->blobs);
267  // Mark blobs that have leader neighbours.
268  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
269  ColPartition* part = it.extract();
270  part->ClaimBoxes();
271  MarkLeaderNeighbours(part, LR_LEFT);
272  MarkLeaderNeighbours(part, LR_RIGHT);
273  part_grid->InsertBBox(true, true, part);
274  }
275 }
276 
277 // Finds and marks noise those blobs that look like bits of vertical lines
278 // that would otherwise screw up layout analysis.
279 void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
280  BlobGridSearch gsearch(this);
281  BLOBNBOX* bbox;
282  // For every vertical line-like bbox in the grid, search its neighbours
283  // to find the tallest, and if the original box is taller by sufficient
284  // margin, then call it line residue and delete it.
285  gsearch.StartFullSearch();
286  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
287  TBOX box = bbox->bounding_box();
288  if (box.height() < box.width() * kLineResidueAspectRatio)
289  continue;
290  // Set up a rectangle search around the blob to find the size of its
291  // neighbours.
292  int padding = box.height() * kLineResiduePadRatio;
293  TBOX search_box = box;
294  search_box.pad(padding, padding);
295  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
296  box.bottom());
297  // Find the largest object in the search box not equal to bbox.
298  BlobGridSearch rsearch(this);
299  int max_height = 0;
300  BLOBNBOX* n;
301  rsearch.StartRectSearch(search_box);
302  while ((n = rsearch.NextRectSearch()) != nullptr) {
303  if (n == bbox) continue;
304  TBOX nbox = n->bounding_box();
305  if (nbox.height() > max_height) {
306  max_height = nbox.height();
307  }
308  }
309  if (debug) {
310  tprintf("Max neighbour size=%d for candidate line box at:", max_height);
311  box.print();
312  }
313  if (max_height * kLineResidueSizeRatio < box.height()) {
314  #ifndef GRAPHICS_DISABLED
315  if (leaders_win_ != nullptr) {
316  // We are debugging, so display deleted in pink blobs in the same
317  // window that we use to display leader detection.
318  leaders_win_->Pen(ScrollView::PINK);
319  leaders_win_->Rectangle(box.left(), box.bottom(),
320  box.right(), box.top());
321  }
322  #endif // GRAPHICS_DISABLED
323  ColPartition::MakeBigPartition(bbox, big_part_list);
324  }
325  }
326 }
327 
328 // Types all the blobs as vertical text or horizontal text or unknown and
329 // puts them into initial ColPartitions in the supplied part_grid.
330 // rerotation determines how to get back to the image coordinates from the
331 // blob coordinates (since they may have been rotated for vertical text).
332 // block is the single block for the whole page or rectangle to be OCRed.
333 // nontext_pix (full-size), is a binary mask used to prevent merges across
334 // photo/text boundaries. It is not kept beyond this function.
335 // denorm provides a mapping back to the image from the current blob
336 // coordinate space.
337 // projection provides a measure of textline density over the image and
338 // provides functions to assist with diacritic detection. It should be a
339 // pointer to a new TextlineProjection, and will be setup here.
340 // part_grid is the output grid of textline partitions.
341 // Large blobs that cause overlap are put in separate partitions and added
342 // to the big_parts list.
344  PageSegMode pageseg_mode, const FCOORD& rerotation, TO_BLOCK* block,
345  Pix* nontext_pix, const DENORM* denorm, bool cjk_script,
346  TextlineProjection* projection, BLOBNBOX_LIST* diacritic_blobs,
347  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) {
348  nontext_map_ = nontext_pix;
349  projection_ = projection;
350  denorm_ = denorm;
351  // Clear and re Insert to take advantage of the tab stops in the blobs.
352  Clear();
353  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
354  InsertBlobs(block);
355 
356  // Run FixBrokenCJK() again if the page is CJK.
357  if (cjk_script) {
358  FixBrokenCJK(block);
359  }
360  FindTextlineFlowDirection(pageseg_mode, false);
361  projection_->ConstructProjection(block, rerotation, nontext_map_);
362  if (textord_tabfind_show_strokewidths) {
363  ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
364  projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
365  projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
366  }
367  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
368  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
369  // Clear and re Insert to take advantage of the removed diacritics.
370  Clear();
371  InsertBlobs(block);
372  FCOORD skew;
373  FindTextlineFlowDirection(pageseg_mode, true);
375  FindInitialPartitions(pageseg_mode, rerotation, true, block,
376  diacritic_blobs, part_grid, big_parts, &skew);
377  if (r == PFR_NOISE) {
378  tprintf("Detected %d diacritics\n", diacritic_blobs->length());
379  // Noise was found, and removed.
380  Clear();
381  InsertBlobs(block);
382  FindTextlineFlowDirection(pageseg_mode, true);
383  r = FindInitialPartitions(pageseg_mode, rerotation, false, block,
384  diacritic_blobs, part_grid, big_parts, &skew);
385  }
386  nontext_map_ = nullptr;
387  projection_ = nullptr;
388  denorm_ = nullptr;
389 }
390 
391 static void PrintBoxWidths(BLOBNBOX* neighbour) {
392  const TBOX& nbox = neighbour->bounding_box();
393  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
394  nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
395  neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
396  2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
397 }
398 
// Debug handler for a click at (x, y): prints stroke-width and neighbour
// information for the first blob found whose bounding box contains the click
// and which has a cblob.
400 void StrokeWidth::HandleClick(int x, int y) {
     // NOTE(review): the embedded listing numbering jumps from 400 to 402 here,
     // so one source line may be missing from this view -- confirm against the
     // original file.
402  // Run a radial search for blobs that overlap.
403  BlobGridSearch radsearch(this);
     // Search outwards from the click with a maximum radius argument of 1.
404  radsearch.StartRadSearch(x, y, 1);
405  BLOBNBOX* neighbour;
406  FCOORD click(static_cast<float>(x), static_cast<float>(y));
407  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
408  TBOX nbox = neighbour->bounding_box();
     // Blobs without a cblob are skipped: PrintBoxWidths dereferences it.
409  if (nbox.contains(click) && neighbour->cblob() != nullptr) {
410  PrintBoxWidths(neighbour);
     // Also print each of the four recorded neighbours that exists. Note that
     // their cblob() is not checked before PrintBoxWidths dereferences it.
411  if (neighbour->neighbour(BND_LEFT) != nullptr)
412  PrintBoxWidths(neighbour->neighbour(BND_LEFT));
413  if (neighbour->neighbour(BND_RIGHT) != nullptr)
414  PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
415  if (neighbour->neighbour(BND_ABOVE) != nullptr)
416  PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
417  if (neighbour->neighbour(BND_BELOW) != nullptr)
418  PrintBoxWidths(neighbour->neighbour(BND_BELOW));
419  int gaps[BND_COUNT];
420  neighbour->NeighbourGaps(gaps);
421  tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
422  "Good= %d %d %d %d\n",
423  gaps[BND_LEFT], gaps[BND_RIGHT],
424  gaps[BND_ABOVE], gaps[BND_BELOW],
425  neighbour->horz_possible(),
426  neighbour->vert_possible(),
427  neighbour->good_stroke_neighbour(BND_LEFT),
428  neighbour->good_stroke_neighbour(BND_RIGHT),
429  neighbour->good_stroke_neighbour(BND_ABOVE),
430  neighbour->good_stroke_neighbour(BND_BELOW));
     // Only the first matching blob is reported.
431  break;
432  }
433  }
434 }
435 
436 // Detects and marks leader dots/dashes.
437 // Leaders are horizontal chains of small or noise blobs that look
438 // monospace according to ColPartition::MarkAsLeaderIfMonospaced().
439 // Detected leaders become the only occupants of the block->small_blobs list.
440 // Non-leader small blobs get moved to the blobs list.
441 // Non-leader noise blobs remain singletons in the noise list.
442 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
443 // block is the single block for the whole page or rectangle to be OCRed.
444 // leader_parts is the output.
445 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
446  ColPartition_LIST* leader_parts) {
447  InsertBlobList(&block->small_blobs);
448  InsertBlobList(&block->noise_blobs);
449  BlobGridSearch gsearch(this);
450  BLOBNBOX* bbox;
451  // For every bbox in the grid, set its neighbours.
452  gsearch.StartFullSearch();
453  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
454  SetNeighbours(true, false, bbox);
455  }
456  ColPartition_IT part_it(leader_parts);
457  gsearch.StartFullSearch();
458  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
459  if (bbox->flow() == BTFT_NONE) {
460  if (bbox->neighbour(BND_RIGHT) == nullptr &&
461  bbox->neighbour(BND_LEFT) == nullptr)
462  continue;
463  // Put all the linked blobs into a ColPartition.
464  ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
465  BLOBNBOX* blob;
466  for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
467  blob = blob->neighbour(BND_RIGHT))
468  part->AddBox(blob);
469  for (blob = bbox->neighbour(BND_LEFT); blob != nullptr &&
470  blob->flow() == BTFT_NONE;
471  blob = blob->neighbour(BND_LEFT))
472  part->AddBox(blob);
473  if (part->MarkAsLeaderIfMonospaced())
474  part_it.add_after_then_move(part);
475  else
476  delete part;
477  }
478  }
479  if (textord_tabfind_show_strokewidths) {
480  leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
481  }
482  // Move any non-leaders from the small to the blobs list, as they are
483  // most likely dashes or broken characters.
484  BLOBNBOX_IT blob_it(&block->blobs);
485  BLOBNBOX_IT small_it(&block->small_blobs);
486  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
487  BLOBNBOX* blob = small_it.data();
488  if (blob->flow() != BTFT_LEADER) {
489  if (blob->flow() == BTFT_NEIGHBOURS)
490  blob->set_flow(BTFT_NONE);
491  blob->ClearNeighbours();
492  blob_it.add_to_end(small_it.extract());
493  }
494  }
495  // Move leaders from the noise list to the small list, leaving the small
496  // list exclusively leaders, so they don't get processed further,
497  // and the remaining small blobs all in the noise list.
498  BLOBNBOX_IT noise_it(&block->noise_blobs);
499  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
500  BLOBNBOX* blob = noise_it.data();
501  if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
502  small_it.add_to_end(noise_it.extract());
503  } else if (blob->flow() == BTFT_NEIGHBOURS) {
504  blob->set_flow(BTFT_NONE);
505  blob->ClearNeighbours();
506  }
507  }
508  // Clear the grid as we don't want the small stuff hanging around in it.
509  Clear();
510 }
511 
514 void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
515  InsertBlobList(&block->blobs);
516  InsertBlobList(&block->large_blobs);
517 }
518 
519 // Checks the left or right side of the given leader partition and sets the
520 // (opposite) leader_on_right or leader_on_left flags for blobs
521 // that are next to the given side of the given leader partition.
522 void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
523  LeftOrRight side) {
524  const TBOX& part_box = part->bounding_box();
525  BlobGridSearch blobsearch(this);
526  // Search to the side of the leader for the nearest neighbour.
527  BLOBNBOX* best_blob = nullptr;
528  int best_gap = 0;
529  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
530  : part_box.right(),
531  part_box.bottom(), part_box.top());
532  BLOBNBOX* blob;
533  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
534  const TBOX& blob_box = blob->bounding_box();
535  if (!blob_box.y_overlap(part_box))
536  continue;
537  int x_gap = blob_box.x_gap(part_box);
538  if (x_gap > 2 * gridsize()) {
539  break;
540  } else if (best_blob == nullptr || x_gap < best_gap) {
541  best_blob = blob;
542  best_gap = x_gap;
543  }
544  }
545  if (best_blob != nullptr) {
546  if (side == LR_LEFT)
547  best_blob->set_leader_on_right(true);
548  else
549  best_blob->set_leader_on_left(true);
550  #ifndef GRAPHICS_DISABLED
551  if (leaders_win_ != nullptr) {
552  leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
553  const TBOX& blob_box = best_blob->bounding_box();
554  leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
555  blob_box.right(), blob_box.top());
556  }
557  #endif // GRAPHICS_DISABLED
558  }
559 }
560 
561 // Helper to compute the UQ of the square-ish CJK characters.
562 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
563  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
564  BLOBNBOX_IT it(blobs);
565  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
566  BLOBNBOX* blob = it.data();
567  int width = blob->bounding_box().width();
568  int height = blob->bounding_box().height();
569  if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
570  sizes.add(height, 1);
571  }
572  return static_cast<int>(sizes.ile(0.75f) + 0.5);
573 }
574 
575 // Fix broken CJK characters, using the fake joined blobs mechanism.
576 // Blobs are really merged, ie the master takes all the outlines and the
577 // others are deleted.
578 // Returns true if sufficient blobs are merged that it may be worth running
579 // again, due to a better estimate of character size.
580 bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
581  BLOBNBOX_LIST* blobs = &block->blobs;
     // Despite the name, this is the value returned by UpperQuartileCJKSize,
     // i.e. an upper-quartile height of the square-ish blobs.
582  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
     // Limits passed to AccumulateOverlaps: the maximum gap between pieces and
     // the maximum size of the merged box, both derived from that height.
583  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
584  int max_height = static_cast<int>(median_height * kCJKAspectRatio);
585  int num_fixed = 0;
586  BLOBNBOX_IT blob_it(blobs);
587 
588  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
589  BLOBNBOX* blob = blob_it.data();
     // Skip blobs with no cblob or no outlines; the final counting loop below
     // applies the same test to decide what "remains".
590  if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty())
591  continue;
     // bbox is a copy: AccumulateOverlaps grows it to the union of the boxes it
     // accepts, without touching the blob itself.
592  TBOX bbox = blob->bounding_box();
593  bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
594  bbox.bottom());
595  if (debug) {
596  tprintf("Checking for Broken CJK (max size=%d):", max_height);
597  bbox.print();
598  }
599  // Generate a list of blobs that overlap or are near enough to merge.
600  BLOBNBOX_CLIST overlapped_blobs;
601  AccumulateOverlaps(blob, debug, max_height, max_dist,
602  &bbox, &overlapped_blobs);
603  if (!overlapped_blobs.empty()) {
604  // There are overlapping blobs, so qualify them as being satisfactory
605  // before removing them from the grid and replacing them with the union.
606  // The final box must be roughly square.
607  if (bbox.width() > bbox.height() * kCJKAspectRatio ||
608  bbox.height() > bbox.width() * kCJKAspectRatio) {
609  if (debug) {
610  tprintf("Bad final aspectratio:");
611  bbox.print();
612  }
613  continue;
614  }
615  // There can't be too many blobs to merge.
616  if (overlapped_blobs.length() >= kCJKMaxComponents) {
617  if (debug)
618  tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
619  continue;
620  }
621  // The strokewidths must match amongst the join candidates.
622  BLOBNBOX_C_IT n_it(&overlapped_blobs);
623  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
624  BLOBNBOX* neighbour = nullptr;
625  neighbour = n_it.data();
     // NOTE(review): the embedded listing numbering jumps from 626 to 628
     // after the next line, so the remaining argument(s) and the closing of
     // this call are missing from this view -- confirm against the original
     // file.
626  if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
628  break;
629  }
     // The loop above only stops before completing the cycle when it hit the
     // break, i.e. when some candidate failed the stroke-width match.
630  if (!n_it.cycled_list()) {
631  if (debug) {
632  tprintf("Bad stroke widths:");
633  PrintBoxWidths(blob);
634  }
635  continue; // Not good enough.
636  }
637 
638  // Merge all the candidates into blob.
639  // We must remove blob from the grid and reinsert it after merging
640  // to maintain the integrity of the grid.
641  RemoveBBox(blob);
642  // Everything else will be calculated later.
643  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
644  BLOBNBOX* neighbour = n_it.data();
645  RemoveBBox(neighbour);
646  // Mark empty blob for deletion.
647  neighbour->set_region_type(BRT_NOISE);
648  blob->really_merge(neighbour);
     // When rerotation_ is not the identity (1, 0), the box is rotated after
     // every individual merge, not once at the end.
649  if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
650  blob->rotate_box(rerotation_);
651  }
652  }
653  InsertBBox(true, true, blob);
654  ++num_fixed;
655  if (debug) {
     // Prints the union box accumulated by AccumulateOverlaps, not a freshly
     // queried blob->bounding_box().
656  tprintf("Done! Final box:");
657  bbox.print();
658  }
659  }
660  }
661  // Count remaining blobs.
662  int num_remaining = 0;
663  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
664  BLOBNBOX* blob = blob_it.data();
665  if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
666  ++num_remaining;
667  }
668  }
669  // Permanently delete all the marked blobs after first removing all
670  // references in the neighbour members.
671  block->DeleteUnownedNoise();
     // Worth another pass only if the merges are a large enough fraction of
     // the blobs that still have outlines.
672  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
673 }
674 
675 // Helper function to determine whether it is reasonable to merge the
676 // bbox and the nbox for repairing broken CJK.
677 // The distance apart must not exceed max_dist, the combined size must
678 // not exceed max_size, and the aspect ratio must either improve or at
679 // least not get worse by much.
680 static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
681  bool debug, int max_size, int max_dist,
682  int* x_gap, int* y_gap) {
683  *x_gap = bbox.x_gap(nbox);
684  *y_gap = bbox.y_gap(nbox);
685  TBOX merged(nbox);
686  merged += bbox;
687  if (debug) {
688  tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
689  merged.print();
690  }
691  if (*x_gap <= max_dist && *y_gap <= max_dist &&
692  merged.width() <= max_size && merged.height() <= max_size) {
693  // Close enough to call overlapping. Check aspect ratios.
694  double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
695  if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
696  double new_ratio = static_cast<double>(merged.width()) / merged.height();
697  if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
698  if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
699  return true;
700  }
701  return false;
702 }
703 
704 // Collect blobs that overlap or are within max_dist of the input bbox.
705 // Return them in the list of blobs and expand the bbox to be the union
706 // of all the boxes. not_this is excluded from the search, as are blobs
707 // that cause the merged box to exceed max_size in either dimension.
708 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
709  int max_size, int max_dist,
710  TBOX* bbox, BLOBNBOX_CLIST* blobs) {
711  // While searching, nearests holds the nearest failed blob in each
712  // direction. When we have a nearest in each of the 4 directions, then
713  // the search is over, and at this point the final bbox must not overlap
714  // any of the nearests.
715  BLOBNBOX* nearests[BND_COUNT];
716  for (auto & nearest : nearests) {
717  nearest = nullptr;
718  }
     // The radial search is centred on the middle of the starting box; the
     // centre is not recomputed as *bbox grows.
719  int x = (bbox->left() + bbox->right()) / 2;
720  int y = (bbox->bottom() + bbox->top()) / 2;
721  // Run a radial search for blobs that overlap or are sufficiently close.
722  BlobGridSearch radsearch(this);
723  radsearch.StartRadSearch(x, y, kCJKRadius);
724  BLOBNBOX* neighbour;
725  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
726  if (neighbour == not_this) continue;
727  TBOX nbox = neighbour->bounding_box();
     // AcceptableCJKMerge always writes both gaps before returning, so the
     // else-if branches below read initialized values.
728  int x_gap, y_gap;
729  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
730  &x_gap, &y_gap)) {
731  // Close enough to call overlapping. Merge boxes.
732  *bbox += nbox;
733  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
734  if (debug) {
735  tprintf("Added:");
736  nbox.print();
737  }
738  // Since we merged, search the nearests, as some might now be mergeable.
739  for (int dir = 0; dir < BND_COUNT; ++dir) {
740  if (nearests[dir] == nullptr) continue;
741  nbox = nearests[dir]->bounding_box();
742  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
743  max_dist, &x_gap, &y_gap)) {
744  // Close enough to call overlapping. Merge boxes.
745  *bbox += nbox;
746  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
747  if (debug) {
748  tprintf("Added:");
749  nbox.print();
750  }
751  nearests[dir] = nullptr;
     // Setting dir to -1 makes the loop increment bring it back to 0, so
     // all remaining nearests are re-tested against the enlarged box.
752  dir = -1; // Restart the search.
753  }
754  }
     // NOTE(review): the two branches below classify a rejected blob by which
     // gap is negative and smaller; presumably a negative gap means the boxes
     // overlap along that axis -- confirm against TBOX::x_gap / y_gap.
755  } else if (x_gap < 0 && x_gap <= y_gap) {
756  // A vertical neighbour. Record the nearest.
757  BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
758  if (nearests[dir] == nullptr ||
759  y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
760  nearests[dir] = neighbour;
761  }
762  } else if (y_gap < 0 && y_gap <= x_gap) {
763  // A horizontal neighbour. Record the nearest.
764  BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
765  if (nearests[dir] == nullptr ||
766  x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
767  nearests[dir] = neighbour;
768  }
769  }
770  // If all nearests are non-null, then we have finished.
771  if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
772  nearests[BND_ABOVE] && nearests[BND_BELOW])
773  break;
774  }
775  // Final overlap with a nearest is not allowed.
776  for (auto & nearest : nearests) {
777  if (nearest == nullptr) continue;
778  const TBOX& nbox = nearest->bounding_box();
779  if (debug) {
780  tprintf("Testing for overlap with:");
781  nbox.print();
782  }
783  if (bbox->overlap(nbox)) {
     // All candidates are discarded, but *bbox keeps whatever it was grown to;
     // callers must check blobs for emptiness before trusting *bbox.
784  blobs->shallow_clear();
785  if (debug)
786  tprintf("Final box overlaps nearest\n");
787  return;
788  }
789  }
790 }
791 
792 // For each blob in this grid, Finds the textline direction to be horizontal
793 // or vertical according to distance to neighbours and 1st and 2nd order
794 // neighbours. Non-text tends to end up without a definite direction.
795 // Result is setting of the neighbours and vert_possible/horz_possible
796 // flags in the BLOBNBOXes currently in this grid.
797 // This function is called more than once if page orientation is uncertain,
798 // so display_if_debugging is true on the final call to display the results.
799 void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode,
800  bool display_if_debugging) {
801  BlobGridSearch gsearch(this);
802  BLOBNBOX* bbox;
803  // For every bbox in the grid, set its neighbours.
804  gsearch.StartFullSearch();
805  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
806  SetNeighbours(false, display_if_debugging, bbox);
807  }
808  // Where vertical or horizontal wins by a big margin, clarify it.
809  gsearch.StartFullSearch();
810  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
811  SimplifyObviousNeighbours(bbox);
812  }
813  // Now try to make the blobs only vertical or horizontal using neighbours.
814  gsearch.StartFullSearch();
815  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
816  if (FindingVerticalOnly(pageseg_mode)) {
817  bbox->set_vert_possible(true);
818  bbox->set_horz_possible(false);
819  } else if (FindingHorizontalOnly(pageseg_mode)) {
820  bbox->set_vert_possible(false);
821  bbox->set_horz_possible(true);
822  } else {
823  SetNeighbourFlows(bbox);
824  }
825  }
826  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
827  textord_tabfind_show_strokewidths > 1) {
828  initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
829  }
830  // Improve flow direction with neighbours.
831  gsearch.StartFullSearch();
832  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
833  SmoothNeighbourTypes(pageseg_mode, false, bbox);
834  }
835  // Now allow reset of firm values to fix renegades.
836  gsearch.StartFullSearch();
837  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
838  SmoothNeighbourTypes(pageseg_mode, true, bbox);
839  }
840  // Repeat.
841  gsearch.StartFullSearch();
842  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
843  SmoothNeighbourTypes(pageseg_mode, true, bbox);
844  }
845  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
846  textord_tabfind_show_strokewidths > 1) {
847  widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
848  }
849 }
850 
851 // Sets the neighbours and good_stroke_neighbours members of the blob by
852 // searching close on all 4 sides.
853 // When finding leader dots/dashes, there is a slightly different rule for
854 // what makes a good neighbour.
855 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
856  BLOBNBOX* blob) {
857  int line_trap_count = 0;
858  for (int dir = 0; dir < BND_COUNT; ++dir) {
859  auto bnd = static_cast<BlobNeighbourDir>(dir);
860  line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
861  }
862  if (line_trap_count > 0 && activate_line_trap) {
863  // It looks like a line so isolate it by clearing its neighbours.
864  blob->ClearNeighbours();
865  const TBOX& box = blob->bounding_box();
866  blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
867  }
868 }
869 
870 
871 // Sets the good_stroke_neighbours member of the blob if it has a
872 // GoodNeighbour on the given side.
873 // Also sets the neighbour in the blob, whether or not a good one is found.
874 // Returns the number of blobs in the nearby search area that would lead us to
875 // believe that this blob is a line separator.
876 // Leaders get extra special lenient treatment.
877 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
878  BLOBNBOX* blob) {
879  // Search for neighbours that overlap vertically.
880  TBOX blob_box = blob->bounding_box();
881  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
882  blob_box.bottom());
883  if (debug) {
884  tprintf("FGN in dir %d for blob:", dir);
885  blob_box.print();
886  }
887  int top = blob_box.top();
888  int bottom = blob_box.bottom();
889  int left = blob_box.left();
890  int right = blob_box.right();
891  int width = right - left;
892  int height = top - bottom;
893 
894  // A trap to detect lines tests for the min dimension of neighbours
895  // being larger than a multiple of the min dimension of the line
896  // and the larger dimension being smaller than a fraction of the max
897  // dimension of the line.
898  int line_trap_max = std::max(width, height) / kLineTrapLongest;
899  int line_trap_min = std::min(width, height) * kLineTrapShortest;
900  int line_trap_count = 0;
901 
902  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
903  ? height / 2 : width / 2;
904  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
905  ? height / 3 : width / 3;
906  if (leaders)
907  min_good_overlap = min_decent_overlap = 1;
908 
909  int search_pad = static_cast<int>(
910  sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
911  if (gridsize() > search_pad)
912  search_pad = gridsize();
913  TBOX search_box = blob_box;
914  // Pad the search in the appropriate direction.
915  switch (dir) {
916  case BND_LEFT:
917  search_box.set_left(search_box.left() - search_pad);
918  break;
919  case BND_RIGHT:
920  search_box.set_right(search_box.right() + search_pad);
921  break;
922  case BND_BELOW:
923  search_box.set_bottom(search_box.bottom() - search_pad);
924  break;
925  case BND_ABOVE:
926  search_box.set_top(search_box.top() + search_pad);
927  break;
928  case BND_COUNT:
929  return 0;
930  }
931 
932  BlobGridSearch rectsearch(this);
933  rectsearch.StartRectSearch(search_box);
934  BLOBNBOX* best_neighbour = nullptr;
935  double best_goodness = 0.0;
936  bool best_is_good = false;
937  BLOBNBOX* neighbour;
938  while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {
939  TBOX nbox = neighbour->bounding_box();
940  if (neighbour == blob)
941  continue;
942  int mid_x = (nbox.left() + nbox.right()) / 2;
943  if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
944  continue; // In a different column.
945  if (debug) {
946  tprintf("Neighbour at:");
947  nbox.print();
948  }
949 
950  // Last-minute line detector. There is a small upper limit to the line
951  // width accepted by the morphological line detector.
952  int n_width = nbox.width();
953  int n_height = nbox.height();
954  if (std::min(n_width, n_height) > line_trap_min &&
955  std::max(n_width, n_height) < line_trap_max)
956  ++line_trap_count;
957  // Heavily joined text, such as Arabic may have very different sizes when
958  // looking at the maxes, but the heights may be almost identical, so check
959  // for a difference in height if looking sideways or width vertically.
960  if (TabFind::VeryDifferentSizes(std::max(n_width, n_height),
961  std::max(width, height)) &&
962  (((dir == BND_LEFT || dir ==BND_RIGHT) &&
963  TabFind::DifferentSizes(n_height, height)) ||
964  ((dir == BND_BELOW || dir ==BND_ABOVE) &&
965  TabFind::DifferentSizes(n_width, width)))) {
966  if (debug) tprintf("Bad size\n");
967  continue; // Could be a different font size or non-text.
968  }
969  // Amount of vertical overlap between the blobs.
970  int overlap;
971  // If the overlap is along the short side of the neighbour, and it
972  // is fully overlapped, then perp_overlap holds the length of the long
973  // side of the neighbour. A measure to include hyphens and dashes as
974  // legitimate neighbours.
975  int perp_overlap;
976  int gap;
977  if (dir == BND_LEFT || dir == BND_RIGHT) {
978  overlap = std::min(static_cast<int>(nbox.top()), top) - std::max(static_cast<int>(nbox.bottom()), bottom);
979  if (overlap == nbox.height() && nbox.width() > nbox.height())
980  perp_overlap = nbox.width();
981  else
982  perp_overlap = overlap;
983  gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
984  if (gap <= 0) {
985  if (debug) tprintf("On wrong side\n");
986  continue; // On the wrong side.
987  }
988  gap -= n_width;
989  } else {
990  overlap = std::min(static_cast<int>(nbox.right()), right) - std::max(static_cast<int>(nbox.left()), left);
991  if (overlap == nbox.width() && nbox.height() > nbox.width())
992  perp_overlap = nbox.height();
993  else
994  perp_overlap = overlap;
995  gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
996  if (gap <= 0) {
997  if (debug) tprintf("On wrong side\n");
998  continue; // On the wrong side.
999  }
1000  gap -= n_height;
1001  }
1002  if (-gap > overlap) {
1003  if (debug) tprintf("Overlaps wrong way\n");
1004  continue; // Overlaps the wrong way.
1005  }
1006  if (perp_overlap < min_decent_overlap) {
1007  if (debug) tprintf("Doesn't overlap enough\n");
1008  continue; // Doesn't overlap enough.
1009  }
1010  bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1011  TabFind::DifferentSizes(width, n_width);
1012  bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1013  blob->MatchingStrokeWidth(*neighbour,
1016  // Best is a fuzzy combination of gap, overlap and is good.
1017  // Basically if you make one thing twice as good without making
1018  // anything else twice as bad, then it is better.
1019  if (gap < 1) gap = 1;
1020  double goodness = (1.0 + is_good) * overlap / gap;
1021  if (debug) {
1022  tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1023  goodness, best_goodness, is_good, overlap, gap);
1024  }
1025  if (goodness > best_goodness) {
1026  best_neighbour = neighbour;
1027  best_goodness = goodness;
1028  best_is_good = is_good;
1029  }
1030  }
1031  blob->set_neighbour(dir, best_neighbour, best_is_good);
1032  return line_trap_count;
1033 }
1034 
1035 // Helper to get a list of 1st-order neighbours.
1036 static void ListNeighbours(const BLOBNBOX* blob,
1037  BLOBNBOX_CLIST* neighbours) {
1038  for (int dir = 0; dir < BND_COUNT; ++dir) {
1039  auto bnd = static_cast<BlobNeighbourDir>(dir);
1040  BLOBNBOX* neighbour = blob->neighbour(bnd);
1041  if (neighbour != nullptr) {
1042  neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1043  }
1044  }
1045 }
1046 
1047 // Helper to get a list of 1st and 2nd order neighbours.
1048 static void List2ndNeighbours(const BLOBNBOX* blob,
1049  BLOBNBOX_CLIST* neighbours) {
1050  ListNeighbours(blob, neighbours);
1051  for (int dir = 0; dir < BND_COUNT; ++dir) {
1052  auto bnd = static_cast<BlobNeighbourDir>(dir);
1053  BLOBNBOX* neighbour = blob->neighbour(bnd);
1054  if (neighbour != nullptr) {
1055  ListNeighbours(neighbour, neighbours);
1056  }
1057  }
1058 }
1059 
1060 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
1061 static void List3rdNeighbours(const BLOBNBOX* blob,
1062  BLOBNBOX_CLIST* neighbours) {
1063  List2ndNeighbours(blob, neighbours);
1064  for (int dir = 0; dir < BND_COUNT; ++dir) {
1065  auto bnd = static_cast<BlobNeighbourDir>(dir);
1066  BLOBNBOX* neighbour = blob->neighbour(bnd);
1067  if (neighbour != nullptr) {
1068  List2ndNeighbours(neighbour, neighbours);
1069  }
1070  }
1071 }
1072 
1073 // Helper to count the evidence for verticalness or horizontalness
1074 // in a list of neighbours.
1075 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1076  int* pure_h_count, int* pure_v_count) {
1077  if (neighbours->length() <= kMostlyOneDirRatio)
1078  return;
1079  BLOBNBOX_C_IT it(neighbours);
1080  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1081  BLOBNBOX* blob = it.data();
1082  int h_min, h_max, v_min, v_max;
1083  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1084  if (debug)
1085  tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1086  if (h_max < v_min ||
1087  blob->leader_on_left() || blob->leader_on_right()) {
1088  // Horizontal gaps are clear winners. Count a pure horizontal.
1089  ++*pure_h_count;
1090  if (debug) tprintf("Horz at:");
1091  } else if (v_max < h_min) {
1092  // Vertical gaps are clear winners. Clear a pure vertical.
1093  ++*pure_v_count;
1094  if (debug) tprintf("Vert at:");
1095  } else {
1096  if (debug) tprintf("Neither at:");
1097  }
1098  if (debug)
1099  blob->bounding_box().print();
1100  }
1101 }
1102 
1103 // Makes the blob to be only horizontal or vertical where evidence
1104 // is clear based on gaps of 2nd order neighbours, or definite individual
1105 // blobs.
1106 void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1107  if (blob->DefiniteIndividualFlow())
1108  return;
1109  bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1110  blob->bounding_box().bottom());
1111  if (debug) {
1112  tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1113  blob->flow(), blob->region_type());
1114  blob->bounding_box().print();
1115  }
1116  BLOBNBOX_CLIST neighbours;
1117  List3rdNeighbours(blob, &neighbours);
1118  // The number of pure horizontal and vertical neighbours.
1119  int pure_h_count = 0;
1120  int pure_v_count = 0;
1121  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1122  if (debug) {
1123  HandleClick(blob->bounding_box().left() + 1,
1124  blob->bounding_box().bottom() + 1);
1125  tprintf("SetFlows: h_count=%d, v_count=%d\n",
1126  pure_h_count, pure_v_count);
1127  }
1128  if (!neighbours.empty()) {
1129  blob->set_vert_possible(true);
1130  blob->set_horz_possible(true);
1131  if (pure_h_count > 2 * pure_v_count) {
1132  // Horizontal gaps are clear winners. Clear vertical neighbours.
1133  blob->set_vert_possible(false);
1134  } else if (pure_v_count > 2 * pure_h_count) {
1135  // Vertical gaps are clear winners. Clear horizontal neighbours.
1136  blob->set_horz_possible(false);
1137  }
1138  } else {
1139  // Lonely blob. Can't tell its flow direction.
1140  blob->set_vert_possible(false);
1141  blob->set_horz_possible(false);
1142  }
1143 }
1144 
1145 
1146 // Helper to count the number of horizontal and vertical blobs in a list.
1147 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1148  int* pure_h_count, int* pure_v_count) {
1149  BLOBNBOX_C_IT it(neighbours);
1150  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1151  BLOBNBOX* blob = it.data();
1152  if (blob->UniquelyHorizontal())
1153  ++*pure_h_count;
1154  if (blob->UniquelyVertical())
1155  ++*pure_v_count;
1156  }
1157 }
1158 
1159 // Nullify the neighbours in the wrong directions where the direction
1160 // is clear-cut based on a distance margin. Good for isolating vertical
1161 // text from neighbouring horizontal text.
1162 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
1163  // Case 1: We have text that is likely several characters, blurry and joined
1164  // together.
1165  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1166  blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1167  // The blob is complex (not stick-like).
1168  if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1169  // Horizontal conjoined text.
1170  blob->set_neighbour(BND_ABOVE, nullptr, false);
1171  blob->set_neighbour(BND_BELOW, nullptr, false);
1172  return;
1173  }
1174  if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1175  // Vertical conjoined text.
1176  blob->set_neighbour(BND_LEFT, nullptr, false);
1177  blob->set_neighbour(BND_RIGHT, nullptr, false);
1178  return;
1179  }
1180  }
1181 
1182  // Case 2: This blob is likely a single character.
1183  int margin = gridsize() / 2;
1184  int h_min, h_max, v_min, v_max;
1185  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1186  if ((h_max + margin < v_min && h_max < margin / 2) ||
1187  blob->leader_on_left() || blob->leader_on_right()) {
1188  // Horizontal gaps are clear winners. Clear vertical neighbours.
1189  blob->set_neighbour(BND_ABOVE, nullptr, false);
1190  blob->set_neighbour(BND_BELOW, nullptr, false);
1191  } else if (v_max + margin < h_min && v_max < margin / 2) {
1192  // Vertical gaps are clear winners. Clear horizontal neighbours.
1193  blob->set_neighbour(BND_LEFT, nullptr, false);
1194  blob->set_neighbour(BND_RIGHT, nullptr, false);
1195  }
1196 }
1197 
1198 // Smoothes the vertical/horizontal type of the blob based on the
1199 // 2nd-order neighbours. If reset_all is true, then all blobs are
1200 // changed. Otherwise, only ambiguous blobs are processed.
1201 void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all,
1202  BLOBNBOX* blob) {
1203  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1204  // There are both horizontal and vertical so try to fix it.
1205  BLOBNBOX_CLIST neighbours;
1206  List2ndNeighbours(blob, &neighbours);
1207  // The number of pure horizontal and vertical neighbours.
1208  int pure_h_count = 0;
1209  int pure_v_count = 0;
1210  CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1212  blob->bounding_box().bottom())) {
1213  HandleClick(blob->bounding_box().left() + 1,
1214  blob->bounding_box().bottom() + 1);
1215  tprintf("pure_h=%d, pure_v=%d\n",
1216  pure_h_count, pure_v_count);
1217  }
1218  if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1219  // Horizontal gaps are clear winners. Clear vertical neighbours.
1220  blob->set_vert_possible(false);
1221  blob->set_horz_possible(true);
1222  } else if (pure_v_count > pure_h_count &&
1223  !FindingHorizontalOnly(pageseg_mode)) {
1224  // Vertical gaps are clear winners. Clear horizontal neighbours.
1225  blob->set_horz_possible(false);
1226  blob->set_vert_possible(true);
1227  }
1228  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1229  blob->bounding_box().bottom())) {
1230  HandleClick(blob->bounding_box().left() + 1,
1231  blob->bounding_box().bottom() + 1);
1232  tprintf("Clean on pass 3!\n");
1233  }
1234 }
1235 
1236 // Partition creation. Accumulates vertical and horizontal text chains,
1237 // puts the remaining blobs in as unknowns, and then merges/splits to
1238 // minimize overlap and smoothes the types with neighbours and the color
1239 // image if provided. rerotation is used to rotate the coordinate space
1240 // back to the nontext_map_ image.
1241 // If find_problems is true, detects possible noise pollution by the amount
1242 // of partition overlap that is created by the diacritics. If excessive, the
1243 // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1244 // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1245 // the components, saves the skew_angle and returns PFR_SKEW.] If the return
1246 // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1247 // called again after cleaning up the partly done work.
1248 PartitionFindResult StrokeWidth::FindInitialPartitions(
1249  PageSegMode pageseg_mode, const FCOORD& rerotation, bool find_problems,
1250  TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1251  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1252  FCOORD* skew_angle) {
1253  if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1254  if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1255  if (textord_tabfind_show_strokewidths) {
1256  chains_win_ = MakeWindow(0, 400, "Initial text chains");
1257  part_grid->DisplayBoxes(chains_win_);
1258  projection_->DisplayProjection();
1259  }
1260  if (find_problems) {
1261  // TODO(rays) Do something to find skew, set skew_angle and return if there
1262  // is some.
1263  }
1264  part_grid->SplitOverlappingPartitions(big_parts);
1265  EasyMerges(part_grid);
1266  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1267  TBOX grid_box(bleft(), tright());
1268  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1269  rerotation));
1270  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1271  grid_box, rerotation));
1272  int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);
1273  TestDiacritics(part_grid, block);
1274  MergeDiacritics(block, part_grid);
1275  if (find_problems && diacritic_blobs != nullptr &&
1276  DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1277  diacritic_blobs)) {
1278  return PFR_NOISE;
1279  }
1280  if (textord_tabfind_show_strokewidths) {
1281  textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1282  part_grid->DisplayBoxes(textlines_win_);
1283  diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1284  }
1285  PartitionRemainingBlobs(pageseg_mode, part_grid);
1286  part_grid->SplitOverlappingPartitions(big_parts);
1287  EasyMerges(part_grid);
1288  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1289  rerotation));
1290  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1291  grid_box, rerotation));
1292  // Now eliminate strong stuff in a sea of the opposite.
1293  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1294  grid_box, rerotation));
1295  if (textord_tabfind_show_strokewidths) {
1296  smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1297  part_grid->DisplayBoxes(smoothed_win_);
1298  }
1299  return PFR_OK;
1300 }
1301 
1302 // Detects noise by a significant increase in partition overlap from
1303 // pre_overlap to now, and removes noise from the union of all the overlapping
1304 // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1305 // was found and removed.
1306 bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
1307  TO_BLOCK* block,
1308  ColPartitionGrid* part_grid,
1309  BLOBNBOX_LIST* diacritic_blobs) {
1310  ColPartitionGrid* noise_grid = nullptr;
1311  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1312  if (pre_overlap == 0) pre_overlap = 1;
1313  BLOBNBOX_IT diacritic_it(diacritic_blobs);
1314  if (noise_grid != nullptr) {
1315  if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1316  post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1317  // This is noisy enough to fix.
1318  if (textord_tabfind_show_strokewidths) {
1319  ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
1320  noise_grid->DisplayBoxes(noise_win);
1321  }
1322  part_grid->DeleteNonLeaderParts();
1323  BLOBNBOX_IT blob_it(&block->noise_blobs);
1324  ColPartitionGridSearch rsearch(noise_grid);
1325  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1326  BLOBNBOX* blob = blob_it.data();
1327  blob->ClearNeighbours();
1328  if (!blob->IsDiacritic() || blob->owner() != nullptr)
1329  continue; // Not a noise candidate.
1330  TBOX blob_box(blob->bounding_box());
1331  TBOX search_box(blob->bounding_box());
1332  search_box.pad(gridsize(), gridsize());
1333  rsearch.StartRectSearch(search_box);
1334  ColPartition* part = rsearch.NextRectSearch();
1335  if (part != nullptr) {
1336  // Consider blob as possible noise.
1337  blob->set_owns_cblob(true);
1338  blob->compute_bounding_box();
1339  diacritic_it.add_after_then_move(blob_it.extract());
1340  }
1341  }
1342  noise_grid->DeleteParts();
1343  delete noise_grid;
1344  return true;
1345  }
1346  noise_grid->DeleteParts();
1347  delete noise_grid;
1348  }
1349  return false;
1350 }
1351 
1352 // Helper verifies that blob's neighbour in direction dir is good to add to a
1353 // vertical text chain by returning the neighbour if it is not null, not owned,
1354 // and not uniquely horizontal, as well as its neighbour in the opposite
1355 // direction is blob.
1356 static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1357  BlobNeighbourDir dir) {
1358  BLOBNBOX* next_blob = blob->neighbour(dir);
1359  if (next_blob == nullptr || next_blob->owner() != nullptr ||
1360  next_blob->UniquelyHorizontal())
1361  return nullptr;
1362  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1363  return next_blob;
1364  return nullptr;
1365 }
1366 
1367 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
1368 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1369  // A PageSegMode that forces vertical textlines with the current rotation.
1370  PageSegMode pageseg_mode =
1371  rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1372  BlobGridSearch gsearch(this);
1373  BLOBNBOX* bbox;
1374  gsearch.StartFullSearch();
1375  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1376  // Only process boxes that have no horizontal hope and have not yet
1377  // been included in a chain.
1378  BLOBNBOX* blob;
1379  if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&
1380  (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {
1381  // Put all the linked blobs into a ColPartition.
1382  ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1383  part->AddBox(bbox);
1384  while (blob != nullptr) {
1385  part->AddBox(blob);
1386  blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1387  }
1388  blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1389  while (blob != nullptr) {
1390  part->AddBox(blob);
1391  blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1392  }
1393  CompletePartition(pageseg_mode, part, part_grid);
1394  }
1395  }
1396 }
1397 
1398 // Helper verifies that blob's neighbour in direction dir is good to add to a
1399 // horizontal text chain by returning the neighbour if it is not null, not
1400 // owned, and not uniquely vertical, as well as its neighbour in the opposite
1401 // direction is blob.
1402 static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1403  BlobNeighbourDir dir) {
1404  BLOBNBOX* next_blob = blob->neighbour(dir);
1405  if (next_blob == nullptr || next_blob->owner() != nullptr ||
1406  next_blob->UniquelyVertical())
1407  return nullptr;
1408  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1409  return next_blob;
1410  return nullptr;
1411 }
1412 
1413 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1414 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1415  // A PageSegMode that forces horizontal textlines with the current rotation.
1416  PageSegMode pageseg_mode =
1417  rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1418  BlobGridSearch gsearch(this);
1419  BLOBNBOX* bbox;
1420  gsearch.StartFullSearch();
1421  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1422  BLOBNBOX* blob;
1423  if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&
1424  (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {
1425  // Put all the linked blobs into a ColPartition.
1426  ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1427  part->AddBox(bbox);
1428  while (blob != nullptr) {
1429  part->AddBox(blob);
1430  blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1431  }
1432  blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1433  while (blob != nullptr) {
1434  part->AddBox(blob);
1435  blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1436  }
1437  CompletePartition(pageseg_mode, part, part_grid);
1438  }
1439  }
1440 }
1441 
1442 // Finds diacritics and saves their base character in the blob.
1443 // The objective is to move all diacritics to the noise_blobs list, so
1444 // they don't mess up early textline finding/merging, or force splits
1445 // on textlines that overlap a bit. Blobs that become diacritics must be
1446 // either part of no ColPartition (nullptr owner) or in a small partition in
1447 // which ALL the blobs are diacritics, in which case the partition is
1448 // exploded (deleted) back to its blobs.
1449 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1450  BlobGrid small_grid(gridsize(), bleft(), tright());
1451  small_grid.InsertBlobList(&block->noise_blobs);
1452  small_grid.InsertBlobList(&block->blobs);
1453  int medium_diacritics = 0;
1454  int small_diacritics = 0;
1455  BLOBNBOX_IT small_it(&block->noise_blobs);
1456  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1457  BLOBNBOX* blob = small_it.data();
1458  if (blob->owner() == nullptr && !blob->IsDiacritic() &&
1459  DiacriticBlob(&small_grid, blob)) {
1460  ++small_diacritics;
1461  }
1462  }
1463  BLOBNBOX_IT blob_it(&block->blobs);
1464  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1465  BLOBNBOX* blob = blob_it.data();
1466  if (blob->IsDiacritic()) {
1467  small_it.add_to_end(blob_it.extract());
1468  continue; // Already a diacritic.
1469  }
1470  ColPartition* part = blob->owner();
1471  if (part == nullptr && DiacriticBlob(&small_grid, blob)) {
1472  ++medium_diacritics;
1473  RemoveBBox(blob);
1474  small_it.add_to_end(blob_it.extract());
1475  } else if (part != nullptr && !part->block_owned() &&
1476  part->boxes_count() < 3) {
1477  // We allow blobs in small partitions to become diacritics if ALL the
1478  // blobs in the partition qualify as we can then cleanly delete the
1479  // partition, turn all the blobs in it to diacritics and they can be
1480  // merged into the base character partition more easily than merging
1481  // the partitions.
1482  BLOBNBOX_C_IT box_it(part->boxes());
1483  for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1484  DiacriticBlob(&small_grid, box_it.data());
1485  box_it.forward());
1486  if (box_it.cycled_list()) {
1487  // They are all good.
1488  while (!box_it.empty()) {
1489  // Liberate the blob from its partition so it can be treated
1490  // as a diacritic and merged explicitly with the base part.
1491  // The blob is really owned by the block. The partition "owner"
1492  // is nulled to allow the blob to get merged with its base character
1493  // partition.
1494  BLOBNBOX* box = box_it.extract();
1495  box->set_owner(nullptr);
1496  box_it.forward();
1497  ++medium_diacritics;
1498  // We remove the blob from the grid so it isn't found by subsequent
1499  // searches where we might not want to include diacritics.
1500  RemoveBBox(box);
1501  }
1502  // We only move the one blob to the small list here, but the others
1503  // all get moved by the test at the top of the loop.
1504  small_it.add_to_end(blob_it.extract());
1505  part_grid->RemoveBBox(part);
1506  delete part;
1507  }
1508  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1509  blob->bounding_box().bottom())) {
1510  tprintf("Blob not available to be a diacritic at:");
1511  blob->bounding_box().print();
1512  }
1513  }
1514  if (textord_tabfind_show_strokewidths) {
1515  tprintf("Found %d small diacritics, %d medium\n",
1516  small_diacritics, medium_diacritics);
1517  }
1518 }
1519 
1520 // Searches this grid for an appropriately close and sized neighbour of the
1521 // given [small] blob. If such a blob is found, the diacritic base is saved
1522 // in the blob and true is returned.
1523 // The small_grid is a secondary grid that contains the small/noise objects
1524 // that are not in this grid, but may be useful for determining a connection
1525 // between blob and its potential base character. (See DiacriticXGapFilled.)
1526 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1527  if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
1528  blob->region_type() == BRT_VERT_TEXT)
1529  return false;
1530  TBOX small_box(blob->bounding_box());
1531  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1532  small_box.bottom());
1533  if (debug) {
1534  tprintf("Testing blob for diacriticness at:");
1535  small_box.print();
1536  }
1537  int x = (small_box.left() + small_box.right()) / 2;
1538  int y = (small_box.bottom() + small_box.top()) / 2;
1539  int grid_x, grid_y;
1540  GridCoords(x, y, &grid_x, &grid_y);
1541  int height = small_box.height();
1542  // Setup a rectangle search to find its nearest base-character neighbour.
1543  // We keep 2 different best candidates:
1544  // best_x_overlap is a category of base characters that have an overlap in x
1545  // (like a acute) in which we look for the least y-gap, computed using the
1546  // projection to favor base characters in the same textline.
1547  // best_y_overlap is a category of base characters that have no x overlap,
1548  // (nominally a y-overlap is preferrecd but not essential) in which we
1549  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1550  // a lower weight to catch quotes at the end of a textline.
1551  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1552  // character to the FARTHEST side of the diacritic to allow small diacritics
1553  // to be a reasonable distance away, but not big diacritics.
1554  BLOBNBOX* best_x_overlap = nullptr;
1555  BLOBNBOX* best_y_overlap = nullptr;
1556  int best_total_dist = 0;
1557  int best_y_gap = 0;
1558  TBOX best_xbox;
1559  // TODO(rays) the search box could be setup using the projection as a guide.
1560  TBOX search_box(small_box);
1561  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1562  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1563  search_box.pad(x_pad, y_pad);
1564  BlobGridSearch rsearch(this);
1565  rsearch.SetUniqueMode(true);
1566  int min_height = height * kMinDiacriticSizeRatio;
1567  rsearch.StartRectSearch(search_box);
1568  BLOBNBOX* neighbour;
1569  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1570  if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1571  neighbour == blob || neighbour->owner() == blob->owner())
1572  continue;
1573  TBOX nbox = neighbour->bounding_box();
1574  if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||
1575  (neighbour->flow() != BTFT_CHAIN &&
1576  neighbour->flow() != BTFT_STRONG_CHAIN)) {
1577  if (debug) {
1578  tprintf("Neighbour not strong enough:");
1579  nbox.print();
1580  }
1581  continue; // Diacritics must be attached to strong text.
1582  }
1583  if (nbox.height() < min_height) {
1584  if (debug) {
1585  tprintf("Neighbour not big enough:");
1586  nbox.print();
1587  }
1588  continue; // Too small to be the base character.
1589  }
1590  int x_gap = small_box.x_gap(nbox);
1591  int y_gap = small_box.y_gap(nbox);
1592  int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1593  true, denorm_,
1594  debug);
1595  if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1596  x_gap, y_gap, total_distance);
1597  if (total_distance >
1598  neighbour->owner()->median_height() * kMaxDiacriticDistanceRatio) {
1599  if (debug) {
1600  tprintf("Neighbour with median size %d too far away:",
1601  neighbour->owner()->median_height());
1602  neighbour->bounding_box().print();
1603  }
1604  continue; // Diacritics must not be too distant.
1605  }
1606  if (x_gap <= 0) {
1607  if (debug) {
1608  tprintf("Computing reduced box for :");
1609  nbox.print();
1610  }
1611  int left = small_box.left() - small_box.width();
1612  int right = small_box.right() + small_box.width();
1613  nbox = neighbour->BoundsWithinLimits(left, right);
1614  y_gap = small_box.y_gap(nbox);
1615  if (best_x_overlap == nullptr || y_gap < best_y_gap) {
1616  best_x_overlap = neighbour;
1617  best_xbox = nbox;
1618  best_y_gap = y_gap;
1619  if (debug) {
1620  tprintf("New best:");
1621  nbox.print();
1622  }
1623  } else if (debug) {
1624  tprintf("Shrunken box doesn't win:");
1625  nbox.print();
1626  }
1627  } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1628  if (best_y_overlap == nullptr || total_distance < best_total_dist) {
1629  if (debug) {
1630  tprintf("New best y overlap:");
1631  nbox.print();
1632  }
1633  best_y_overlap = neighbour;
1634  best_total_dist = total_distance;
1635  } else if (debug) {
1636  tprintf("New y overlap box doesn't win:");
1637  nbox.print();
1638  }
1639  } else if (debug) {
1640  tprintf("Neighbour wrong side of a tab:");
1641  nbox.print();
1642  }
1643  }
1644  if (best_x_overlap != nullptr &&
1645  (best_y_overlap == nullptr ||
1646  best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1647  blob->set_diacritic_box(best_xbox);
1648  blob->set_base_char_blob(best_x_overlap);
1649  if (debug) {
1650  tprintf("DiacriticBlob OK! (x-overlap:");
1651  small_box.print();
1652  best_xbox.print();
1653  }
1654  return true;
1655  }
1656  if (best_y_overlap != nullptr &&
1657  DiacriticXGapFilled(small_grid, small_box,
1658  best_y_overlap->bounding_box()) &&
1659  NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1660  blob->set_diacritic_box(best_y_overlap->bounding_box());
1661  blob->set_base_char_blob(best_y_overlap);
1662  if (debug) {
1663  tprintf("DiacriticBlob OK! (y-overlap:");
1664  small_box.print();
1665  best_y_overlap->bounding_box().print();
1666  }
1667  return true;
1668  }
1669  if (debug) {
1670  tprintf("DiacriticBlob fails:");
1671  small_box.print();
1672  tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1673  if (best_y_overlap != nullptr) {
1674  tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1675  DiacriticXGapFilled(small_grid, small_box,
1676  best_y_overlap->bounding_box()),
1677  NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1678  }
1679  }
1680  return false;
1681 }
1682 
1683 // Returns true if there is no gap between the base char and the diacritic
1684 // bigger than a fraction of the height of the base char:
1685 // Eg: line end.....'
1686 // The quote is a long way from the end of the line, yet it needs to be a
1687 // diacritic. To determine that the quote is not part of an image, or
1688 // a different text block, we check for other marks in the gap between
1689 // the base char and the diacritic.
1690 // '<--Diacritic
1691 // |---------|
1692 // | |<-toobig-gap->
1693 // | Base |<ok gap>
1694 // |---------| x<-----Dot occupying gap
1695 // The grid is const really.
1696 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1697  const TBOX& diacritic_box,
1698  const TBOX& base_box) {
1699  // Since most gaps are small, use an iterative algorithm to search the gap.
1700  int max_gap = IntCastRounded(base_box.height() *
1702  TBOX occupied_box(base_box);
1703  int diacritic_gap;
1704  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1705  TBOX search_box(occupied_box);
1706  if (diacritic_box.left() > search_box.right()) {
1707  // We are looking right.
1708  search_box.set_left(search_box.right());
1709  search_box.set_right(search_box.left() + max_gap);
1710  } else {
1711  // We are looking left.
1712  search_box.set_right(search_box.left());
1713  search_box.set_left(search_box.left() - max_gap);
1714  }
1715  BlobGridSearch rsearch(grid);
1716  rsearch.StartRectSearch(search_box);
1717  BLOBNBOX* neighbour;
1718  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1719  const TBOX& nbox = neighbour->bounding_box();
1720  if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1721  if (nbox.left() < occupied_box.left())
1722  occupied_box.set_left(nbox.left());
1723  if (nbox.right() > occupied_box.right())
1724  occupied_box.set_right(nbox.right());
1725  break;
1726  }
1727  }
1728  if (neighbour == nullptr)
1729  return false; // Found a big gap.
1730  }
1731  return true; // The gap was filled.
1732 }
1733 
1734 // Merges diacritics with the ColPartition of the base character blob.
1735 void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1736  ColPartitionGrid* part_grid) {
1737  BLOBNBOX_IT small_it(&block->noise_blobs);
1738  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1739  BLOBNBOX* blob = small_it.data();
1740  if (blob->base_char_blob() != nullptr) {
1741  ColPartition* part = blob->base_char_blob()->owner();
1742  // The base character must be owned by a partition and that partition
1743  // must not be on the big_parts list (not block owned).
1744  if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&
1745  blob->IsDiacritic()) {
1746  // The partition has to be removed from the grid and reinserted
1747  // because its bounding box may change.
1748  part_grid->RemoveBBox(part);
1749  part->AddBox(blob);
1750  blob->set_region_type(part->blob_type());
1751  blob->set_flow(part->flow());
1752  blob->set_owner(part);
1753  part_grid->InsertBBox(true, true, part);
1754  }
1755  // Set all base chars to nullptr before any blobs get deleted.
1756  blob->set_base_char_blob(nullptr);
1757  }
1758  }
1759 }
1760 
1761 // Any blobs on the large_blobs list of block that are still unowned by a
1762 // ColPartition, are probably drop-cap or vertically touching so the blobs
1763 // are removed to the big_parts list and treated separately.
1764 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1765  ColPartitionGrid* part_grid,
1766  ColPartition_LIST* big_parts) {
1767  BLOBNBOX_IT large_it(&block->large_blobs);
1768  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1769  BLOBNBOX* blob = large_it.data();
1770  ColPartition* big_part = blob->owner();
1771  if (big_part == nullptr) {
1772  // Large blobs should have gone into partitions by now if they are
1773  // genuine characters, so move any unowned ones out to the big parts
1774  // list. This will include drop caps and vertically touching characters.
1775  ColPartition::MakeBigPartition(blob, big_parts);
1776  }
1777  }
1778 }
1779 
1780 // All remaining unused blobs are put in individual ColPartitions.
1781 void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode,
1782  ColPartitionGrid* part_grid) {
1783  BlobGridSearch gsearch(this);
1784  BLOBNBOX* bbox;
1785  int prev_grid_x = -1;
1786  int prev_grid_y = -1;
1787  BLOBNBOX_CLIST cell_list;
1788  BLOBNBOX_C_IT cell_it(&cell_list);
1789  bool cell_all_noise = true;
1790  gsearch.StartFullSearch();
1791  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1792  int grid_x = gsearch.GridX();
1793  int grid_y = gsearch.GridY();
1794  if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1795  // New cell. Process old cell.
1796  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1797  &cell_list);
1798  cell_it.set_to_list(&cell_list);
1799  prev_grid_x = grid_x;
1800  prev_grid_y = grid_y;
1801  cell_all_noise = true;
1802  }
1803  if (bbox->owner() == nullptr) {
1804  cell_it.add_to_end(bbox);
1805  if (bbox->flow() != BTFT_NONTEXT)
1806  cell_all_noise = false;
1807  } else {
1808  cell_all_noise = false;
1809  }
1810  }
1811  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1812  &cell_list);
1813 }
1814 
1815 // If combine, put all blobs in the cell_list into a single partition, otherwise
1816 // put each one into its own partition.
1817 void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode,
1818  bool combine,
1819  ColPartitionGrid* part_grid,
1820  BLOBNBOX_CLIST* cell_list) {
1821  if (cell_list->empty())
1822  return;
1823  BLOBNBOX_C_IT cell_it(cell_list);
1824  if (combine) {
1825  BLOBNBOX* bbox = cell_it.extract();
1826  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1827  part->AddBox(bbox);
1828  part->set_flow(bbox->flow());
1829  for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1830  part->AddBox(cell_it.extract());
1831  }
1832  CompletePartition(pageseg_mode, part, part_grid);
1833  } else {
1834  for (; !cell_it.empty(); cell_it.forward()) {
1835  BLOBNBOX* bbox = cell_it.extract();
1836  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1837  part->set_flow(bbox->flow());
1838  part->AddBox(bbox);
1839  CompletePartition(pageseg_mode, part, part_grid);
1840  }
1841  }
1842 }
1843 
1844 // Helper function to finish setting up a ColPartition and insert into
1845 // part_grid.
1846 void StrokeWidth::CompletePartition(PageSegMode pageseg_mode,
1847  ColPartition* part,
1848  ColPartitionGrid* part_grid) {
1849  part->ComputeLimits();
1850  TBOX box = part->bounding_box();
1851  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1852  box.bottom());
1853  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1854  // Override value if pageseg_mode disagrees.
1855  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1856  value = part->boxes_count() == 1 ? 0 : -2;
1857  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1858  value = part->boxes_count() == 1 ? 0 : 2;
1859  }
1860  part->SetRegionAndFlowTypesFromProjectionValue(value);
1861  part->ClaimBoxes();
1862  part_grid->InsertBBox(true, true, part);
1863 }
1864 
1865 // Merge partitions where the merge appears harmless.
1866 // As this
1867 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1868  using namespace std::placeholders; // for _1, _2
1869  part_grid->Merges(
1870  std::bind(&StrokeWidth::OrientationSearchBox, this, _1, _2),
1871  std::bind(&StrokeWidth::ConfirmEasyMerge, this, _1, _2));
1872 }
1873 
1874 // Compute a search box based on the orientation of the partition.
1875 // Returns true if a suitable box can be calculated.
1876 // Callback for EasyMerges.
1877 bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1878  if (part->IsVerticalType()) {
1879  box->set_top(box->top() + box->width());
1880  box->set_bottom(box->bottom() - box->width());
1881  } else {
1882  box->set_left(box->left() - box->height());
1883  box->set_right(box->right() + box->height());
1884  }
1885  return true;
1886 }
1887 
1888 // Merge confirmation callback for EasyMerges.
1889 bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1890  const ColPartition* p2) {
1891  ASSERT_HOST(p1 != nullptr && p2 != nullptr);
1892  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1893  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1894  (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1895  return false; // Don't merge confirmed image with text.
1896  if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1897  p1->HCoreOverlap(*p2) <= 0 &&
1898  ((!p1->IsSingleton() &&
1899  !p2->IsSingleton()) ||
1900  !p1->bounding_box().major_overlap(p2->bounding_box())))
1901  return false; // Overlap must be in the text line.
1902  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1903  p1->VCoreOverlap(*p2) <= 0 &&
1904  ((!p1->IsSingleton() &&
1905  !p2->IsSingleton()) ||
1906  (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1907  !p1->OKDiacriticMerge(*p2, false) &&
1908  !p2->OKDiacriticMerge(*p1, false))))
1909  return false; // Overlap must be in the text line.
1910  if (!p1->ConfirmNoTabViolation(*p2))
1911  return false;
1912  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1913  return true;
1914  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1915 }
1916 
1917 // Returns true if there is no significant noise in between the boxes.
1918 bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1919  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1920  nontext_map_);
1921 }
1922 
1926 ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
1927  int x, int y) {
1928  ScrollView* window = nullptr;
1929 #ifndef GRAPHICS_DISABLED
1930  window = MakeWindow(x, y, window_name);
1931  // For every blob in the grid, display it.
1932  window->Brush(ScrollView::NONE);
1933 
1934  // For every bbox in the grid, display it.
1935  BlobGridSearch gsearch(this);
1936  gsearch.StartFullSearch();
1937  BLOBNBOX* bbox;
1938  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1939  const TBOX& box = bbox->bounding_box();
1940  int left_x = box.left();
1941  int right_x = box.right();
1942  int top_y = box.top();
1943  int bottom_y = box.bottom();
1944  int goodness = bbox->GoodTextBlob();
1945  BlobRegionType blob_type = bbox->region_type();
1946  if (bbox->UniquelyVertical())
1947  blob_type = BRT_VERT_TEXT;
1948  if (bbox->UniquelyHorizontal())
1949  blob_type = BRT_TEXT;
1950  BlobTextFlowType flow = bbox->flow();
1951  if (flow == BTFT_NONE) {
1952  if (goodness == 0)
1953  flow = BTFT_NEIGHBOURS;
1954  else if (goodness == 1)
1955  flow = BTFT_CHAIN;
1956  else
1957  flow = BTFT_STRONG_CHAIN;
1958  }
1959  window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1960  window->Rectangle(left_x, bottom_y, right_x, top_y);
1961  }
1962  window->Update();
1963 #endif
1964  return window;
1965 }
1966 
1967 static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
1968 #ifndef GRAPHICS_DISABLED
1969  const TBOX& blob_box(blob->bounding_box());
1970  int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());
1971  int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());
1972  int x = (blob_box.left() + blob_box.right()) / 2;
1973  window->Line(x, top, x, bottom);
1974 #endif // GRAPHICS_DISABLED
1975 }
1976 
1977 // Displays blobs colored according to whether or not they are diacritics.
1978 ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1979  int x, int y, TO_BLOCK* block) {
1980  ScrollView* window = nullptr;
1981 #ifndef GRAPHICS_DISABLED
1982  window = MakeWindow(x, y, window_name);
1983  // For every blob in the grid, display it.
1984  window->Brush(ScrollView::NONE);
1985 
1986  BLOBNBOX_IT it(&block->blobs);
1987  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1988  BLOBNBOX* blob = it.data();
1989  if (blob->IsDiacritic()) {
1990  window->Pen(ScrollView::GREEN);
1991  DrawDiacriticJoiner(blob, window);
1992  } else {
1993  window->Pen(blob->BoxColor());
1994  }
1995  const TBOX& box = blob->bounding_box();
1996  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
1997  }
1998  it.set_to_list(&block->noise_blobs);
1999  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2000  BLOBNBOX* blob = it.data();
2001  if (blob->IsDiacritic()) {
2002  window->Pen(ScrollView::GREEN);
2003  DrawDiacriticJoiner(blob, window);
2004  } else {
2005  window->Pen(ScrollView::WHITE);
2006  }
2007  const TBOX& box = blob->bounding_box();
2008  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2009  }
2010  window->Update();
2011 #endif
2012  return window;
2013 }
2014 
2015 } // namespace tesseract.
BLOBNBOX::ClearNeighbours
void ClearNeighbours()
Definition: blobbox.h:498
TBOX
Definition: cleanapi_test.cc:19
tesseract::kNoiseOverlapAreaFactor
const double kNoiseOverlapAreaFactor
Definition: strokewidth.cpp:107
TO_BLOCK::small_blobs
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:774
ScrollView
Definition: scrollview.h:97
BlobTextFlowType
BlobTextFlowType
Definition: blobbox.h:113
INT_VAR
#define INT_VAR(name, val, comment)
Definition: params.h:300
ScrollView::Brush
void Brush(Color color)
Definition: scrollview.cpp:723
SVET_DESTROY
Definition: scrollview.h:45
C_BLOB::perimeter
int32_t perimeter()
Definition: stepblob.cpp:284
tesseract::kMostlyOneDirRatio
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:92
BND_RIGHT
Definition: blobbox.h:89
BLOBNBOX::rotate_box
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:71
BTFT_NONE
Definition: blobbox.h:114
tesseract::kNoiseOverlapGrowthFactor
const double kNoiseOverlapGrowthFactor
Definition: strokewidth.cpp:104
tesseract::TextlineProjection::EvaluateColPartition
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
Definition: textlineprojection.cpp:362
C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:69
BTFT_STRONG_CHAIN
Definition: blobbox.h:118
BLOBNBOX::set_diacritic_box
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:397
tesseract::BBGrid::InsertBBox
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
tesseract::BlobGrid::BlobGrid
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
tabfind.h
tesseract::BBGrid< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT >::MakeWindow
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
BRT_NOISE
Definition: blobbox.h:72
TO_BLOCK::noise_blobs
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:773
tesseract::kCJKAspectRatio
const double kCJKAspectRatio
Definition: strokewidth.cpp:61
tesseract::kLineTrapShortest
const int kLineTrapShortest
Definition: strokewidth.cpp:89
tesseract::StrokeWidth::GradeBlobsIntoPartitions
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Pix *nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
Definition: strokewidth.cpp:343
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
BND_BELOW
Definition: blobbox.h:88
FCOORD::set_x
void set_x(float xin)
rewrite function
Definition: points.h:213
BLOBNBOX::base_char_blob
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:401
BLOBNBOX::BoundsWithinLimits
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:331
tesseract::TextlineProjection::DisplayProjection
void DisplayProjection() const
Definition: textlineprojection.cpp:123
BLOBNBOX::base_char_top
int base_char_top() const
Definition: blobbox.h:382
blobbox.h
TBOX::overlap
bool overlap(const TBOX &box) const
Definition: rect.h:350
BRT_UNKNOWN
Definition: blobbox.h:77
FCOORD::y
float y() const
Definition: points.h:209
ICOORD
integer coordinate
Definition: points.h:30
BLOBNBOX::set_flow
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:297
BLOBNBOX::leader_on_left
bool leader_on_left() const
Definition: blobbox.h:357
BLOBNBOX::compute_bounding_box
void compute_bounding_box()
Definition: blobbox.h:239
tesseract::BBGrid< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT >::RemoveBBox
void RemoveBBox(BLOBNBOX *bbox)
Definition: bbgrid.h:533
TBOX::print
void print() const
Definition: rect.h:277
tesseract::GridSearch::StartFullSearch
void StartFullSearch()
Definition: bbgrid.h:665
tesseract::LR_LEFT
Definition: strokewidth.h:40
TO_BLOCK::DeleteUnownedNoise
void DeleteUnownedNoise()
Definition: blobbox.cpp:1020
FCOORD::x
float x() const
Definition: points.h:206
tesseract::ColPartition::median_height
int median_height() const
Definition: colpartition.h:136
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
tesseract::BBGrid< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT >::Clear
void Clear()
Definition: bbgrid.h:455
TBOX::area
int32_t area() const
Definition: rect.h:121
TO_BLOCK::blobs
BLOBNBOX_LIST blobs
Definition: blobbox.h:771
tesseract::BlobGrid
Definition: blobgrid.h:33
TO_BLOCK
Definition: blobbox.h:691
BRT_VERT_TEXT
Definition: blobbox.h:78
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
BLOBNBOX::NeighbourGaps
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:179
BLOBNBOX::right_rule
int right_rule() const
Definition: blobbox.h:318
ScrollView::NONE
Definition: scrollview.h:101
ScrollView::Pen
void Pen(Color color)
Definition: scrollview.cpp:717
colpartition.h
BLOBNBOX::good_stroke_neighbour
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:372
TBOX::major_y_overlap
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:428
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
tesseract::ColPartitionGridSearch
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:935
tesseract::GridSearch::StartRadSearch
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:698
tesseract::kSizeRatioToReject
const float kSizeRatioToReject
Definition: strokewidth.cpp:100
FCOORD
Definition: points.h:187
FCOORD::set_y
void set_y(float yin)
rewrite function
Definition: points.h:217
tesseract::TextlineProjection::PlotGradedBlobs
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
Definition: textlineprojection.cpp:84
BLOBNBOX
Definition: blobbox.h:142
BND_ABOVE
Definition: blobbox.h:90
BTFT_CHAIN
Definition: blobbox.h:117
tesseract::TextlineProjection::MoveNonTextlineBlobs
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
Definition: textlineprojection.cpp:106
BRT_HLINE
Definition: blobbox.h:73
BTFT_LEADER
Definition: blobbox.h:120
BLOBNBOX::UnMergeableType
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:429
BLOBNBOX::BoxColor
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:479
C_BLOB::area
int32_t area()
Definition: stepblob.cpp:266
tesseract::ColPartition
Definition: colpartition.h:67
tesseract::kDiacriticYPadRatio
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:73
BLOBNBOX::horz_stroke_width
float horz_stroke_width() const
Definition: blobbox.h:336
tesseract::GridBase::tright
const ICOORD & tright() const
Definition: bbgrid.h:75
TBOX::height
int16_t height() const
Definition: rect.h:107
BLOBNBOX::TextlineColor
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:442
TBOX::y_overlap
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
TBOX::y_gap
int y_gap(const TBOX &box) const
Definition: rect.h:232
BTFT_NONTEXT
Definition: blobbox.h:115
BLOBNBOX::GoodTextBlob
int GoodTextBlob() const
Definition: blobbox.cpp:224
statistc.h
tesseract::GridSearch::NextRadSearch
BBC * NextRadSearch()
Definition: bbgrid.h:713
strokewidth.h
tesseract::kDiacriticXPadRatio
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:70
BLOBNBOX::neighbour
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:369
TBOX::set_right
void set_right(int x)
Definition: rect.h:81
tesseract::PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:166
textlineprojection.h
BLOBNBOX::set_base_char_blob
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:404
BLOBNBOX::set_owns_cblob
void set_owns_cblob(bool value)
Definition: blobbox.h:407
BLOBNBOX::base_char_bottom
int base_char_bottom() const
Definition: blobbox.h:385
BLOBNBOX::leader_on_right
bool leader_on_right() const
Definition: blobbox.h:363
tesseract::PSM_SINGLE_COLUMN
Assume a single column of text of variable sizes.
Definition: publictypes.h:165
TO_BLOCK::large_blobs
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:775
tesseract::kLineResidueAspectRatio
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:94
BlobRegionType
BlobRegionType
Definition: blobbox.h:71
tesseract::kCJKRadius
const int kCJKRadius
Definition: strokewidth.cpp:55
tesseract::LR_RIGHT
Definition: strokewidth.h:41
BLOBNBOX::MatchingStrokeWidth
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:303
tesseract::ColPartition::MakeBigPartition
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
Definition: colpartition.cpp:116
tesseract::AlignedBlob::WithinTestRegion
static bool WithinTestRegion(int detail_level, int x, int y)
Definition: alignedblob.cpp:150
tesseract::kStrokeWidthFractionCJK
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:51
tesseract::TextlineProjection::DistanceOfBoxFromBox
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
Definition: textlineprojection.cpp:197
BRT_TEXT
Definition: blobbox.h:79
tesseract::PFR_OK
Definition: strokewidth.h:47
BLOBNBOX::joined_to_prev
bool joined_to_prev() const
Definition: blobbox.h:255
BLOBNBOX::set_leader_on_right
void set_leader_on_right(bool flag)
Definition: blobbox.h:366
tesseract::StrokeWidth::FindTextlineDirectionAndFixBrokenCJK
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
Definition: strokewidth.cpp:156
tesseract::kCJKBrokenDistanceFraction
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:57
BLOBNBOX::vert_possible
bool vert_possible() const
Definition: blobbox.h:300
TBOX::width
int16_t width() const
Definition: rect.h:114
BOOL_VAR
#define BOOL_VAR(name, val, comment)
Definition: params.h:303
BLOBNBOX::set_leader_on_left
void set_leader_on_left(bool flag)
Definition: blobbox.h:360
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::kLineResidueSizeRatio
const double kLineResidueSizeRatio
Definition: strokewidth.cpp:98
BLOBNBOX::set_owner
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:354
tesseract::TabFind::DifferentSizes
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
BLOBNBOX::IsDiacritic
bool IsDiacritic() const
Definition: blobbox.h:379
tesseract::TextlineProjection::ConstructProjection
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Pix *nontext_map)
Definition: textlineprojection.cpp:64
ScrollView::WHITE
Definition: scrollview.h:103
tesseract::kLineTrapLongest
const int kLineTrapLongest
Definition: strokewidth.cpp:87
BLOBNBOX::DefiniteIndividualFlow
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:250
tesseract::PFR_NOISE
Definition: strokewidth.h:49
linlsq.h
tesseract::StrokeWidth::CorrectForRotation
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
Definition: strokewidth.cpp:250
tesseract::kCJKAspectRatioIncrease
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:63
tesseract::GridSearch
Definition: bbgrid.h:48
BLOBNBOX::set_neighbour
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:375
tesseract
Definition: baseapi.h:65
BLOBNBOX::set_region_type
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:285
tesseract::TabFind::VeryDifferentSizes
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:413
BND_LEFT
Definition: blobbox.h:87
ScrollView::RED
Definition: scrollview.h:104
STATS
Definition: statistc.h:30
BLOBNBOX::bounding_box
const TBOX & bounding_box() const
Definition: blobbox.h:229
BLOBNBOX::UniquelyVertical
bool UniquelyVertical() const
Definition: blobbox.h:409
tesseract::kNeighbourSearchFactor
const double kNeighbourSearchFactor
Definition: strokewidth.cpp:102
tesseract::ImageFind::BlankImageInBetween
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:576
tesseract::StrokeWidth::TestVerticalTextDirection
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
Definition: strokewidth.cpp:208
tesseract::kStrokeWidthCJK
const double kStrokeWidthCJK
Definition: strokewidth.cpp:52
tesseract::kMaxDiacriticDistanceRatio
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:79
tesseract::LeftOrRight
LeftOrRight
Definition: strokewidth.h:39
ScrollView::PINK
Definition: scrollview.h:138
BLOBNBOX::area_stroke_width
float area_stroke_width() const
Definition: blobbox.h:348
ScrollView::AwaitEvent
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
tesseract::GridBase::gridsize
int gridsize() const
Definition: bbgrid.h:63
tesseract::StrokeWidth::HandleClick
void HandleClick(int x, int y) override
Definition: strokewidth.cpp:400
TBOX::pad
void pad(int xpad, int ypad)
Definition: rect.h:130
tesseract::ColPartition::AddBox
void AddBox(BLOBNBOX *box)
Definition: colpartition.cpp:169
tesseract::BBGrid< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT >::Init
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:445
BLOBNBOX::really_merge
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:102
tesseract::StrokeWidth::FindLeaderPartitions
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
Definition: strokewidth.cpp:259
BLOBNBOX::flow
BlobTextFlowType flow() const
Definition: blobbox.h:294
tesseract::PageSegMode
PageSegMode
Definition: publictypes.h:159
imagefind.h
BRT_VLINE
Definition: blobbox.h:74
BLOBNBOX::horz_possible
bool horz_possible() const
Definition: blobbox.h:306
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::ColPartitionGrid
Definition: colpartitiongrid.h:32
tesseract::StrokeWidth::SetNeighboursOnMediumBlobs
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
Definition: strokewidth.cpp:142
tesseract::kMaxDiacriticGapToBaseCharHeight
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:82
ScrollView::GREEN
Definition: scrollview.h:106
BND_COUNT
Definition: blobbox.h:91
tesseract::ColPartition::IsVerticalType
bool IsVerticalType() const
Definition: colpartition.h:441
BLOBNBOX::ConfirmNoTabViolation
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:290
BLOBNBOX::region_type
BlobRegionType region_type() const
Definition: blobbox.h:282
tesseract::PartitionFindResult
PartitionFindResult
Definition: strokewidth.h:46
tesseract::BBGrid::HandleClick
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:655
BLOBNBOX::set_horz_possible
void set_horz_possible(bool value)
Definition: blobbox.h:309
tesseract::kStrokeWidthFractionTolerance
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:44
TBOX::right
int16_t right() const
Definition: rect.h:78
DirOtherWay
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:105
ScrollView::Line
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:531
BLOBNBOX::MinMaxGapsClipped
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:198
tesseract::BlobGrid::InsertBlobList
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:36
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
tesseract::ColPartition::ClaimBoxes
void ClaimBoxes()
Definition: colpartition.cpp:247
tesseract::StrokeWidth::RemoveLineResidue
void RemoveLineResidue(ColPartition_LIST *big_part_list)
Definition: strokewidth.cpp:279
BlobNeighbourDir
BlobNeighbourDir
Definition: blobbox.h:86
tesseract::kBrokenCJKIterationFraction
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:67
tesseract::StrokeWidth::~StrokeWidth
~StrokeWidth() override
Definition: strokewidth.cpp:122
tesseract::kMinDiacriticSizeRatio
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:76
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
BLOBNBOX::cblob
C_BLOB * cblob() const
Definition: blobbox.h:267
BLOBNBOX::owner
tesseract::ColPartition * owner() const
Definition: blobbox.h:351
tesseract::StrokeWidth::StrokeWidth
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: strokewidth.cpp:109
tordmain.h
ScrollView::Rectangle
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:599
tesseract::BlobGridSearch
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:31
textord_debug_tabfind
int textord_debug_tabfind
Definition: alignedblob.cpp:27
tesseract::TextlineProjection
Definition: textlineprojection.h:33
tesseract::kMaxCJKSizeRatio
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:65
tesseract::kStrokeWidthTolerance
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:49
tesseract::GridBase::bleft
const ICOORD & bleft() const
Definition: bbgrid.h:72
colpartitiongrid.h
BLOBNBOX::vert_stroke_width
float vert_stroke_width() const
Definition: blobbox.h:342
tesseract::kCJKMaxComponents
const int kCJKMaxComponents
Definition: strokewidth.cpp:59
BTFT_NEIGHBOURS
Definition: blobbox.h:116
tesseract::GridBase::GridCoords
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:52
TBOX::set_left
void set_left(int x)
Definition: rect.h:74
TBOX::x_gap
int x_gap(const TBOX &box) const
Definition: rect.h:224
BLOBNBOX::UniquelyHorizontal
bool UniquelyHorizontal() const
Definition: blobbox.h:412
tesseract::kLineResiduePadRatio
const int kLineResiduePadRatio
Definition: strokewidth.cpp:96
BLOBNBOX::set_vert_possible
void set_vert_possible(bool value)
Definition: blobbox.h:303
tesseract::GridSearch::NextFullSearch
BBC * NextFullSearch()
Definition: bbgrid.h:675
TBOX
Definition: rect.h:33
DENORM
Definition: normalis.h:49