All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
strokewidth.cpp
Go to the documentation of this file.
1 // File: strokewidth.cpp
3 // Description: Subclass of BBGrid to find uniformity of strokewidth.
4 // Author: Ray Smith
5 // Created: Mon Mar 31 16:17:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 #include "strokewidth.h"
29 
30 #include <math.h>
31 
32 #include "blobbox.h"
33 #include "colpartition.h"
34 #include "colpartitiongrid.h"
35 #include "imagefind.h"
36 #include "linlsq.h"
37 #include "statistc.h"
38 #include "tabfind.h"
39 #include "textlineprojection.h"
40 #include "tordmain.h" // For SetBlobStrokeWidth.
41 
42 namespace tesseract {
43 
44 INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
45 BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
46 
// Stroke width tolerances for the general (non-CJK) case: a fractional term
// and a constant term.
// NOTE(review): the original comments for these two constants are not present
// in this listing. The meaning is inferred from the CJK counterparts below,
// which are passed as the fractional and constant tolerances to
// BLOBNBOX::MatchingStrokeWidth -- confirm against the callers.
48 const double kStrokeWidthFractionTolerance = 0.125;
53 const double kStrokeWidthTolerance = 1.5;
54 // Same but for CJK we are a bit more generous.
55 const double kStrokeWidthFractionCJK = 0.25;
56 const double kStrokeWidthCJK = 2.0;
57 // Radius in grid cells of search for broken CJK. Doesn't need to be very
58 // large as the grid size should be about the size of a character anyway.
59 const int kCJKRadius = 2;
60 // Max distance fraction of size to join close but broken CJK characters.
61 const double kCJKBrokenDistanceFraction = 0.25;
62 // Max number of components in a broken CJK character.
63 const int kCJKMaxComponents = 8;
64 // Max aspect ratio of CJK broken characters when put back together.
65 const double kCJKAspectRatio = 1.25;
66 // Max increase in aspect ratio of CJK broken characters when merged.
67 const double kCJKAspectRatioIncrease = 1.0625;
68 // Max multiple of the grid size that will be used in computing median CJKsize.
69 const int kMaxCJKSizeRatio = 5;
70 // Min fraction of blobs broken CJK to iterate and run it again.
71 const double kBrokenCJKIterationFraction = 0.125;
72 // Multiple of gridsize as x-padding for a search box for diacritic base
73 // characters.
74 const double kDiacriticXPadRatio = 7.0;
75 // Multiple of gridsize as y-padding for a search box for diacritic base
76 // characters.
77 const double kDiacriticYPadRatio = 1.75;
78 // Min multiple of diacritic height that a neighbour must be to be a
79 // convincing base character.
80 const double kMinDiacriticSizeRatio = 1.0625;
81 // Max multiple of a textline's median height as a threshold for the sum of
82 // a diacritic's farthest x and y distances (gap + size).
83 const double kMaxDiacriticDistanceRatio = 1.25;
84 // Max x-gap between a diacritic and its base char as a fraction of the height
85 // of the base char (allowing other blobs to fill the gap.)
87 // Radius of a search for diacritics in grid units.
88 const int kSearchRadius = 2;
89 // Ratio between longest side of a line and longest side of a character.
90 // (neighbor_min > blob_min * kLineTrapShortest &&
91 // neighbor_max < blob_max / kLineTrapLongest)
92 // => neighbor is a grapheme and blob is a line.
93 const int kLineTrapLongest = 4;
94 // Ratio between shortest side of a line and shortest side of a character.
95 const int kLineTrapShortest = 2;
96 // Max aspect ratio of the total box before CountNeighbourGaps
97 // decides immediately based on the aspect ratio.
98 const int kMostlyOneDirRatio = 3;
99 // Aspect ratio for a blob to be considered as line residue.
100 const double kLineResidueAspectRatio = 8.0;
101 // Padding ratio for line residue search box.
102 const int kLineResiduePadRatio = 3;
103 // Min multiple of neighbour size for a line residue to be genuine.
104 const double kLineResidueSizeRatio = 1.75;
105 // Aspect ratio filter for OSD.
106 const float kSizeRatioToReject = 2.0;
107 // Max number of normal blobs a large blob may overlap before it is rejected
108 // and determined to be image
109 const int kMaxLargeOverlaps = 3;
110 // Expansion factor for search box for good neighbours.
111 const double kNeighbourSearchFactor = 2.5;
112 // Factor of increase of overlap when adding diacritics to make an image noisy.
113 const double kNoiseOverlapGrowthFactor = 4.0;
114 // Fraction of the image size to add overlap when adding diacritics for an
115 // image to qualify as noisy.
116 const double kNoiseOverlapAreaFactor = 1.0 / 512;
117 // Ratio of perimeter^2/area for a blob to be considered noise vs i dot.
118 const double kShapePerimeterRatio = 3.0;
119 
121  const ICOORD& bleft, const ICOORD& tright)
122  : BlobGrid(gridsize, bleft, tright), nontext_map_(NULL), projection_(NULL),
123  denorm_(NULL), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
124  leaders_win_ = NULL;
125  widths_win_ = NULL;
126  initial_widths_win_ = NULL;
127  chains_win_ = NULL;
128  diacritics_win_ = NULL;
129  textlines_win_ = NULL;
130  smoothed_win_ = NULL;
131 }
132 
134  if (widths_win_ != NULL) {
135  #ifndef GRAPHICS_DISABLED
136  delete widths_win_->AwaitEvent(SVET_DESTROY);
137  #endif // GRAPHICS_DISABLED
139  exit(0);
140  delete widths_win_;
141  }
142  delete leaders_win_;
143  delete initial_widths_win_;
144  delete chains_win_;
145  delete textlines_win_;
146  delete smoothed_win_;
147  delete diacritics_win_;
148 }
149 
150 // Sets the neighbours member of the medium-sized blobs in the block.
151 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
152 // blobs and sets pointers to the good neighbours.
154  // Run a preliminary strokewidth neighbour detection on the medium blobs.
155  InsertBlobList(&block->blobs);
156  BLOBNBOX_IT blob_it(&block->blobs);
157  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
158  SetNeighbours(false, false, blob_it.data());
159  }
160  Clear();
161 }
162 
163 // Sets the neighbour/textline writing direction members of the medium
164 // and large blobs with optional repair of broken CJK characters first.
165 // Repair of broken CJK is needed here because broken CJK characters
166 // can fool the textline direction detection algorithm.
168  bool cjk_merge,
169  TO_BLOCK* input_block) {
170  // Setup the grid with the remaining (non-noise) blobs.
171  InsertBlobs(input_block);
172  // Repair broken CJK characters if needed.
173  while (cjk_merge && FixBrokenCJK(input_block));
174  // Grade blobs by inspection of neighbours.
175  FindTextlineFlowDirection(pageseg_mode, false);
176  // Clear the grid ready for rotation or leader finding.
177  Clear();
178 }
179 
180 // Helper to collect and count horizontal and vertical blobs from a list.
181 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
182  int* num_vertical_blobs,
183  int* num_horizontal_blobs,
184  BLOBNBOX_CLIST* vertical_blobs,
185  BLOBNBOX_CLIST* horizontal_blobs,
186  BLOBNBOX_CLIST* nondescript_blobs) {
187  BLOBNBOX_C_IT v_it(vertical_blobs);
188  BLOBNBOX_C_IT h_it(horizontal_blobs);
189  BLOBNBOX_C_IT n_it(nondescript_blobs);
190  BLOBNBOX_IT blob_it(input_blobs);
191  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
192  BLOBNBOX* blob = blob_it.data();
193  const TBOX& box = blob->bounding_box();
194  float y_x = static_cast<float>(box.height()) / box.width();
195  float x_y = 1.0f / y_x;
196  // Select a >= 1.0 ratio
197  float ratio = x_y > y_x ? x_y : y_x;
198  // If the aspect ratio is small and we want them for osd, save the blob.
199  bool ok_blob = ratio <= kSizeRatioToReject;
200  if (blob->UniquelyVertical()) {
201  ++*num_vertical_blobs;
202  if (ok_blob) v_it.add_after_then_move(blob);
203  } else if (blob->UniquelyHorizontal()) {
204  ++*num_horizontal_blobs;
205  if (ok_blob) h_it.add_after_then_move(blob);
206  } else if (ok_blob) {
207  n_it.add_after_then_move(blob);
208  }
209  }
210 }
211 
212 
213 // Types all the blobs as vertical or horizontal text or unknown and
214 // returns true if the majority are vertical.
215 // If the blobs are rotated, it is necessary to call CorrectForRotation
216 // after rotating everything, otherwise the work done here will be enough.
217 // If osd_blobs is not null, a list of blobs from the dominant textline
218 // direction are returned for use in orientation and script detection.
219 bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
220  TO_BLOCK* block,
221  BLOBNBOX_CLIST* osd_blobs) {
222  int vertical_boxes = 0;
223  int horizontal_boxes = 0;
224  // Count vertical normal and large blobs.
225  BLOBNBOX_CLIST vertical_blobs;
226  BLOBNBOX_CLIST horizontal_blobs;
227  BLOBNBOX_CLIST nondescript_blobs;
228  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
229  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
230  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
231  &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
233  tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
234  horizontal_boxes, vertical_boxes,
235  horizontal_blobs.length(), vertical_blobs.length(),
236  nondescript_blobs.length());
237  if (osd_blobs != NULL && vertical_boxes == 0 && horizontal_boxes == 0) {
238  // Only nondescript blobs available, so return those.
239  BLOBNBOX_C_IT osd_it(osd_blobs);
240  osd_it.add_list_after(&nondescript_blobs);
241  return false;
242  }
243  int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
244  find_vertical_text_ratio);
245  if (vertical_boxes >= min_vert_boxes) {
246  if (osd_blobs != NULL) {
247  BLOBNBOX_C_IT osd_it(osd_blobs);
248  osd_it.add_list_after(&vertical_blobs);
249  }
250  return true;
251  } else {
252  if (osd_blobs != NULL) {
253  BLOBNBOX_C_IT osd_it(osd_blobs);
254  osd_it.add_list_after(&horizontal_blobs);
255  }
256  return false;
257  }
258 }
259 
260 // Corrects the data structures for the given rotation.
262  ColPartitionGrid* part_grid) {
263  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
264  grid_box_ = TBOX(bleft(), tright());
265  rerotation_.set_x(rotation.x());
266  rerotation_.set_y(-rotation.y());
267 }
268 
269 // Finds leader partitions and inserts them into the given part_grid.
271  ColPartitionGrid* part_grid) {
272  Clear();
273  // Find and isolate leaders in the noise list.
274  ColPartition_LIST leader_parts;
275  FindLeadersAndMarkNoise(block, &leader_parts);
276  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
277  InsertBlobList(&block->blobs);
278  // Mark blobs that have leader neighbours.
279  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
280  ColPartition* part = it.extract();
281  part->ClaimBoxes();
282  MarkLeaderNeighbours(part, LR_LEFT);
283  MarkLeaderNeighbours(part, LR_RIGHT);
284  part_grid->InsertBBox(true, true, part);
285  }
286 }
287 
288 // Finds and marks noise those blobs that look like bits of vertical lines
289 // that would otherwise screw up layout analysis.
290 void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
291  BlobGridSearch gsearch(this);
292  BLOBNBOX* bbox;
293  // For every vertical line-like bbox in the grid, search its neighbours
294  // to find the tallest, and if the original box is taller by sufficient
295  // margin, then call it line residue and delete it.
296  gsearch.StartFullSearch();
297  while ((bbox = gsearch.NextFullSearch()) != NULL) {
298  TBOX box = bbox->bounding_box();
299  if (box.height() < box.width() * kLineResidueAspectRatio)
300  continue;
301  // Set up a rectangle search around the blob to find the size of its
302  // neighbours.
303  int padding = box.height() * kLineResiduePadRatio;
304  TBOX search_box = box;
305  search_box.pad(padding, padding);
306  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
307  box.bottom());
308  // Find the largest object in the search box not equal to bbox.
309  BlobGridSearch rsearch(this);
310  int max_size = 0;
311  BLOBNBOX* n;
312  rsearch.StartRectSearch(search_box);
313  while ((n = rsearch.NextRectSearch()) != NULL) {
314  if (n == bbox) continue;
315  TBOX nbox = n->bounding_box();
316  if (nbox.height() > max_size) {
317  max_size = nbox.height();
318  }
319  }
320  if (debug) {
321  tprintf("Max neighbour size=%d for candidate line box at:", max_size);
322  box.print();
323  }
324  if (max_size * kLineResidueSizeRatio < box.height()) {
325  #ifndef GRAPHICS_DISABLED
326  if (leaders_win_ != NULL) {
327  // We are debugging, so display deleted in pink blobs in the same
328  // window that we use to display leader detection.
329  leaders_win_->Pen(ScrollView::PINK);
330  leaders_win_->Rectangle(box.left(), box.bottom(),
331  box.right(), box.top());
332  }
333  #endif // GRAPHICS_DISABLED
334  ColPartition::MakeBigPartition(bbox, big_part_list);
335  }
336  }
337 }
338 
339 // Types all the blobs as vertical text or horizontal text or unknown and
340 // puts them into initial ColPartitions in the supplied part_grid.
341 // rerotation determines how to get back to the image coordinates from the
342 // blob coordinates (since they may have been rotated for vertical text).
343 // block is the single block for the whole page or rectangle to be OCRed.
344 // nontext_pix (full-size), is a binary mask used to prevent merges across
345 // photo/text boundaries. It is not kept beyond this function.
346 // denorm provides a mapping back to the image from the current blob
347 // coordinate space.
348 // projection provides a measure of textline density over the image and
349 // provides functions to assist with diacritic detection. It should be a
350 // pointer to a new TextlineProjection, and will be setup here.
351 // part_grid is the output grid of textline partitions.
352 // Large blobs that cause overlap are put in separate partitions and added
353 // to the big_parts list.
355  PageSegMode pageseg_mode, const FCOORD& rerotation, TO_BLOCK* block,
356  Pix* nontext_pix, const DENORM* denorm, bool cjk_script,
357  TextlineProjection* projection, BLOBNBOX_LIST* diacritic_blobs,
358  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) {
359  nontext_map_ = nontext_pix;
360  projection_ = projection;
361  denorm_ = denorm;
362  // Clear and re Insert to take advantage of the tab stops in the blobs.
363  Clear();
364  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
365  InsertBlobs(block);
366 
367  // Run FixBrokenCJK() again if the page is CJK.
368  if (cjk_script) {
369  FixBrokenCJK(block);
370  }
371  FindTextlineFlowDirection(pageseg_mode, false);
372  projection_->ConstructProjection(block, rerotation, nontext_map_);
374  ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
375  projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
376  projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
377  }
378  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
379  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
380  // Clear and re Insert to take advantage of the removed diacritics.
381  Clear();
382  InsertBlobs(block);
383  FCOORD skew;
384  FindTextlineFlowDirection(pageseg_mode, true);
386  FindInitialPartitions(pageseg_mode, rerotation, true, block,
387  diacritic_blobs, part_grid, big_parts, &skew);
388  if (r == PFR_NOISE) {
389  tprintf("Detected %d diacritics\n", diacritic_blobs->length());
390  // Noise was found, and removed.
391  Clear();
392  InsertBlobs(block);
393  FindTextlineFlowDirection(pageseg_mode, true);
394  r = FindInitialPartitions(pageseg_mode, rerotation, false, block,
395  diacritic_blobs, part_grid, big_parts, &skew);
396  }
397  nontext_map_ = NULL;
398  projection_ = NULL;
399  denorm_ = NULL;
400 }
401 
402 static void PrintBoxWidths(BLOBNBOX* neighbour) {
403  TBOX nbox = neighbour->bounding_box();
404  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
405  nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
406  neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
407  2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
408 }
409 
411 void StrokeWidth::HandleClick(int x, int y) {
413  // Run a radial search for blobs that overlap.
414  BlobGridSearch radsearch(this);
415  radsearch.StartRadSearch(x, y, 1);
416  BLOBNBOX* neighbour;
417  FCOORD click(static_cast<float>(x), static_cast<float>(y));
418  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
419  TBOX nbox = neighbour->bounding_box();
420  if (nbox.contains(click) && neighbour->cblob() != NULL) {
421  PrintBoxWidths(neighbour);
422  if (neighbour->neighbour(BND_LEFT) != NULL)
423  PrintBoxWidths(neighbour->neighbour(BND_LEFT));
424  if (neighbour->neighbour(BND_RIGHT) != NULL)
425  PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
426  if (neighbour->neighbour(BND_ABOVE) != NULL)
427  PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
428  if (neighbour->neighbour(BND_BELOW) != NULL)
429  PrintBoxWidths(neighbour->neighbour(BND_BELOW));
430  int gaps[BND_COUNT];
431  neighbour->NeighbourGaps(gaps);
432  tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
433  "Good= %d %d %d %d\n",
434  gaps[BND_LEFT], gaps[BND_RIGHT],
435  gaps[BND_ABOVE], gaps[BND_BELOW],
436  neighbour->horz_possible(),
437  neighbour->vert_possible(),
438  neighbour->good_stroke_neighbour(BND_LEFT),
439  neighbour->good_stroke_neighbour(BND_RIGHT),
440  neighbour->good_stroke_neighbour(BND_ABOVE),
441  neighbour->good_stroke_neighbour(BND_BELOW));
442  break;
443  }
444  }
445 }
446 
447 // Detects and marks leader dots/dashes.
448 // Leaders are horizontal chains of small or noise blobs that look
449 // monospace according to ColPartition::MarkAsLeaderIfMonospaced().
450 // Detected leaders become the only occupants of the block->small_blobs list.
451 // Non-leader small blobs get moved to the blobs list.
452 // Non-leader noise blobs remain singletons in the noise list.
453 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
454 // block is the single block for the whole page or rectangle to be OCRed.
455 // leader_parts is the output.
456 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
457  ColPartition_LIST* leader_parts) {
458  InsertBlobList(&block->small_blobs);
459  InsertBlobList(&block->noise_blobs);
460  BlobGridSearch gsearch(this);
461  BLOBNBOX* bbox;
462  // For every bbox in the grid, set its neighbours.
463  gsearch.StartFullSearch();
464  while ((bbox = gsearch.NextFullSearch()) != NULL) {
465  SetNeighbours(true, false, bbox);
466  }
467  ColPartition_IT part_it(leader_parts);
468  gsearch.StartFullSearch();
469  while ((bbox = gsearch.NextFullSearch()) != NULL) {
470  if (bbox->flow() == BTFT_NONE) {
471  if (bbox->neighbour(BND_RIGHT) == NULL &&
472  bbox->neighbour(BND_LEFT) == NULL)
473  continue;
474  // Put all the linked blobs into a ColPartition.
475  ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
476  BLOBNBOX* blob;
477  for (blob = bbox; blob != NULL && blob->flow() == BTFT_NONE;
478  blob = blob->neighbour(BND_RIGHT))
479  part->AddBox(blob);
480  for (blob = bbox->neighbour(BND_LEFT); blob != NULL &&
481  blob->flow() == BTFT_NONE;
482  blob = blob->neighbour(BND_LEFT))
483  part->AddBox(blob);
484  if (part->MarkAsLeaderIfMonospaced())
485  part_it.add_after_then_move(part);
486  else
487  delete part;
488  }
489  }
491  leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
492  }
493  // Move any non-leaders from the small to the blobs list, as they are
494  // most likely dashes or broken characters.
495  BLOBNBOX_IT blob_it(&block->blobs);
496  BLOBNBOX_IT small_it(&block->small_blobs);
497  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
498  BLOBNBOX* blob = small_it.data();
499  if (blob->flow() != BTFT_LEADER) {
500  if (blob->flow() == BTFT_NEIGHBOURS)
501  blob->set_flow(BTFT_NONE);
502  blob->ClearNeighbours();
503  blob_it.add_to_end(small_it.extract());
504  }
505  }
506  // Move leaders from the noise list to the small list, leaving the small
507  // list exclusively leaders, so they don't get processed further,
508  // and the remaining small blobs all in the noise list.
509  BLOBNBOX_IT noise_it(&block->noise_blobs);
510  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
511  BLOBNBOX* blob = noise_it.data();
512  if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
513  small_it.add_to_end(noise_it.extract());
514  } else if (blob->flow() == BTFT_NEIGHBOURS) {
515  blob->set_flow(BTFT_NONE);
516  blob->ClearNeighbours();
517  }
518  }
519  // Clear the grid as we don't want the small stuff hanging around in it.
520  Clear();
521 }
522 
525 void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
526  InsertBlobList(&block->blobs);
527  InsertBlobList(&block->large_blobs);
528 }
529 
530 // Checks the left or right side of the given leader partition and sets the
531 // (opposite) leader_on_right or leader_on_left flags for blobs
532 // that are next to the given side of the given leader partition.
533 void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
534  LeftOrRight side) {
535  const TBOX& part_box = part->bounding_box();
536  BlobGridSearch blobsearch(this);
537  // Search to the side of the leader for the nearest neighbour.
538  BLOBNBOX* best_blob = NULL;
539  int best_gap = 0;
540  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
541  : part_box.right(),
542  part_box.bottom(), part_box.top());
543  BLOBNBOX* blob;
544  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != NULL) {
545  const TBOX& blob_box = blob->bounding_box();
546  if (!blob_box.y_overlap(part_box))
547  continue;
548  int x_gap = blob_box.x_gap(part_box);
549  if (x_gap > 2 * gridsize()) {
550  break;
551  } else if (best_blob == NULL || x_gap < best_gap) {
552  best_blob = blob;
553  best_gap = x_gap;
554  }
555  }
556  if (best_blob != NULL) {
557  if (side == LR_LEFT)
558  best_blob->set_leader_on_right(true);
559  else
560  best_blob->set_leader_on_left(true);
561  #ifndef GRAPHICS_DISABLED
562  if (leaders_win_ != NULL) {
563  leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
564  const TBOX& blob_box = best_blob->bounding_box();
565  leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
566  blob_box.right(), blob_box.top());
567  }
568  #endif // GRAPHICS_DISABLED
569  }
570 }
571 
572 // Helper to compute the UQ of the square-ish CJK charcters.
573 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
574  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
575  BLOBNBOX_IT it(blobs);
576  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
577  BLOBNBOX* blob = it.data();
578  int width = blob->bounding_box().width();
579  int height = blob->bounding_box().height();
580  if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
581  sizes.add(height, 1);
582  }
583  return static_cast<int>(sizes.ile(0.75f) + 0.5);
584 }
585 
586 // Fix broken CJK characters, using the fake joined blobs mechanism.
587 // Blobs are really merged, ie the master takes all the outlines and the
588 // others are deleted.
589 // Returns true if sufficient blobs are merged that it may be worth running
590 // again, due to a better estimate of character size.
591 bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
592  BLOBNBOX_LIST* blobs = &block->blobs;
593  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
594  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
595  int max_size = static_cast<int>(median_height * kCJKAspectRatio);
596  int num_fixed = 0;
597  BLOBNBOX_IT blob_it(blobs);
598 
599  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
600  BLOBNBOX* blob = blob_it.data();
601  if (blob->cblob() == NULL || blob->cblob()->out_list()->empty())
602  continue;
603  TBOX bbox = blob->bounding_box();
604  bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
605  bbox.bottom());
606  if (debug) {
607  tprintf("Checking for Broken CJK (max size=%d):", max_size);
608  bbox.print();
609  }
610  // Generate a list of blobs that overlap or are near enough to merge.
611  BLOBNBOX_CLIST overlapped_blobs;
612  AccumulateOverlaps(blob, debug, max_size, max_dist,
613  &bbox, &overlapped_blobs);
614  if (!overlapped_blobs.empty()) {
615  // There are overlapping blobs, so qualify them as being satisfactory
616  // before removing them from the grid and replacing them with the union.
617  // The final box must be roughly square.
618  if (bbox.width() > bbox.height() * kCJKAspectRatio ||
619  bbox.height() > bbox.width() * kCJKAspectRatio) {
620  if (debug) {
621  tprintf("Bad final aspectratio:");
622  bbox.print();
623  }
624  continue;
625  }
626  // There can't be too many blobs to merge.
627  if (overlapped_blobs.length() >= kCJKMaxComponents) {
628  if (debug)
629  tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
630  continue;
631  }
632  // The strokewidths must match amongst the join candidates.
633  BLOBNBOX_C_IT n_it(&overlapped_blobs);
634  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
635  BLOBNBOX* neighbour = NULL;
636  neighbour = n_it.data();
637  if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
638  kStrokeWidthCJK))
639  break;
640  }
641  if (!n_it.cycled_list()) {
642  if (debug) {
643  tprintf("Bad stroke widths:");
644  PrintBoxWidths(blob);
645  }
646  continue; // Not good enough.
647  }
648 
649  // Merge all the candidates into blob.
650  // We must remove blob from the grid and reinsert it after merging
651  // to maintain the integrity of the grid.
652  RemoveBBox(blob);
653  // Everything else will be calculated later.
654  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
655  BLOBNBOX* neighbour = n_it.data();
656  RemoveBBox(neighbour);
657  // Mark empty blob for deletion.
658  neighbour->set_region_type(BRT_NOISE);
659  blob->really_merge(neighbour);
660  if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
661  blob->rotate_box(rerotation_);
662  }
663  }
664  InsertBBox(true, true, blob);
665  ++num_fixed;
666  if (debug) {
667  tprintf("Done! Final box:");
668  bbox.print();
669  }
670  }
671  }
672  // Count remaining blobs.
673  int num_remaining = 0;
674  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
675  BLOBNBOX* blob = blob_it.data();
676  if (blob->cblob() != NULL && !blob->cblob()->out_list()->empty()) {
677  ++num_remaining;
678  }
679  }
680  // Permanently delete all the marked blobs after first removing all
681  // references in the neighbour members.
682  block->DeleteUnownedNoise();
683  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
684 }
685 
686 // Helper function to determine whether it is reasonable to merge the
687 // bbox and the nbox for repairing broken CJK.
688 // The distance apart must not exceed max_dist, the combined size must
689 // not exceed max_size, and the aspect ratio must either improve or at
690 // least not get worse by much.
691 static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
692  bool debug, int max_size, int max_dist,
693  int* x_gap, int* y_gap) {
694  *x_gap = bbox.x_gap(nbox);
695  *y_gap = bbox.y_gap(nbox);
696  TBOX merged(nbox);
697  merged += bbox;
698  if (debug) {
699  tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
700  merged.print();
701  }
702  if (*x_gap <= max_dist && *y_gap <= max_dist &&
703  merged.width() <= max_size && merged.height() <= max_size) {
704  // Close enough to call overlapping. Check aspect ratios.
705  double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
706  if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
707  double new_ratio = static_cast<double>(merged.width()) / merged.height();
708  if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
709  if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
710  return true;
711  }
712  return false;
713 }
714 
715 // Collect blobs that overlap or are within max_dist of the input bbox.
716 // Return them in the list of blobs and expand the bbox to be the union
717 // of all the boxes. not_this is excluded from the search, as are blobs
718 // that cause the merged box to exceed max_size in either dimension.
719 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
720  int max_size, int max_dist,
721  TBOX* bbox, BLOBNBOX_CLIST* blobs) {
722  // While searching, nearests holds the nearest failed blob in each
723  // direction. When we have a nearest in each of the 4 directions, then
724  // the search is over, and at this point the final bbox must not overlap
725  // any of the nearests.
726  BLOBNBOX* nearests[BND_COUNT];
727  for (int i = 0; i < BND_COUNT; ++i) {
728  nearests[i] = NULL;
729  }
730  int x = (bbox->left() + bbox->right()) / 2;
731  int y = (bbox->bottom() + bbox->top()) / 2;
732  // Run a radial search for blobs that overlap or are sufficiently close.
733  BlobGridSearch radsearch(this);
734  radsearch.StartRadSearch(x, y, kCJKRadius);
735  BLOBNBOX* neighbour;
736  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
737  if (neighbour == not_this) continue;
738  TBOX nbox = neighbour->bounding_box();
739  int x_gap, y_gap;
740  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
741  &x_gap, &y_gap)) {
742  // Close enough to call overlapping. Merge boxes.
743  *bbox += nbox;
744  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
745  if (debug) {
746  tprintf("Added:");
747  nbox.print();
748  }
749  // Since we merged, search the nearests, as some might now me mergeable.
750  for (int dir = 0; dir < BND_COUNT; ++dir) {
751  if (nearests[dir] == NULL) continue;
752  nbox = nearests[dir]->bounding_box();
753  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
754  max_dist, &x_gap, &y_gap)) {
755  // Close enough to call overlapping. Merge boxes.
756  *bbox += nbox;
757  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
758  if (debug) {
759  tprintf("Added:");
760  nbox.print();
761  }
762  nearests[dir] = NULL;
763  dir = -1; // Restart the search.
764  }
765  }
766  } else if (x_gap < 0 && x_gap <= y_gap) {
767  // A vertical neighbour. Record the nearest.
768  BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
769  if (nearests[dir] == NULL ||
770  y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
771  nearests[dir] = neighbour;
772  }
773  } else if (y_gap < 0 && y_gap <= x_gap) {
774  // A horizontal neighbour. Record the nearest.
775  BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
776  if (nearests[dir] == NULL ||
777  x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
778  nearests[dir] = neighbour;
779  }
780  }
781  // If all nearests are non-null, then we have finished.
782  if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
783  nearests[BND_ABOVE] && nearests[BND_BELOW])
784  break;
785  }
786  // Final overlap with a nearest is not allowed.
787  for (int dir = 0; dir < BND_COUNT; ++dir) {
788  if (nearests[dir] == NULL) continue;
789  const TBOX& nbox = nearests[dir]->bounding_box();
790  if (debug) {
791  tprintf("Testing for overlap with:");
792  nbox.print();
793  }
794  if (bbox->overlap(nbox)) {
795  blobs->shallow_clear();
796  if (debug)
797  tprintf("Final box overlaps nearest\n");
798  return;
799  }
800  }
801 }
802 
803 // For each blob in this grid, Finds the textline direction to be horizontal
804 // or vertical according to distance to neighbours and 1st and 2nd order
805 // neighbours. Non-text tends to end up without a definite direction.
806 // Result is setting of the neighbours and vert_possible/horz_possible
807 // flags in the BLOBNBOXes currently in this grid.
808 // This function is called more than once if page orientation is uncertain,
809 // so display_if_debugging is true on the final call to display the results.
810 void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode,
811  bool display_if_debugging) {
812  BlobGridSearch gsearch(this);
813  BLOBNBOX* bbox;
814  // For every bbox in the grid, set its neighbours.
815  gsearch.StartFullSearch();
816  while ((bbox = gsearch.NextFullSearch()) != NULL) {
817  SetNeighbours(false, display_if_debugging, bbox);
818  }
819  // Where vertical or horizontal wins by a big margin, clarify it.
820  gsearch.StartFullSearch();
821  while ((bbox = gsearch.NextFullSearch()) != NULL) {
822  SimplifyObviousNeighbours(bbox);
823  }
824  // Now try to make the blobs only vertical or horizontal using neighbours.
825  gsearch.StartFullSearch();
826  while ((bbox = gsearch.NextFullSearch()) != NULL) {
827  if (FindingVerticalOnly(pageseg_mode)) {
828  bbox->set_vert_possible(true);
829  bbox->set_horz_possible(false);
830  } else if (FindingHorizontalOnly(pageseg_mode)) {
831  bbox->set_vert_possible(false);
832  bbox->set_horz_possible(true);
833  } else {
834  SetNeighbourFlows(bbox);
835  }
836  }
837  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
839  initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
840  }
841  // Improve flow direction with neighbours.
842  gsearch.StartFullSearch();
843  while ((bbox = gsearch.NextFullSearch()) != NULL) {
844  SmoothNeighbourTypes(pageseg_mode, false, bbox);
845  }
846  // Now allow reset of firm values to fix renegades.
847  gsearch.StartFullSearch();
848  while ((bbox = gsearch.NextFullSearch()) != NULL) {
849  SmoothNeighbourTypes(pageseg_mode, true, bbox);
850  }
851  // Repeat.
852  gsearch.StartFullSearch();
853  while ((bbox = gsearch.NextFullSearch()) != NULL) {
854  SmoothNeighbourTypes(pageseg_mode, true, bbox);
855  }
856  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
858  widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
859  }
860 }
861 
862 // Sets the neighbours and good_stroke_neighbours members of the blob by
863 // searching close on all 4 sides.
864 // When finding leader dots/dashes, there is a slightly different rule for
865 // what makes a good neighbour.
866 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
867  BLOBNBOX* blob) {
868  int line_trap_count = 0;
869  for (int dir = 0; dir < BND_COUNT; ++dir) {
870  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
871  line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
872  }
873  if (line_trap_count > 0 && activate_line_trap) {
874  // It looks like a line so isolate it by clearing its neighbours.
875  blob->ClearNeighbours();
876  const TBOX& box = blob->bounding_box();
877  blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
878  }
879 }
880 
881 
882 // Sets the good_stroke_neighbours member of the blob if it has a
883 // GoodNeighbour on the given side.
884 // Also sets the neighbour in the blob, whether or not a good one is found.
885 // Returns the number of blobs in the nearby search area that would lead us to
886 // believe that this blob is a line separator.
887 // Leaders get extra special lenient treatment.
888 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
889  BLOBNBOX* blob) {
890  // Search for neighbours that overlap vertically.
891  TBOX blob_box = blob->bounding_box();
892  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
893  blob_box.bottom());
894  if (debug) {
895  tprintf("FGN in dir %d for blob:", dir);
896  blob_box.print();
897  }
898  int top = blob_box.top();
899  int bottom = blob_box.bottom();
900  int left = blob_box.left();
901  int right = blob_box.right();
902  int width = right - left;
903  int height = top - bottom;
904 
905  // A trap to detect lines tests for the min dimension of neighbours
906  // being larger than a multiple of the min dimension of the line
907  // and the larger dimension being smaller than a fraction of the max
908  // dimension of the line.
909  int line_trap_max = MAX(width, height) / kLineTrapLongest;
910  int line_trap_min = MIN(width, height) * kLineTrapShortest;
911  int line_trap_count = 0;
912 
913  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
914  ? height / 2 : width / 2;
915  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
916  ? height / 3 : width / 3;
917  if (leaders)
918  min_good_overlap = min_decent_overlap = 1;
919 
920  int search_pad = static_cast<int>(
921  sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
922  if (gridsize() > search_pad)
923  search_pad = gridsize();
924  TBOX search_box = blob_box;
925  // Pad the search in the appropriate direction.
926  switch (dir) {
927  case BND_LEFT:
928  search_box.set_left(search_box.left() - search_pad);
929  break;
930  case BND_RIGHT:
931  search_box.set_right(search_box.right() + search_pad);
932  break;
933  case BND_BELOW:
934  search_box.set_bottom(search_box.bottom() - search_pad);
935  break;
936  case BND_ABOVE:
937  search_box.set_top(search_box.top() + search_pad);
938  break;
939  case BND_COUNT:
940  return 0;
941  }
942 
943  BlobGridSearch rectsearch(this);
944  rectsearch.StartRectSearch(search_box);
945  BLOBNBOX* best_neighbour = NULL;
946  double best_goodness = 0.0;
947  bool best_is_good = false;
948  BLOBNBOX* neighbour;
949  while ((neighbour = rectsearch.NextRectSearch()) != NULL) {
950  TBOX nbox = neighbour->bounding_box();
951  if (neighbour == blob)
952  continue;
953  int mid_x = (nbox.left() + nbox.right()) / 2;
954  if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
955  continue; // In a different column.
956  if (debug) {
957  tprintf("Neighbour at:");
958  nbox.print();
959  }
960 
961  // Last-minute line detector. There is a small upper limit to the line
962  // width accepted by the morphological line detector.
963  int n_width = nbox.width();
964  int n_height = nbox.height();
965  if (MIN(n_width, n_height) > line_trap_min &&
966  MAX(n_width, n_height) < line_trap_max)
967  ++line_trap_count;
968  // Heavily joined text, such as Arabic may have very different sizes when
969  // looking at the maxes, but the heights may be almost identical, so check
970  // for a difference in height if looking sideways or width vertically.
971  if (TabFind::VeryDifferentSizes(MAX(n_width, n_height),
972  MAX(width, height)) &&
973  (((dir == BND_LEFT || dir ==BND_RIGHT) &&
974  TabFind::DifferentSizes(n_height, height)) ||
975  ((dir == BND_BELOW || dir ==BND_ABOVE) &&
976  TabFind::DifferentSizes(n_width, width)))) {
977  if (debug) tprintf("Bad size\n");
978  continue; // Could be a different font size or non-text.
979  }
980  // Amount of vertical overlap between the blobs.
981  int overlap;
982  // If the overlap is along the short side of the neighbour, and it
983  // is fully overlapped, then perp_overlap holds the length of the long
984  // side of the neighbour. A measure to include hyphens and dashes as
985  // legitimate neighbours.
986  int perp_overlap;
987  int gap;
988  if (dir == BND_LEFT || dir == BND_RIGHT) {
989  overlap = MIN(nbox.top(), top) - MAX(nbox.bottom(), bottom);
990  if (overlap == nbox.height() && nbox.width() > nbox.height())
991  perp_overlap = nbox.width();
992  else
993  perp_overlap = overlap;
994  gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
995  if (gap <= 0) {
996  if (debug) tprintf("On wrong side\n");
997  continue; // On the wrong side.
998  }
999  gap -= n_width;
1000  } else {
1001  overlap = MIN(nbox.right(), right) - MAX(nbox.left(), left);
1002  if (overlap == nbox.width() && nbox.height() > nbox.width())
1003  perp_overlap = nbox.height();
1004  else
1005  perp_overlap = overlap;
1006  gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
1007  if (gap <= 0) {
1008  if (debug) tprintf("On wrong side\n");
1009  continue; // On the wrong side.
1010  }
1011  gap -= n_height;
1012  }
1013  if (-gap > overlap) {
1014  if (debug) tprintf("Overlaps wrong way\n");
1015  continue; // Overlaps the wrong way.
1016  }
1017  if (perp_overlap < min_decent_overlap) {
1018  if (debug) tprintf("Doesn't overlap enough\n");
1019  continue; // Doesn't overlap enough.
1020  }
1021  bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1022  TabFind::DifferentSizes(width, n_width);
1023  bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1024  blob->MatchingStrokeWidth(*neighbour,
1025  kStrokeWidthFractionTolerance,
1026  kStrokeWidthTolerance);
1027  // Best is a fuzzy combination of gap, overlap and is good.
1028  // Basically if you make one thing twice as good without making
1029  // anything else twice as bad, then it is better.
1030  if (gap < 1) gap = 1;
1031  double goodness = (1.0 + is_good) * overlap / gap;
1032  if (debug) {
1033  tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1034  goodness, best_goodness, is_good, overlap, gap);
1035  }
1036  if (goodness > best_goodness) {
1037  best_neighbour = neighbour;
1038  best_goodness = goodness;
1039  best_is_good = is_good;
1040  }
1041  }
1042  blob->set_neighbour(dir, best_neighbour, best_is_good);
1043  return line_trap_count;
1044 }
1045 
1046 // Helper to get a list of 1st-order neighbours.
1047 static void ListNeighbours(const BLOBNBOX* blob,
1048  BLOBNBOX_CLIST* neighbours) {
1049  for (int dir = 0; dir < BND_COUNT; ++dir) {
1050  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1051  BLOBNBOX* neighbour = blob->neighbour(bnd);
1052  if (neighbour != NULL) {
1053  neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1054  }
1055  }
1056 }
1057 
1058 // Helper to get a list of 1st and 2nd order neighbours.
1059 static void List2ndNeighbours(const BLOBNBOX* blob,
1060  BLOBNBOX_CLIST* neighbours) {
1061  ListNeighbours(blob, neighbours);
1062  for (int dir = 0; dir < BND_COUNT; ++dir) {
1063  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1064  BLOBNBOX* neighbour = blob->neighbour(bnd);
1065  if (neighbour != NULL) {
1066  ListNeighbours(neighbour, neighbours);
1067  }
1068  }
1069 }
1070 
1071 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
1072 static void List3rdNeighbours(const BLOBNBOX* blob,
1073  BLOBNBOX_CLIST* neighbours) {
1074  List2ndNeighbours(blob, neighbours);
1075  for (int dir = 0; dir < BND_COUNT; ++dir) {
1076  BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1077  BLOBNBOX* neighbour = blob->neighbour(bnd);
1078  if (neighbour != NULL) {
1079  List2ndNeighbours(neighbour, neighbours);
1080  }
1081  }
1082 }
1083 
1084 // Helper to count the evidence for verticalness or horizontalness
1085 // in a list of neighbours.
1086 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1087  int* pure_h_count, int* pure_v_count) {
1088  if (neighbours->length() <= kMostlyOneDirRatio)
1089  return;
1090  BLOBNBOX_C_IT it(neighbours);
1091  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1092  BLOBNBOX* blob = it.data();
1093  int h_min, h_max, v_min, v_max;
1094  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1095  if (debug)
1096  tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1097  if (h_max < v_min ||
1098  blob->leader_on_left() || blob->leader_on_right()) {
1099  // Horizontal gaps are clear winners. Count a pure horizontal.
1100  ++*pure_h_count;
1101  if (debug) tprintf("Horz at:");
1102  } else if (v_max < h_min) {
1103  // Vertical gaps are clear winners. Clear a pure vertical.
1104  ++*pure_v_count;
1105  if (debug) tprintf("Vert at:");
1106  } else {
1107  if (debug) tprintf("Neither at:");
1108  }
1109  if (debug)
1110  blob->bounding_box().print();
1111  }
1112 }
1113 
1114 // Makes the blob to be only horizontal or vertical where evidence
1115 // is clear based on gaps of 2nd order neighbours, or definite individual
1116 // blobs.
1117 void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1118  if (blob->DefiniteIndividualFlow())
1119  return;
1120  bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1121  blob->bounding_box().bottom());
1122  if (debug) {
1123  tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1124  blob->flow(), blob->region_type());
1125  blob->bounding_box().print();
1126  }
1127  BLOBNBOX_CLIST neighbours;
1128  List3rdNeighbours(blob, &neighbours);
1129  // The number of pure horizontal and vertical neighbours.
1130  int pure_h_count = 0;
1131  int pure_v_count = 0;
1132  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1133  if (debug) {
1134  HandleClick(blob->bounding_box().left() + 1,
1135  blob->bounding_box().bottom() + 1);
1136  tprintf("SetFlows: h_count=%d, v_count=%d\n",
1137  pure_h_count, pure_v_count);
1138  }
1139  if (!neighbours.empty()) {
1140  blob->set_vert_possible(true);
1141  blob->set_horz_possible(true);
1142  if (pure_h_count > 2 * pure_v_count) {
1143  // Horizontal gaps are clear winners. Clear vertical neighbours.
1144  blob->set_vert_possible(false);
1145  } else if (pure_v_count > 2 * pure_h_count) {
1146  // Vertical gaps are clear winners. Clear horizontal neighbours.
1147  blob->set_horz_possible(false);
1148  }
1149  } else {
1150  // Lonely blob. Can't tell its flow direction.
1151  blob->set_vert_possible(false);
1152  blob->set_horz_possible(false);
1153  }
1154 }
1155 
1156 
1157 // Helper to count the number of horizontal and vertical blobs in a list.
1158 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1159  int* pure_h_count, int* pure_v_count) {
1160  BLOBNBOX_C_IT it(neighbours);
1161  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1162  BLOBNBOX* blob = it.data();
1163  if (blob->UniquelyHorizontal())
1164  ++*pure_h_count;
1165  if (blob->UniquelyVertical())
1166  ++*pure_v_count;
1167  }
1168 }
1169 
1170 // Nullify the neighbours in the wrong directions where the direction
1171 // is clear-cut based on a distance margin. Good for isolating vertical
1172 // text from neighbouring horizontal text.
1173 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
1174  // Case 1: We have text that is likely several characters, blurry and joined
1175  // together.
1176  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1177  blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1178  // The blob is complex (not stick-like).
1179  if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1180  // Horizontal conjoined text.
1181  blob->set_neighbour(BND_ABOVE, NULL, false);
1182  blob->set_neighbour(BND_BELOW, NULL, false);
1183  return;
1184  }
1185  if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1186  // Vertical conjoined text.
1187  blob->set_neighbour(BND_LEFT, NULL, false);
1188  blob->set_neighbour(BND_RIGHT, NULL, false);
1189  return;
1190  }
1191  }
1192 
1193  // Case 2: This blob is likely a single character.
1194  int margin = gridsize() / 2;
1195  int h_min, h_max, v_min, v_max;
1196  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1197  if ((h_max + margin < v_min && h_max < margin / 2) ||
1198  blob->leader_on_left() || blob->leader_on_right()) {
1199  // Horizontal gaps are clear winners. Clear vertical neighbours.
1200  blob->set_neighbour(BND_ABOVE, NULL, false);
1201  blob->set_neighbour(BND_BELOW, NULL, false);
1202  } else if (v_max + margin < h_min && v_max < margin / 2) {
1203  // Vertical gaps are clear winners. Clear horizontal neighbours.
1204  blob->set_neighbour(BND_LEFT, NULL, false);
1205  blob->set_neighbour(BND_RIGHT, NULL, false);
1206  }
1207 }
1208 
1209 // Smoothes the vertical/horizontal type of the blob based on the
1210 // 2nd-order neighbours. If reset_all is true, then all blobs are
1211 // changed. Otherwise, only ambiguous blobs are processed.
1212 void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all,
1213  BLOBNBOX* blob) {
1214  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1215  // There are both horizontal and vertical so try to fix it.
1216  BLOBNBOX_CLIST neighbours;
1217  List2ndNeighbours(blob, &neighbours);
1218  // The number of pure horizontal and vertical neighbours.
1219  int pure_h_count = 0;
1220  int pure_v_count = 0;
1221  CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1223  blob->bounding_box().bottom())) {
1224  HandleClick(blob->bounding_box().left() + 1,
1225  blob->bounding_box().bottom() + 1);
1226  tprintf("pure_h=%d, pure_v=%d\n",
1227  pure_h_count, pure_v_count);
1228  }
1229  if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1230  // Horizontal gaps are clear winners. Clear vertical neighbours.
1231  blob->set_vert_possible(false);
1232  blob->set_horz_possible(true);
1233  } else if (pure_v_count > pure_h_count &&
1234  !FindingHorizontalOnly(pageseg_mode)) {
1235  // Vertical gaps are clear winners. Clear horizontal neighbours.
1236  blob->set_horz_possible(false);
1237  blob->set_vert_possible(true);
1238  }
1239  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1240  blob->bounding_box().bottom())) {
1241  HandleClick(blob->bounding_box().left() + 1,
1242  blob->bounding_box().bottom() + 1);
1243  tprintf("Clean on pass 3!\n");
1244  }
1245 }
1246 
1247 // Partition creation. Accumulates vertical and horizontal text chains,
1248 // puts the remaining blobs in as unknowns, and then merges/splits to
1249 // minimize overlap and smoothes the types with neighbours and the color
1250 // image if provided. rerotation is used to rotate the coordinate space
1251 // back to the nontext_map_ image.
1252 // If find_problems is true, detects possible noise pollution by the amount
1253 // of partition overlap that is created by the diacritics. If excessive, the
1254 // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1255 // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1256 // the components, saves the skew_angle and returns PFR_SKEW.] If the return
1257 // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1258 // called again after cleaning up the partly done work.
1259 PartitionFindResult StrokeWidth::FindInitialPartitions(
1260  PageSegMode pageseg_mode, const FCOORD& rerotation, bool find_problems,
1261  TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1262  ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1263  FCOORD* skew_angle) {
1264  if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1265  if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1267  chains_win_ = MakeWindow(0, 400, "Initial text chains");
1268  part_grid->DisplayBoxes(chains_win_);
1269  projection_->DisplayProjection();
1270  }
1271  if (find_problems) {
1272  // TODO(rays) Do something to find skew, set skew_angle and return if there
1273  // is some.
1274  }
1275  part_grid->SplitOverlappingPartitions(big_parts);
1276  EasyMerges(part_grid);
1277  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1278  TBOX grid_box(bleft(), tright());
1279  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1280  rerotation));
1281  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1282  grid_box, rerotation));
1283  int pre_overlap = part_grid->ComputeTotalOverlap(NULL);
1284  TestDiacritics(part_grid, block);
1285  MergeDiacritics(block, part_grid);
1286  if (find_problems && diacritic_blobs != NULL &&
1287  DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1288  diacritic_blobs)) {
1289  return PFR_NOISE;
1290  }
1292  textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1293  part_grid->DisplayBoxes(textlines_win_);
1294  diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1295  }
1296  PartitionRemainingBlobs(pageseg_mode, part_grid);
1297  part_grid->SplitOverlappingPartitions(big_parts);
1298  EasyMerges(part_grid);
1299  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1300  rerotation));
1301  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1302  grid_box, rerotation));
1303  // Now eliminate strong stuff in a sea of the opposite.
1304  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1305  grid_box, rerotation));
1307  smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1308  part_grid->DisplayBoxes(smoothed_win_);
1309  }
1310  return PFR_OK;
1311 }
1312 
1313 // Detects noise by a significant increase in partition overlap from
1314 // pre_overlap to now, and removes noise from the union of all the overlapping
1315 // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1316 // was found and removed.
1317 bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
1318  TO_BLOCK* block,
1319  ColPartitionGrid* part_grid,
1320  BLOBNBOX_LIST* diacritic_blobs) {
1321  ColPartitionGrid* noise_grid = NULL;
1322  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1323  if (pre_overlap == 0) pre_overlap = 1;
1324  BLOBNBOX_IT diacritic_it(diacritic_blobs);
1325  if (noise_grid != NULL) {
1326  if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1327  post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1328  // This is noisy enough to fix.
1330  ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
1331  noise_grid->DisplayBoxes(noise_win);
1332  }
1333  part_grid->DeleteNonLeaderParts();
1334  BLOBNBOX_IT blob_it(&block->noise_blobs);
1335  ColPartitionGridSearch rsearch(noise_grid);
1336  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1337  BLOBNBOX* blob = blob_it.data();
1338  blob->ClearNeighbours();
1339  if (!blob->IsDiacritic() || blob->owner() != NULL)
1340  continue; // Not a noise candidate.
1341  TBOX blob_box(blob->bounding_box());
1342  TBOX search_box(blob->bounding_box());
1343  search_box.pad(gridsize(), gridsize());
1344  rsearch.StartRectSearch(search_box);
1345  ColPartition* part = rsearch.NextRectSearch();
1346  if (part != NULL) {
1347  // Consider blob as possible noise.
1348  blob->set_owns_cblob(true);
1349  blob->compute_bounding_box();
1350  diacritic_it.add_after_then_move(blob_it.extract());
1351  }
1352  }
1353  noise_grid->DeleteParts();
1354  delete noise_grid;
1355  return true;
1356  }
1357  noise_grid->DeleteParts();
1358  delete noise_grid;
1359  }
1360  return false;
1361 }
1362 
1363 // Helper verifies that blob's neighbour in direction dir is good to add to a
1364 // vertical text chain by returning the neighbour if it is not null, not owned,
1365 // and not uniquely horizontal, as well as its neighbour in the opposite
1366 // direction is blob.
1367 static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1368  BlobNeighbourDir dir) {
1369  BLOBNBOX* next_blob = blob->neighbour(dir);
1370  if (next_blob == NULL || next_blob->owner() != NULL ||
1371  next_blob->UniquelyHorizontal())
1372  return NULL;
1373  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1374  return next_blob;
1375  return NULL;
1376 }
1377 
1378 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
1379 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1380  // A PageSegMode that forces vertical textlines with the current rotation.
1381  PageSegMode pageseg_mode =
1382  rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1383  BlobGridSearch gsearch(this);
1384  BLOBNBOX* bbox;
1385  gsearch.StartFullSearch();
1386  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1387  // Only process boxes that have no horizontal hope and have not yet
1388  // been included in a chain.
1389  BLOBNBOX* blob;
1390  if (bbox->owner() == NULL && bbox->UniquelyVertical() &&
1391  (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != NULL) {
1392  // Put all the linked blobs into a ColPartition.
1393  ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1394  part->AddBox(bbox);
1395  while (blob != NULL) {
1396  part->AddBox(blob);
1397  blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1398  }
1399  blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1400  while (blob != NULL) {
1401  part->AddBox(blob);
1402  blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1403  }
1404  CompletePartition(pageseg_mode, part, part_grid);
1405  }
1406  }
1407 }
1408 
1409 // Helper verifies that blob's neighbour in direction dir is good to add to a
1410 // horizontal text chain by returning the neighbour if it is not null, not
1411 // owned, and not uniquely vertical, as well as its neighbour in the opposite
1412 // direction is blob.
1413 static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1414  BlobNeighbourDir dir) {
1415  BLOBNBOX* next_blob = blob->neighbour(dir);
1416  if (next_blob == NULL || next_blob->owner() != NULL ||
1417  next_blob->UniquelyVertical())
1418  return NULL;
1419  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1420  return next_blob;
1421  return NULL;
1422 }
1423 
1424 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1425 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1426  // A PageSegMode that forces horizontal textlines with the current rotation.
1427  PageSegMode pageseg_mode =
1428  rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1429  BlobGridSearch gsearch(this);
1430  BLOBNBOX* bbox;
1431  gsearch.StartFullSearch();
1432  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1433  BLOBNBOX* blob;
1434  if (bbox->owner() == NULL && bbox->UniquelyHorizontal() &&
1435  (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != NULL) {
1436  // Put all the linked blobs into a ColPartition.
1437  ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1438  part->AddBox(bbox);
1439  while (blob != NULL) {
1440  part->AddBox(blob);
1441  blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1442  }
1443  blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1444  while (blob != NULL) {
1445  part->AddBox(blob);
1446  blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1447  }
1448  CompletePartition(pageseg_mode, part, part_grid);
1449  }
1450  }
1451 }
1452 
1453 // Finds diacritics and saves their base character in the blob.
1454 // The objective is to move all diacritics to the noise_blobs list, so
1455 // they don't mess up early textline finding/merging, or force splits
1456 // on textlines that overlap a bit. Blobs that become diacritics must be
1457 // either part of no ColPartition (NULL owner) or in a small partition in
1458 // which ALL the blobs are diacritics, in which case the partition is
1459 // exploded (deleted) back to its blobs.
1460 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1461  BlobGrid small_grid(gridsize(), bleft(), tright());
1462  small_grid.InsertBlobList(&block->noise_blobs);
1463  small_grid.InsertBlobList(&block->blobs);
1464  int medium_diacritics = 0;
1465  int small_diacritics = 0;
1466  BLOBNBOX_IT small_it(&block->noise_blobs);
1467  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1468  BLOBNBOX* blob = small_it.data();
1469  if (blob->owner() == NULL && !blob->IsDiacritic() &&
1470  DiacriticBlob(&small_grid, blob)) {
1471  ++small_diacritics;
1472  }
1473  }
1474  BLOBNBOX_IT blob_it(&block->blobs);
1475  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1476  BLOBNBOX* blob = blob_it.data();
1477  if (blob->IsDiacritic()) {
1478  small_it.add_to_end(blob_it.extract());
1479  continue; // Already a diacritic.
1480  }
1481  ColPartition* part = blob->owner();
1482  if (part == NULL && DiacriticBlob(&small_grid, blob)) {
1483  ++medium_diacritics;
1484  RemoveBBox(blob);
1485  small_it.add_to_end(blob_it.extract());
1486  } else if (part != NULL && !part->block_owned() &&
1487  part->boxes_count() < 3) {
1488  // We allow blobs in small partitions to become diacritics if ALL the
1489  // blobs in the partition qualify as we can then cleanly delete the
1490  // partition, turn all the blobs in it to diacritics and they can be
1491  // merged into the base character partition more easily than merging
1492  // the partitions.
1493  BLOBNBOX_C_IT box_it(part->boxes());
1494  for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1495  DiacriticBlob(&small_grid, box_it.data());
1496  box_it.forward());
1497  if (box_it.cycled_list()) {
1498  // They are all good.
1499  while (!box_it.empty()) {
1500  // Liberate the blob from its partition so it can be treated
1501  // as a diacritic and merged explicitly with the base part.
1502  // The blob is really owned by the block. The partition "owner"
1503  // is NULLed to allow the blob to get merged with its base character
1504  // partition.
1505  BLOBNBOX* box = box_it.extract();
1506  box->set_owner(NULL);
1507  box_it.forward();
1508  ++medium_diacritics;
1509  // We remove the blob from the grid so it isn't found by subsequent
1510  // searches where we might not want to include diacritics.
1511  RemoveBBox(box);
1512  }
1513  // We only move the one blob to the small list here, but the others
1514  // all get moved by the test at the top of the loop.
1515  small_it.add_to_end(blob_it.extract());
1516  part_grid->RemoveBBox(part);
1517  delete part;
1518  }
1519  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1520  blob->bounding_box().bottom())) {
1521  tprintf("Blob not available to be a diacritic at:");
1522  blob->bounding_box().print();
1523  }
1524  }
1526  tprintf("Found %d small diacritics, %d medium\n",
1527  small_diacritics, medium_diacritics);
1528  }
1529 }
1530 
1531 // Searches this grid for an appropriately close and sized neighbour of the
1532 // given [small] blob. If such a blob is found, the diacritic base is saved
1533 // in the blob and true is returned.
1534 // The small_grid is a secondary grid that contains the small/noise objects
1535 // that are not in this grid, but may be useful for determining a connection
1536 // between blob and its potential base character. (See DiacriticXGapFilled.)
1537 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1538  if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
1539  blob->region_type() == BRT_VERT_TEXT)
1540  return false;
1541  TBOX small_box(blob->bounding_box());
1542  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1543  small_box.bottom());
1544  if (debug) {
1545  tprintf("Testing blob for diacriticness at:");
1546  small_box.print();
1547  }
1548  int x = (small_box.left() + small_box.right()) / 2;
1549  int y = (small_box.bottom() + small_box.top()) / 2;
1550  int grid_x, grid_y;
1551  GridCoords(x, y, &grid_x, &grid_y);
1552  int height = small_box.height();
1553  // Setup a rectangle search to find its nearest base-character neighbour.
1554  // We keep 2 different best candidates:
1555  // best_x_overlap is a category of base characters that have an overlap in x
1556  // (like a acute) in which we look for the least y-gap, computed using the
1557  // projection to favor base characters in the same textline.
1558  // best_y_overlap is a category of base characters that have no x overlap,
1559  // (nominally a y-overlap is preferrecd but not essential) in which we
1560  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1561  // a lower weight to catch quotes at the end of a textline.
1562  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1563  // character to the FARTHEST side of the diacritic to allow small diacritics
1564  // to be a reasonable distance away, but not big diacritics.
1565  BLOBNBOX* best_x_overlap = NULL;
1566  BLOBNBOX* best_y_overlap = NULL;
1567  int best_total_dist = 0;
1568  int best_y_gap = 0;
1569  TBOX best_xbox;
1570  // TODO(rays) the search box could be setup using the projection as a guide.
1571  TBOX search_box(small_box);
1572  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1573  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1574  search_box.pad(x_pad, y_pad);
1575  BlobGridSearch rsearch(this);
1576  rsearch.SetUniqueMode(true);
1577  int min_height = height * kMinDiacriticSizeRatio;
1578  rsearch.StartRectSearch(search_box);
1579  BLOBNBOX* neighbour;
1580  while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1581  if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1582  neighbour == blob || neighbour->owner() == blob->owner())
1583  continue;
1584  TBOX nbox = neighbour->bounding_box();
1585  if (neighbour->owner() == NULL || neighbour->owner()->IsVerticalType() ||
1586  (neighbour->flow() != BTFT_CHAIN &&
1587  neighbour->flow() != BTFT_STRONG_CHAIN)) {
1588  if (debug) {
1589  tprintf("Neighbour not strong enough:");
1590  nbox.print();
1591  }
1592  continue; // Diacritics must be attached to strong text.
1593  }
1594  if (nbox.height() < min_height) {
1595  if (debug) {
1596  tprintf("Neighbour not big enough:");
1597  nbox.print();
1598  }
1599  continue; // Too small to be the base character.
1600  }
1601  int x_gap = small_box.x_gap(nbox);
1602  int y_gap = small_box.y_gap(nbox);
1603  int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1604  true, denorm_,
1605  debug);
1606  if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1607  x_gap, y_gap, total_distance);
1608  if (total_distance >
1609  neighbour->owner()->median_size() * kMaxDiacriticDistanceRatio) {
1610  if (debug) {
1611  tprintf("Neighbour with median size %d too far away:",
1612  neighbour->owner()->median_size());
1613  neighbour->bounding_box().print();
1614  }
1615  continue; // Diacritics must not be too distant.
1616  }
1617  if (x_gap <= 0) {
1618  if (debug) {
1619  tprintf("Computing reduced box for :");
1620  nbox.print();
1621  }
1622  int left = small_box.left() - small_box.width();
1623  int right = small_box.right() + small_box.width();
1624  nbox = neighbour->BoundsWithinLimits(left, right);
1625  y_gap = small_box.y_gap(nbox);
1626  if (best_x_overlap == NULL || y_gap < best_y_gap) {
1627  best_x_overlap = neighbour;
1628  best_xbox = nbox;
1629  best_y_gap = y_gap;
1630  if (debug) {
1631  tprintf("New best:");
1632  nbox.print();
1633  }
1634  } else if (debug) {
1635  tprintf("Shrunken box doesn't win:");
1636  nbox.print();
1637  }
1638  } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1639  if (best_y_overlap == NULL || total_distance < best_total_dist) {
1640  if (debug) {
1641  tprintf("New best y overlap:");
1642  nbox.print();
1643  }
1644  best_y_overlap = neighbour;
1645  best_total_dist = total_distance;
1646  } else if (debug) {
1647  tprintf("New y overlap box doesn't win:");
1648  nbox.print();
1649  }
1650  } else if (debug) {
1651  tprintf("Neighbour wrong side of a tab:");
1652  nbox.print();
1653  }
1654  }
1655  if (best_x_overlap != NULL &&
1656  (best_y_overlap == NULL ||
1657  best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1658  blob->set_diacritic_box(best_xbox);
1659  blob->set_base_char_blob(best_x_overlap);
1660  if (debug) {
1661  tprintf("DiacriticBlob OK! (x-overlap:");
1662  small_box.print();
1663  best_xbox.print();
1664  }
1665  return true;
1666  }
1667  if (best_y_overlap != NULL &&
1668  DiacriticXGapFilled(small_grid, small_box,
1669  best_y_overlap->bounding_box()) &&
1670  NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1671  blob->set_diacritic_box(best_y_overlap->bounding_box());
1672  blob->set_base_char_blob(best_y_overlap);
1673  if (debug) {
1674  tprintf("DiacriticBlob OK! (y-overlap:");
1675  small_box.print();
1676  best_y_overlap->bounding_box().print();
1677  }
1678  return true;
1679  }
1680  if (debug) {
1681  tprintf("DiacriticBlob fails:");
1682  small_box.print();
1683  tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1684  if (best_y_overlap != NULL) {
1685  tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1686  DiacriticXGapFilled(small_grid, small_box,
1687  best_y_overlap->bounding_box()),
1688  NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1689  }
1690  }
1691  return false;
1692 }
1693 
1694 // Returns true if there is no gap between the base char and the diacritic
1695 // bigger than a fraction of the height of the base char:
1696 // Eg: line end.....'
1697 // The quote is a long way from the end of the line, yet it needs to be a
1698 // diacritic. To determine that the quote is not part of an image, or
1699 // a different text block, we check for other marks in the gap between
1700 // the base char and the diacritic.
1701 // '<--Diacritic
1702 // |---------|
1703 // | |<-toobig-gap->
1704 // | Base |<ok gap>
1705 // |---------| x<-----Dot occupying gap
1706 // The grid is const really.
1707 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1708  const TBOX& diacritic_box,
1709  const TBOX& base_box) {
1710  // Since most gaps are small, use an iterative algorithm to search the gap.
1711  int max_gap = IntCastRounded(base_box.height() *
1713  TBOX occupied_box(base_box);
1714  int diacritic_gap;
1715  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1716  TBOX search_box(occupied_box);
1717  if (diacritic_box.left() > search_box.right()) {
1718  // We are looking right.
1719  search_box.set_left(search_box.right());
1720  search_box.set_right(search_box.left() + max_gap);
1721  } else {
1722  // We are looking left.
1723  search_box.set_right(search_box.left());
1724  search_box.set_left(search_box.left() - max_gap);
1725  }
1726  BlobGridSearch rsearch(grid);
1727  rsearch.StartRectSearch(search_box);
1728  BLOBNBOX* neighbour;
1729  while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1730  const TBOX& nbox = neighbour->bounding_box();
1731  if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1732  if (nbox.left() < occupied_box.left())
1733  occupied_box.set_left(nbox.left());
1734  if (nbox.right() > occupied_box.right())
1735  occupied_box.set_right(nbox.right());
1736  break;
1737  }
1738  }
1739  if (neighbour == NULL)
1740  return false; // Found a big gap.
1741  }
1742  return true; // The gap was filled.
1743 }
1744 
1745 // Merges diacritics with the ColPartition of the base character blob.
1746 void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1747  ColPartitionGrid* part_grid) {
1748  BLOBNBOX_IT small_it(&block->noise_blobs);
1749  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1750  BLOBNBOX* blob = small_it.data();
1751  if (blob->base_char_blob() != NULL) {
1752  ColPartition* part = blob->base_char_blob()->owner();
1753  // The base character must be owned by a partition and that partition
1754  // must not be on the big_parts list (not block owned).
1755  if (part != NULL && !part->block_owned() && blob->owner() == NULL &&
1756  blob->IsDiacritic()) {
1757  // The partition has to be removed from the grid and reinserted
1758  // because its bounding box may change.
1759  part_grid->RemoveBBox(part);
1760  part->AddBox(blob);
1761  blob->set_region_type(part->blob_type());
1762  blob->set_flow(part->flow());
1763  blob->set_owner(part);
1764  part_grid->InsertBBox(true, true, part);
1765  }
1766  // Set all base chars to NULL before any blobs get deleted.
1767  blob->set_base_char_blob(NULL);
1768  }
1769  }
1770 }
1771 
1772 // Any blobs on the large_blobs list of block that are still unowned by a
1773 // ColPartition, are probably drop-cap or vertically touching so the blobs
1774 // are removed to the big_parts list and treated separately.
1775 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1776  ColPartitionGrid* part_grid,
1777  ColPartition_LIST* big_parts) {
1778  BLOBNBOX_IT large_it(&block->large_blobs);
1779  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1780  BLOBNBOX* blob = large_it.data();
1781  ColPartition* big_part = blob->owner();
1782  if (big_part == NULL) {
1783  // Large blobs should have gone into partitions by now if they are
1784  // genuine characters, so move any unowned ones out to the big parts
1785  // list. This will include drop caps and vertically touching characters.
1786  ColPartition::MakeBigPartition(blob, big_parts);
1787  }
1788  }
1789 }
1790 
1791 // All remaining unused blobs are put in individual ColPartitions.
1792 void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode,
1793  ColPartitionGrid* part_grid) {
1794  BlobGridSearch gsearch(this);
1795  BLOBNBOX* bbox;
1796  int prev_grid_x = -1;
1797  int prev_grid_y = -1;
1798  BLOBNBOX_CLIST cell_list;
1799  BLOBNBOX_C_IT cell_it(&cell_list);
1800  bool cell_all_noise = true;
1801  gsearch.StartFullSearch();
1802  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1803  int grid_x = gsearch.GridX();
1804  int grid_y = gsearch.GridY();
1805  if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1806  // New cell. Process old cell.
1807  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1808  &cell_list);
1809  cell_it.set_to_list(&cell_list);
1810  prev_grid_x = grid_x;
1811  prev_grid_y = grid_y;
1812  cell_all_noise = true;
1813  }
1814  if (bbox->owner() == NULL) {
1815  cell_it.add_to_end(bbox);
1816  if (bbox->flow() != BTFT_NONTEXT)
1817  cell_all_noise = false;
1818  } else {
1819  cell_all_noise = false;
1820  }
1821  }
1822  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1823  &cell_list);
1824 }
1825 
1826 // If combine, put all blobs in the cell_list into a single partition, otherwise
1827 // put each one into its own partition.
1828 void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode,
1829  bool combine,
1830  ColPartitionGrid* part_grid,
1831  BLOBNBOX_CLIST* cell_list) {
1832  if (cell_list->empty())
1833  return;
1834  BLOBNBOX_C_IT cell_it(cell_list);
1835  if (combine) {
1836  BLOBNBOX* bbox = cell_it.extract();
1837  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1838  part->AddBox(bbox);
1839  part->set_flow(bbox->flow());
1840  for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1841  part->AddBox(cell_it.extract());
1842  }
1843  CompletePartition(pageseg_mode, part, part_grid);
1844  } else {
1845  for (; !cell_it.empty(); cell_it.forward()) {
1846  BLOBNBOX* bbox = cell_it.extract();
1847  ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1848  part->set_flow(bbox->flow());
1849  part->AddBox(bbox);
1850  CompletePartition(pageseg_mode, part, part_grid);
1851  }
1852  }
1853 }
1854 
1855 // Helper function to finish setting up a ColPartition and insert into
1856 // part_grid.
1857 void StrokeWidth::CompletePartition(PageSegMode pageseg_mode,
1858  ColPartition* part,
1859  ColPartitionGrid* part_grid) {
1860  part->ComputeLimits();
1861  TBOX box = part->bounding_box();
1862  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1863  box.bottom());
1864  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1865  // Override value if pageseg_mode disagrees.
1866  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1867  value = part->boxes_count() == 1 ? 0 : -2;
1868  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1869  value = part->boxes_count() == 1 ? 0 : 2;
1870  }
1871  part->SetRegionAndFlowTypesFromProjectionValue(value);
1872  part->ClaimBoxes();
1873  part_grid->InsertBBox(true, true, part);
1874 }
1875 
1876 // Merge partitions where the merge appears harmless.
1877 // As this
1878 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1879  part_grid->Merges(
1880  NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
1881  NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
1882 }
1883 
1884 // Compute a search box based on the orientation of the partition.
1885 // Returns true if a suitable box can be calculated.
1886 // Callback for EasyMerges.
1887 bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1888  if (part->IsVerticalType()) {
1889  box->set_top(box->top() + box->width());
1890  box->set_bottom(box->bottom() - box->width());
1891  } else {
1892  box->set_left(box->left() - box->height());
1893  box->set_right(box->right() + box->height());
1894  }
1895  return true;
1896 }
1897 
1898 // Merge confirmation callback for EasyMerges.
1899 bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1900  const ColPartition* p2) {
1901  ASSERT_HOST(p1 != NULL && p2 != NULL);
1902  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1903  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1904  (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1905  return false; // Don't merge confirmed image with text.
1906  if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1907  p1->HCoreOverlap(*p2) <= 0 &&
1908  ((!p1->IsSingleton() &&
1909  !p2->IsSingleton()) ||
1910  !p1->bounding_box().major_overlap(p2->bounding_box())))
1911  return false; // Overlap must be in the text line.
1912  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1913  p1->VCoreOverlap(*p2) <= 0 &&
1914  ((!p1->IsSingleton() &&
1915  !p2->IsSingleton()) ||
1916  (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1917  !p1->OKDiacriticMerge(*p2, false) &&
1918  !p2->OKDiacriticMerge(*p1, false))))
1919  return false; // Overlap must be in the text line.
1920  if (!p1->ConfirmNoTabViolation(*p2))
1921  return false;
1922  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1923  return true;
1924  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1925 }
1926 
1927 // Returns true if there is no significant noise in between the boxes.
1928 bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1929  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1930  nontext_map_);
1931 }
1932 
1936 ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
1937  int x, int y) {
1938  ScrollView* window = NULL;
1939 #ifndef GRAPHICS_DISABLED
1940  window = MakeWindow(x, y, window_name);
1941  // For every blob in the grid, display it.
1942  window->Brush(ScrollView::NONE);
1943 
1944  // For every bbox in the grid, display it.
1945  BlobGridSearch gsearch(this);
1946  gsearch.StartFullSearch();
1947  BLOBNBOX* bbox;
1948  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1949  TBOX box = bbox->bounding_box();
1950  int left_x = box.left();
1951  int right_x = box.right();
1952  int top_y = box.top();
1953  int bottom_y = box.bottom();
1954  int goodness = bbox->GoodTextBlob();
1955  BlobRegionType blob_type = bbox->region_type();
1956  if (bbox->UniquelyVertical())
1957  blob_type = BRT_VERT_TEXT;
1958  if (bbox->UniquelyHorizontal())
1959  blob_type = BRT_TEXT;
1960  BlobTextFlowType flow = bbox->flow();
1961  if (flow == BTFT_NONE) {
1962  if (goodness == 0)
1963  flow = BTFT_NEIGHBOURS;
1964  else if (goodness == 1)
1965  flow = BTFT_CHAIN;
1966  else
1967  flow = BTFT_STRONG_CHAIN;
1968  }
1969  window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1970  window->Rectangle(left_x, bottom_y, right_x, top_y);
1971  }
1972  window->Update();
1973 #endif
1974  return window;
1975 }
1976 
1977 static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
1978 #ifndef GRAPHICS_DISABLED
1979  const TBOX& blob_box(blob->bounding_box());
1980  int top = MAX(blob_box.top(), blob->base_char_top());
1981  int bottom = MIN(blob_box.bottom(), blob->base_char_bottom());
1982  int x = (blob_box.left() + blob_box.right()) / 2;
1983  window->Line(x, top, x, bottom);
1984 #endif // GRAPHICS_DISABLED
1985 }
1986 
1987 // Displays blobs colored according to whether or not they are diacritics.
1988 ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1989  int x, int y, TO_BLOCK* block) {
1990  ScrollView* window = NULL;
1991 #ifndef GRAPHICS_DISABLED
1992  window = MakeWindow(x, y, window_name);
1993  // For every blob in the grid, display it.
1994  window->Brush(ScrollView::NONE);
1995 
1996  BLOBNBOX_IT it(&block->blobs);
1997  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1998  BLOBNBOX* blob = it.data();
1999  if (blob->IsDiacritic()) {
2000  window->Pen(ScrollView::GREEN);
2001  DrawDiacriticJoiner(blob, window);
2002  } else {
2003  window->Pen(blob->BoxColor());
2004  }
2005  const TBOX& box = blob->bounding_box();
2006  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2007  }
2008  it.set_to_list(&block->noise_blobs);
2009  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2010  BLOBNBOX* blob = it.data();
2011  if (blob->IsDiacritic()) {
2012  window->Pen(ScrollView::GREEN);
2013  DrawDiacriticJoiner(blob, window);
2014  } else {
2015  window->Pen(ScrollView::WHITE);
2016  }
2017  const TBOX& box = blob->bounding_box();
2018  window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2019  }
2020  window->Update();
2021 #endif
2022  return window;
2023 }
2024 
2025 } // namespace tesseract.
const double kShapePerimeterRatio
virtual void HandleClick(int x, int y)
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:193
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:31
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void Pen(Color color)
Definition: scrollview.cpp:726
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void set_leader_on_right(bool flag)
Definition: blobbox.h:352
const int kSearchRadius
Definition: strokewidth.cpp:88
bool IsVerticalType() const
Definition: colpartition.h:435
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
static bool WithinTestRegion(int detail_level, int x, int y)
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:437
BlobRegionType
Definition: blobbox.h:57
#define MAX(x, y)
Definition: ndminx.h:24
const double kLineResidueSizeRatio
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:91
float x() const
Definition: points.h:209
BlobTextFlowType
Definition: blobbox.h:99
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:415
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:429
bool leader_on_left() const
Definition: blobbox.h:343
bool joined_to_prev() const
Definition: blobbox.h:241
static void Update()
Definition: scrollview.cpp:715
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:435
void ClearNeighbours()
Definition: blobbox.h:494
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:340
bool horz_possible() const
Definition: blobbox.h:292
const int kLineTrapLongest
Definition: strokewidth.cpp:93
Definition: statistc.h:33
void set_right(int x)
Definition: rect.h:78
void print() const
Definition: rect.h:270
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Pix *nontext_map)
#define BOOL_VAR(name, val, comment)
Definition: params.h:280
const double kNoiseOverlapGrowthFactor
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:361
void AddBox(BLOBNBOX *box)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
int median_size() const
Definition: colpartition.h:136
void set_left(int x)
Definition: rect.h:71
bool textord_tabfind_only_strokewidths
Definition: strokewidth.cpp:45
const float kSizeRatioToReject
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:77
const int kLineResiduePadRatio
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:34
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
void StartFullSearch()
Definition: bbgrid.h:668
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:174
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:69
void pad(int xpad, int ypad)
Definition: rect.h:127
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:771
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
const double kStrokeWidthFractionTolerance
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
BlobNeighbourDir
Definition: blobbox.h:72
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:86
inT32 area() const
Definition: rect.h:118
void DeleteUnownedNoise()
Definition: blobbox.cpp:1031
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
int textord_debug_tabfind
Definition: alignedblob.cpp:27
const int kMaxLargeOverlaps
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:71
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:658
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:701
void set_bottom(int y)
Definition: rect.h:64
inT16 left() const
Definition: rect.h:68
const double kNoiseOverlapAreaFactor
void set_x(float xin)
rewrite function
Definition: points.h:216
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:326
float area_stroke_width() const
Definition: blobbox.h:334
int gridsize() const
Definition: bbgrid.h:63
int y_gap(const TBOX &box) const
Definition: rect.h:225
BlobRegionType region_type() const
Definition: blobbox.h:268
C_BLOB * cblob() const
Definition: blobbox.h:253
void Brush(Color color)
Definition: scrollview.cpp:732
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449
void set_y(float yin)
rewrite function
Definition: points.h:220
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:390
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:245
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int textord_tabfind_show_strokewidths
Definition: strokewidth.cpp:44
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:83
const ICOORD & bleft() const
Definition: bbgrid.h:72
void set_leader_on_left(bool flag)
Definition: blobbox.h:346
Assume a single column of text of variable sizes.
Definition: publictypes.h:157
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:96
#define INT_VAR(name, val, comment)
Definition: params.h:277
void compute_bounding_box()
Definition: blobbox.h:225
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:98
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:61
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:74
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
integer coordinate
Definition: points.h:30
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:383
inT16 bottom() const
Definition: rect.h:61
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:285
tesseract::ColPartition * owner() const
Definition: blobbox.h:337
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:64
inT16 height() const
Definition: rect.h:104
void RemoveLineResidue(ColPartition_LIST *big_part_list)
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:358
inT16 width() const
Definition: rect.h:111
PartitionFindResult
Definition: strokewidth.h:46
BBC * NextFullSearch()
Definition: bbgrid.h:678
ScrollView * MakeWindow(int x, int y, const char *window_name)
void set_horz_possible(bool value)
Definition: blobbox.h:295
int x_gap(const TBOX &box) const
Definition: rect.h:217
float horz_stroke_width() const
Definition: blobbox.h:322
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
bool vert_possible() const
Definition: blobbox.h:286
const int kLineTrapShortest
Definition: strokewidth.cpp:95
int IntCastRounded(double x)
Definition: helpers.h:172
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:355
BBC * NextRadSearch()
Definition: bbgrid.h:716
bool UniquelyHorizontal() const
Definition: blobbox.h:398
Definition: rect.h:30
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:271
void set_vert_possible(bool value)
Definition: blobbox.h:289
float y() const
Definition: points.h:212
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
bool UniquelyVertical() const
Definition: blobbox.h:395
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
float vert_stroke_width() const
Definition: blobbox.h:328
bool contains(const FCOORD pt) const
Definition: rect.h:323
const ICOORD & tright() const
Definition: bbgrid.h:75
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:53
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:298
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:67
const double kNeighbourSearchFactor
#define NULL
Definition: host.h:144
bool IsDiacritic() const
Definition: blobbox.h:365
const TBOX & bounding_box() const
Definition: blobbox.h:215
void set_top(int y)
Definition: rect.h:57
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:55
bool leader_on_right() const
Definition: blobbox.h:349
int GoodTextBlob() const
Definition: blobbox.cpp:219
const double kStrokeWidthCJK
Definition: strokewidth.cpp:56
const double kLineResidueAspectRatio
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
inT16 top() const
Definition: rect.h:54
int base_char_bottom() const
Definition: blobbox.h:371
BlobTextFlowType flow() const
Definition: blobbox.h:280
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:552
inT32 area()
Definition: stepblob.cpp:270
inT32 perimeter()
Definition: stepblob.cpp:289
Definition: points.h:189
const double kCJKAspectRatio
Definition: strokewidth.cpp:65
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:80
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:474
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
bool overlap(const TBOX &box) const
Definition: rect.h:345
const int kCJKRadius
Definition: strokewidth.cpp:59
void set_owns_cblob(bool value)
Definition: blobbox.h:393
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Pix *nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
int base_char_top() const
Definition: blobbox.h:368
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:387
const int kCJKMaxComponents
Definition: strokewidth.cpp:63
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
int right_rule() const
Definition: blobbox.h:304
BLOBNBOX_LIST blobs
Definition: blobbox.h:768