tesseract  4.0.0-1-g2a2b
tesseract::EquationDetect Class Reference

#include <equationdetect.h>

Inheritance diagram for tesseract::EquationDetect:
tesseract::EquationDetectBase

Public Types

enum  IndentType {
  NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT,
  INDENT_TYPE_COUNT
}
 

Public Member Functions

 EquationDetect (const char *equ_datapath, const char *equ_language)
 
 ~EquationDetect ()
 
void SetLangTesseract (Tesseract *lang_tesseract)
 
int LabelSpecialText (TO_BLOCK *to_block)
 
int FindEquationParts (ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
 
void SetResolution (const int resolution)
 
- Public Member Functions inherited from tesseract::EquationDetectBase
 EquationDetectBase ()=default
 
virtual ~EquationDetectBase ()
 

Protected Member Functions

void IdentifySpecialText (BLOBNBOX *blob, const int height_th)
 
BlobSpecialTextType EstimateTypeForUnichar (const UNICHARSET &unicharset, const UNICHAR_ID id) const
 
void IdentifySpecialText ()
 
void IdentifyBlobsToSkip (ColPartition *part)
 
void MergePartsByLocation ()
 
void SearchByOverlap (ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
 
void InsertPartAfterAbsorb (ColPartition *part)
 
void IdentifySeedParts ()
 
bool CheckSeedBlobsCount (ColPartition *part)
 
float ComputeForegroundDensity (const TBOX &tbox)
 
bool CheckForSeed2 (const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
 
int CountAlignment (const GenericVector< int > &sorted_vec, const int val) const
 
bool CheckSeedFgDensity (const float density_th, ColPartition *part)
 
void SplitCPHorLite (ColPartition *part, GenericVector< TBOX > *splitted_boxes)
 
void SplitCPHor (ColPartition *part, GenericVector< ColPartition *> *parts_splitted)
 
bool CheckSeedDensity (const float math_density_high, const float math_density_low, const ColPartition *part) const
 
IndentType IsIndented (ColPartition *part)
 
void IdentifyInlineParts ()
 
void ComputeCPsSuperBBox ()
 
void IdentifyInlinePartsHorizontal ()
 
int EstimateTextPartLineSpacing ()
 
void IdentifyInlinePartsVertical (const bool top_to_bottom, const int textPartsLineSpacing)
 
bool IsInline (const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
 
bool ExpandSeed (ColPartition *seed)
 
void ExpandSeedHorizontal (const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
void ExpandSeedVertical (const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
bool IsNearSmallNeighbor (const TBOX &seed_box, const TBOX &part_box) const
 
bool CheckSeedNeighborDensity (const ColPartition *part) const
 
void ProcessMathBlockSatelliteParts ()
 
bool IsMathBlockSatellite (ColPartition *part, GenericVector< ColPartition *> *math_blocks)
 
ColPartition * SearchNNVertical (const bool search_bottom, const ColPartition *part)
 
bool IsNearMathNeighbor (const int y_gap, const ColPartition *neighbor) const
 
void GetOutputTiffName (const char *name, STRING *image_name) const
 
void PaintColParts (const STRING &outfile) const
 
void PaintSpecialTexts (const STRING &outfile) const
 
void PrintSpecialBlobsDensity (const ColPartition *part) const
 

Protected Attributes

Tesseract equ_tesseract_
 
Tesseract * lang_tesseract_
 
ColPartitionGrid * part_grid_
 
ColPartitionSet ** best_columns_
 
TBOX * cps_super_bbox_
 
GenericVector< ColPartition * > cp_seeds_
 
int resolution_
 
int page_count_
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::EquationDetectBase
static void RenderSpecialText (Pix *pix, BLOBNBOX *blob)
 

Detailed Description

Definition at line 39 of file equationdetect.h.

Member Enumeration Documentation

◆ IndentType

Constructor & Destructor Documentation

◆ EquationDetect()

tesseract::EquationDetect::EquationDetect ( const char *  equ_datapath,
const char *  equ_language 
)

Definition at line 103 of file equationdetect.cpp.

104  {
105  const char* default_name = "equ";
106  if (equ_name == nullptr) {
107  equ_name = default_name;
108  }
109  lang_tesseract_ = nullptr;
110  resolution_ = 0;
111  page_count_ = 0;
112 
113  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
115  tprintf("Warning: equation region detection requested,"
116  " but %s failed to load from %s\n", equ_name, equ_datapath);
117  }
118 
119  cps_super_bbox_ = nullptr;
120 }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:296
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ ~EquationDetect()

tesseract::EquationDetect::~EquationDetect ( )

Definition at line 122 of file equationdetect.cpp.

122 { delete (cps_super_bbox_); }

Member Function Documentation

◆ CheckForSeed2()

bool tesseract::EquationDetect::CheckForSeed2 ( const GenericVector< int > &  indented_texts_left,
const float  foreground_density_th,
ColPartition *  part 
)
protected

Definition at line 738 of file equationdetect.cpp.

741  {
742  ASSERT_HOST(part);
743  const TBOX& box = part->bounding_box();
744 
745  // Check if it is aligned with any indented_texts_left.
746  if (!indented_texts_left.empty() &&
747  CountAlignment(indented_texts_left, box.left()) >=
748  kLeftIndentAlignmentCountTh) {
749  return false;
750  }
751 
752  // Check the foreground density.
753  if (ComputeForegroundDensity(box) > foreground_density_th) {
754  return false;
755  }
756 
757  return true;
758 }
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
float ComputeForegroundDensity(const TBOX &tbox)
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
bool empty() const
Definition: genericvector.h:90
const int kLeftIndentAlignmentCountTh
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedBlobsCount()

bool tesseract::EquationDetect::CheckSeedBlobsCount ( ColPartition *  part)
protected

Definition at line 984 of file equationdetect.cpp.

984  {
985  if (!part) {
986  return false;
987  }
988  const int kSeedMathBlobsCount = 2;
989  const int kSeedMathDigitBlobsCount = 5;
990 
991  const int blobs = part->boxes_count(),
992  math_blobs = part->SpecialBlobsCount(BSTT_MATH),
993  digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
994  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
995  math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
996  return false;
997  }
998 
999  return true;
1000 }
const int kSeedBlobsCountTh

◆ CheckSeedDensity()

bool tesseract::EquationDetect::CheckSeedDensity ( const float  math_density_high,
const float  math_density_low,
const ColPartition *  part 
) const
protected

Definition at line 1002 of file equationdetect.cpp.

1005  {
1006  ASSERT_HOST(part);
1007  float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
1008  + part->SpecialBlobsDensity(BSTT_DIGIT);
1009  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
1010  if (math_digit_density > math_density_high) {
1011  return true;
1012  }
1013  if (math_digit_density + italic_density > kMathItalicDensityTh &&
1014  math_digit_density > math_density_low) {
1015  return true;
1016  }
1017 
1018  return false;
1019 }
const float kMathItalicDensityTh
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedFgDensity()

bool tesseract::EquationDetect::CheckSeedFgDensity ( const float  density_th,
ColPartition *  part 
)
protected

Definition at line 626 of file equationdetect.cpp.

627  {
628  ASSERT_HOST(part);
629 
630  // Split part horizontally, and check for each sub part.
631  GenericVector<TBOX> sub_boxes;
632  SplitCPHorLite(part, &sub_boxes);
633  float parts_passed = 0.0;
634  for (int i = 0; i < sub_boxes.size(); ++i) {
635  const float density = ComputeForegroundDensity(sub_boxes[i]);
636  if (density < density_th) {
637  parts_passed++;
638  }
639  }
640 
641  // If most sub parts passed, then we return true.
642  const float kSeedPartRatioTh = 0.3;
643  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
644 
645  return retval;
646 }
int size() const
Definition: genericvector.h:71
float ComputeForegroundDensity(const TBOX &tbox)
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedNeighborDensity()

bool tesseract::EquationDetect::CheckSeedNeighborDensity ( const ColPartition *  part) const
protected

Definition at line 1293 of file equationdetect.cpp.

1293  {
1294  ASSERT_HOST(part);
1295  if (part->boxes_count() < kSeedBlobsCountTh) {
1296  // Too few blobs, skip the check.
1297  return true;
1298  }
1299 
1300  // We check the math blobs density and the unclear blobs density.
1301  if (part->SpecialBlobsDensity(BSTT_MATH) +
1302  part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
1303  part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {
1304  return true;
1305  }
1306 
1307  return false;
1308 }
const int kSeedBlobsCountTh
const float kMathDigitDensityTh1
const float kUnclearDensityTh
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ComputeCPsSuperBBox()

void tesseract::EquationDetect::ComputeCPsSuperBBox ( )
protected

Definition at line 792 of file equationdetect.cpp.

792  {
793  ColPartitionGridSearch gsearch(part_grid_);
794  ColPartition *part = nullptr;
795  gsearch.StartFullSearch();
796  delete cps_super_bbox_;
797  cps_super_bbox_ = new TBOX();
798  while ((part = gsearch.NextFullSearch()) != nullptr) {
799  (*cps_super_bbox_) += part->bounding_box();
800  }
801 }
Definition: rect.h:34
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
ColPartitionGrid * part_grid_

◆ ComputeForegroundDensity()

float tesseract::EquationDetect::ComputeForegroundDensity ( const TBOX &  tbox)
protected

Definition at line 612 of file equationdetect.cpp.

612  {
613  Pix *pix_bi = lang_tesseract_->pix_binary();
614  const int pix_height = pixGetHeight(pix_bi);
615  Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
616  tbox.width(), tbox.height());
617  Pix *pix_sub = pixClipRectangle(pix_bi, box, nullptr);
618  l_float32 fract;
619  pixForegroundFraction(pix_sub, &fract);
620  pixDestroy(&pix_sub);
621  boxDestroy(&box);
622 
623  return fract;
624 }
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
Pix * pix_binary() const
int16_t height() const
Definition: rect.h:108

◆ CountAlignment()

int tesseract::EquationDetect::CountAlignment ( const GenericVector< int > &  sorted_vec,
const int  val 
) const
protected

Definition at line 760 of file equationdetect.cpp.

761  {
762  if (sorted_vec.empty()) {
763  return 0;
764  }
765  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
766  const int pos = sorted_vec.binary_search(val);
767  int count = 0;
768 
769  // Search left side.
770  int index = pos;
771  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
772  count++;
773  }
774 
775  // Search right side.
776  index = pos + 1;
777  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
778  count++;
779  }
780 
781  return count;
782 }
int size() const
Definition: genericvector.h:71
int count(LIST var_list)
Definition: oldlist.cpp:98
bool empty() const
Definition: genericvector.h:90
int binary_search(const T &target) const

◆ EstimateTextPartLineSpacing()

int tesseract::EquationDetect::EstimateTextPartLineSpacing ( )
protected

Definition at line 868 of file equationdetect.cpp.

868  {
869  ColPartitionGridSearch gsearch(part_grid_);
870 
871  // Get the y gap between text partitions;
872  ColPartition *current = nullptr, *prev = nullptr;
873  gsearch.StartFullSearch();
874  GenericVector<int> ygaps;
875  while ((current = gsearch.NextFullSearch()) != nullptr) {
876  if (!PTIsTextType(current->type())) {
877  continue;
878  }
879  if (prev != nullptr) {
880  const TBOX &current_box = current->bounding_box();
881  const TBOX &prev_box = prev->bounding_box();
882  // prev and current should be x major overlap and non y overlap.
883  if (current_box.major_x_overlap(prev_box) &&
884  !current_box.y_overlap(prev_box)) {
885  int gap = current_box.y_gap(prev_box);
886  if (gap < std::min(current_box.height(), prev_box.height())) {
887  // The gap should be smaller than the height of the bounding boxes.
888  ygaps.push_back(gap);
889  }
890  }
891  }
892  prev = current;
893  }
894 
895  if (ygaps.size() < 8) { // We do not have enough data.
896  return -1;
897  }
898 
899  // Compute the line spacing from ygaps: use the mean of the first half.
900  ygaps.sort();
901  int spacing = 0, count;
902  for (count = 0; count < ygaps.size() / 2; count++) {
903  spacing += ygaps[count];
904  }
905  return spacing / count;
906 }
int size() const
Definition: genericvector.h:71
int y_gap(const TBOX &box) const
Definition: rect.h:233
int count(LIST var_list)
Definition: oldlist.cpp:98
Definition: rect.h:34
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
ColPartitionGrid * part_grid_
int push_back(T object)
bool y_overlap(const TBOX &box) const
Definition: rect.h:428
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
int16_t height() const
Definition: rect.h:108

◆ EstimateTypeForUnichar()

BlobSpecialTextType tesseract::EquationDetect::EstimateTypeForUnichar ( const UNICHARSET &  unicharset,
const UNICHAR_ID  id 
) const
protected

Definition at line 225 of file equationdetect.cpp.

226  {
227  const STRING s = unicharset.id_to_unichar(id);
228  if (unicharset.get_isalpha(id)) {
229  return BSTT_NONE;
230  }
231 
232  if (unicharset.get_ispunctuation(id)) {
233  // Exclude some special texts that are likely to be confused as math symbol.
234  static GenericVector<UNICHAR_ID> ids_to_exclude;
235  if (ids_to_exclude.empty()) {
236  static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
237  "〈", "〉", "《", "》", "」", "「", ""};
238  int i = 0;
239  while (kCharsToEx[i] != "") {
240  ids_to_exclude.push_back(
241  unicharset.unichar_to_id(kCharsToEx[i++].string()));
242  }
243  ids_to_exclude.sort();
244  }
245  return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
246  }
247 
248  // Check if it is digit. In addition to the isdigit attribute, we also check
249  // if this character belongs to those likely to be confused with a digit.
250  static const STRING kDigitsChars = "|";
251  if (unicharset.get_isdigit(id) ||
252  (s.length() == 1 && kDigitsChars.contains(s[0]))) {
253  return BSTT_DIGIT;
254  } else {
255  return BSTT_MATH;
256  }
257 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
bool bool_binary_search(const T &target) const
bool empty() const
Definition: genericvector.h:90
int push_back(T object)
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
int32_t length() const
Definition: strngs.cpp:191

◆ ExpandSeed()

bool tesseract::EquationDetect::ExpandSeed ( ColPartition *  seed)
protected

Definition at line 1085 of file equationdetect.cpp.

1085  {
1086  if (seed == nullptr || // This seed has been absorbed by other seeds.
1087  seed->IsVerticalType()) { // We skip vertical type right now.
1088  return false;
1089  }
1090 
1091  // Expand in four directions.
1092  GenericVector<ColPartition*> parts_to_merge;
1093  ExpandSeedHorizontal(true, seed, &parts_to_merge);
1094  ExpandSeedHorizontal(false, seed, &parts_to_merge);
1095  ExpandSeedVertical(true, seed, &parts_to_merge);
1096  ExpandSeedVertical(false, seed, &parts_to_merge);
1097  SearchByOverlap(seed, &parts_to_merge);
1098 
1099  if (parts_to_merge.empty()) { // We don't find any partition to merge.
1100  return false;
1101  }
1102 
1103  // Merge all partitions in parts_to_merge with seed. We first remove seed
1104  // from part_grid_ as its bounding box is going to expand. Then we add it
1105  // back after it absorbs all parts_to_merge partitions.
1106  part_grid_->RemoveBBox(seed);
1107  for (int i = 0; i < parts_to_merge.size(); ++i) {
1108  ColPartition* part = parts_to_merge[i];
1109  if (part->type() == PT_EQUATION) {
1110  // If part is in cp_seeds_, then we mark it as nullptr so that we won't
1111  // process it again.
1112  for (int j = 0; j < cp_seeds_.size(); ++j) {
1113  if (part == cp_seeds_[j]) {
1114  cp_seeds_[j] = nullptr;
1115  break;
1116  }
1117  }
1118  }
1119 
1120  // part has already been removed from part_grid_ in function
1121  // ExpandSeedHorizontal/ExpandSeedVertical.
1122  seed->Absorb(part, nullptr);
1123  }
1124 
1125  return true;
1126 }
int size() const
Definition: genericvector.h:71
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:535
bool empty() const
Definition: genericvector.h:90
ColPartitionGrid * part_grid_
GenericVector< ColPartition * > cp_seeds_
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)

◆ ExpandSeedHorizontal()

void tesseract::EquationDetect::ExpandSeedHorizontal ( const bool  search_left,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1128 of file equationdetect.cpp.

1131  {
1132  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
1133  const float kYOverlapTh = 0.6;
1134  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
1135 
1136  ColPartitionGridSearch search(part_grid_);
1137  const TBOX& seed_box(seed->bounding_box());
1138  const int x = search_left ? seed_box.left() : seed_box.right();
1139  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
1140  search.SetUniqueMode(true);
1141 
1142  // Search iteratively.
1143  ColPartition *part = nullptr;
1144  while ((part = search.NextSideSearch(search_left)) != nullptr) {
1145  if (part == seed) {
1146  continue;
1147  }
1148  const TBOX& part_box(part->bounding_box());
1149  if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.
1150  break;
1151  }
1152 
1153  // Check part location.
1154  if ((part_box.left() >= seed_box.left() && search_left) ||
1155  (part_box.right() <= seed_box.right() && !search_left)) {
1156  continue;
1157  }
1158 
1159  if (part->type() != PT_EQUATION) { // Non-equation type.
1160  // Skip PT_INLINE_EQUATION and non text type.
1161  if (part->type() == PT_INLINE_EQUATION ||
1162  (!IsTextOrEquationType(part->type()) &&
1163  part->blob_type() != BRT_HLINE)) {
1164  continue;
1165  }
1166  // For other types, it should be the near small neighbor of seed.
1167  if (!IsNearSmallNeighbor(seed_box, part_box) ||
1168  !CheckSeedNeighborDensity(part)) {
1169  continue;
1170  }
1171  } else { // Equation type, check the y overlap.
1172  if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1173  seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1174  continue;
1175  }
1176  }
1177 
1178  // Passed the check, delete it from search and add into parts_to_merge.
1179  search.RemoveBBox();
1180  parts_to_merge->push_back(part);
1181  }
1182 }
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
int push_back(T object)
bool CheckSeedNeighborDensity(const ColPartition *part) const
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ExpandSeedVertical()

void tesseract::EquationDetect::ExpandSeedVertical ( const bool  search_bottom,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1184 of file equationdetect.cpp.

1187  {
1188  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr &&
1189  cps_super_bbox_ != nullptr);
1190  const float kXOverlapTh = 0.4;
1191  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
1192 
1193  ColPartitionGridSearch search(part_grid_);
1194  const TBOX& seed_box(seed->bounding_box());
1195  const int y = search_bottom ? seed_box.bottom() : seed_box.top();
1196  search.StartVerticalSearch(
1197  cps_super_bbox_->left(), cps_super_bbox_->right(), y);
1198  search.SetUniqueMode(true);
1199 
1200  // Search iteratively.
1201  ColPartition *part = nullptr;
1202  GenericVector<ColPartition*> parts;
1203  int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
1204  while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
1205  if (part == seed) {
1206  continue;
1207  }
1208  const TBOX& part_box(part->bounding_box());
1209 
1210  if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.
1211  break;
1212  }
1213 
1214  // Check part location.
1215  if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1216  (part_box.top() <= seed_box.top() && !search_bottom)) {
1217  continue;
1218  }
1219 
1220  bool skip_part = false;
1221  if (part->type() != PT_EQUATION) { // Non-equation type.
1222  // Skip PT_INLINE_EQUATION and non text type.
1223  if (part->type() == PT_INLINE_EQUATION ||
1224  (!IsTextOrEquationType(part->type()) &&
1225  part->blob_type() != BRT_HLINE)) {
1226  skip_part = true;
1227  } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
1228  !CheckSeedNeighborDensity(part)) {
1229  // For other types, it should be the near small neighbor of seed.
1230  skip_part = true;
1231  }
1232  } else { // Equation type, check the x overlap.
1233  if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1234  seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1235  skip_part = true;
1236  }
1237  }
1238  if (skip_part) {
1239  if (part->type() != PT_EQUATION) {
1240  if (skipped_min_top > part_box.top()) {
1241  skipped_min_top = part_box.top();
1242  }
1243  if (skipped_max_bottom < part_box.bottom()) {
1244  skipped_max_bottom = part_box.bottom();
1245  }
1246  }
1247  } else {
1248  parts.push_back(part);
1249  }
1250  }
1251 
1252  // For every part in parts, we need verify it is not above skipped_min_top
1253  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
1254  // we will skip a part if it looks like:
1255  // search bottom | search top
1256  // seed: ****************** | part: **********
1257  // skipped: xxx | skipped: xxx
1258  // part: ********** | seed: ***********
1259  for (int i = 0; i < parts.size(); i++) {
1260  const TBOX& part_box(parts[i]->bounding_box());
1261  if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
1262  (!search_bottom && part_box.bottom() >= skipped_min_top)) {
1263  continue;
1264  }
1265  // Add parts[i] into parts_to_merge, and delete it from part_grid_.
1266  parts_to_merge->push_back(parts[i]);
1267  part_grid_->RemoveBBox(parts[i]);
1268  }
1269 }
int size() const
Definition: genericvector.h:71
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:535
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
int push_back(T object)
bool CheckSeedNeighborDensity(const ColPartition *part) const
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ FindEquationParts()

int tesseract::EquationDetect::FindEquationParts ( ColPartitionGrid *  part_grid,
ColPartitionSet **  best_columns 
)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 359 of file equationdetect.cpp.

360  {
361  if (!lang_tesseract_) {
362  tprintf("Warning: lang_tesseract_ is nullptr!\n");
363  return -1;
364  }
365  if (!part_grid || !best_columns) {
366  tprintf("part_grid/best_columns is nullptr!!\n");
367  return -1;
368  }
369  cp_seeds_.clear();
370  part_grid_ = part_grid;
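371  best_columns_ = best_columns;
372  resolution_ = lang_tesseract_->source_resolution();
373  STRING outfile;
374  page_count_++;
375 
376  if (equationdetect_save_bi_image) {
377  GetOutputTiffName("_bi", &outfile);
378  pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
379  }
380 
381  // Pass 0: Compute special text type for blobs.
382  IdentifySpecialText();
383 
384  // Pass 1: Merge parts by overlap.
385  MergePartsByLocation();
386 
387  // Pass 2: compute the math blob density and find the seed partition.
388  IdentifySeedParts();
389  // We still need separate seed into block seed and inline seed partition.
390  IdentifyInlineParts();
391 
392  if (equationdetect_save_seed_image) {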
393  GetOutputTiffName("_seed", &outfile);
394  PaintColParts(outfile);
395  }
396 
397  // Pass 3: expand block equation seeds.
398  while (!cp_seeds_.empty()) {
399  GenericVector<ColPartition*> seeds_expanded;
400  for (int i = 0; i < cp_seeds_.size(); ++i) {
401  if (ExpandSeed(cp_seeds_[i])) {
402  // If this seed is expanded, then we add it into seeds_expanded. Note
403  // this seed has been removed from part_grid_ if it is expanded.
404  seeds_expanded.push_back(cp_seeds_[i]);
405  }
406  }
407  // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
408  for (int i = 0; i < seeds_expanded.size(); ++i) {
409  InsertPartAfterAbsorb(seeds_expanded[i]);
410  }
411  cp_seeds_ = seeds_expanded;
412  }
413 
414  // Pass 4: find math block satellite text partitions and merge them.
415  ProcessMathBlockSatelliteParts();
416 
417  if (equationdetect_save_merged_image) { // For debug.
418  GetOutputTiffName("_merged", &outfile);
419  PaintColParts(outfile);
420  }
421 
422  return 0;
423 }
bool equationdetect_save_bi_image
int size() const
Definition: genericvector.h:71
void GetOutputTiffName(const char *name, STRING *image_name) const
const char * string() const
Definition: strngs.cpp:196
bool equationdetect_save_seed_image
void PaintColParts(const STRING &outfile) const
int source_resolution() const
Pix * pix_binary() const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool ExpandSeed(ColPartition *seed)
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_
bool equationdetect_save_merged_image
int push_back(T object)
GenericVector< ColPartition * > cp_seeds_
Definition: strngs.h:45
ColPartitionSet ** best_columns_

◆ GetOutputTiffName()

void tesseract::EquationDetect::GetOutputTiffName ( const char *  name,
STRING *  image_name 
) const
protected

Definition at line 1457 of file equationdetect.cpp.

1458  {
1459  ASSERT_HOST(image_name && name);
1460  char page[50];
1461  snprintf(page, sizeof(page), "%04d", page_count_);
1462  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
1463 }
STRING imagebasename
Definition: ccutil.h:65
Definition: strngs.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IdentifyBlobsToSkip()

void tesseract::EquationDetect::IdentifyBlobsToSkip ( ColPartition *  part)
protected

Definition at line 311 of file equationdetect.cpp.

311  {
312  ASSERT_HOST(part);
313  BLOBNBOX_C_IT blob_it(part->boxes());
314 
315  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
316  // At this moment, no blob should have been joined.
317  ASSERT_HOST(!blob_it.data()->joined_to_prev());
318  }
319  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
320  BLOBNBOX* blob = blob_it.data();
321  if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
322  continue;
323  }
324  TBOX blob_box = blob->bounding_box();
325 
326  // Search if any blob can be merged into blob. If found, then we mark all
327  // these blobs as BSTT_SKIP.
328  BLOBNBOX_C_IT blob_it2 = blob_it;
329  bool found = false;
330  while (!blob_it2.at_last()) {
331  BLOBNBOX* nextblob = blob_it2.forward();
332  const TBOX& nextblob_box = nextblob->bounding_box();
333  if (nextblob_box.left() >= blob_box.right()) {
334  break;
335  }
336  const float kWidthR = 0.4, kHeightR = 0.3;
337  const bool xoverlap = blob_box.major_x_overlap(nextblob_box),
338  yoverlap = blob_box.y_overlap(nextblob_box);
339  const float widthR = static_cast<float>(
340  std::min(nextblob_box.width(), blob_box.width())) /
341  std::max(nextblob_box.width(), blob_box.width());
342  const float heightR = static_cast<float>(
343  std::min(nextblob_box.height(), blob_box.height())) /
344  std::max(nextblob_box.height(), blob_box.height());
345 
346  if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
347  // Found one, set nextblob type and recompute blob_box.
348  found = true;
349  nextblob->set_special_text_type(BSTT_SKIP);
350  blob_box += nextblob_box;
351  }
352  }
353  if (found) {
354  blob->set_special_text_type(BSTT_SKIP);
355  }
356  }
357 }
Definition: rect.h:34
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
bool joined_to_prev() const
Definition: blobbox.h:257
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:290
bool y_overlap(const TBOX &box) const
Definition: rect.h:428
const TBOX & bounding_box() const
Definition: blobbox.h:231
int16_t right() const
Definition: rect.h:79
int16_t height() const
Definition: rect.h:108
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:293

◆ IdentifyInlineParts()

void tesseract::EquationDetect::IdentifyInlineParts ( )
protected

Definition at line 784 of file equationdetect.cpp.

784  {
785  ComputeCPsSuperBBox();
786  IdentifyInlinePartsHorizontal();
787  const int textparts_linespacing = EstimateTextPartLineSpacing();
788  IdentifyInlinePartsVertical(true, textparts_linespacing);
789  IdentifyInlinePartsVertical(false, textparts_linespacing);
790 }
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)

◆ IdentifyInlinePartsHorizontal()

void tesseract::EquationDetect::IdentifyInlinePartsHorizontal ( )
protected

Definition at line 803 of file equationdetect.cpp.

803  {
806  const int kMarginDiffTh = IntCastRounded(
808  const int kGapTh = static_cast<int>(roundf(
811  search.SetUniqueMode(true);
812  // The center x coordinate of the cp_super_bbox_.
813  const int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
814  for (int i = 0; i < cp_seeds_.size(); ++i) {
815  ColPartition* part = cp_seeds_[i];
816  const TBOX& part_box(part->bounding_box());
817  const int left_margin = part_box.left() - cps_super_bbox_->left(),
818  right_margin = cps_super_bbox_->right() - part_box.right();
819  bool right_to_left;
820  if (left_margin + kMarginDiffTh < right_margin &&
821  left_margin < kMarginDiffTh) {
822  // part is left aligned, so we search if it has any right neighbor.
823  search.StartSideSearch(
824  part_box.right(), part_box.top(), part_box.bottom());
825  right_to_left = false;
826  } else if (left_margin > cps_cx) {
827  // part locates on the right half on image, so search if it has any left
828  // neighbor.
829  search.StartSideSearch(
830  part_box.left(), part_box.top(), part_box.bottom());
831  right_to_left = true;
832  } else { // part is not an inline equation.
833  new_seeds.push_back(part);
834  continue;
835  }
836  ColPartition* neighbor = nullptr;
837  bool side_neighbor_found = false;
838  while ((neighbor = search.NextSideSearch(right_to_left)) != nullptr) {
839  const TBOX& neighbor_box(neighbor->bounding_box());
840  if (!IsTextOrEquationType(neighbor->type()) ||
841  part_box.x_gap(neighbor_box) > kGapTh ||
842  !part_box.major_y_overlap(neighbor_box) ||
843  part_box.major_x_overlap(neighbor_box)) {
844  continue;
845  }
846  // We have found one. Set the side_neighbor_found flag.
847  side_neighbor_found = true;
848  break;
849  }
850  if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.
851  part->set_type(PT_INLINE_EQUATION);
852  } else {
853  // Check the geometric feature of neighbor.
854  const TBOX& neighbor_box(neighbor->bounding_box());
855  if (neighbor_box.width() > part_box.width() &&
856  neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.
857  part->set_type(PT_INLINE_EQUATION);
858  } else { // part is not an inline equation type.
859  new_seeds.push_back(part);
860  }
861  }
862  }
863 
864  // Reset the cp_seeds_ using the new_seeds.
865  cp_seeds_ = new_seeds;
866 }
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int source_resolution() const
int IntCastRounded(double x)
Definition: helpers.h:168
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
int push_back(T object)
GenericVector< ColPartition * > cp_seeds_
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IdentifyInlinePartsVertical()

void tesseract::EquationDetect::IdentifyInlinePartsVertical ( const bool  top_to_bottom,
const int  textPartsLineSpacing 
)
protected

Definition at line 908 of file equationdetect.cpp.

// Re-sorts cp_seeds_ and retypes as PT_INLINE_EQUATION every seed for which
// IsInline() returns true; the remaining seeds are gathered in new_seeds,
// which replaces cp_seeds_ at the end. Does nothing when cp_seeds_ is empty.
// NOTE(review): source line 921 is absent from this listing; new_seeds is
// presumably declared there -- confirm against equationdetect.cpp.
// NOTE(review): the body reads `textparts_linespacing`, while the documented
// signature names the parameter `textPartsLineSpacing`; presumably the
// declaration and the definition just spell it differently -- confirm.
909  {
910  if (cp_seeds_.empty()) {
911  return;
912  }
913 
914  // Sort cp_seeds_.
915  if (top_to_bottom) { // From top to bottom.
916  cp_seeds_.sort(&SortCPByTopReverse);
917  } else { // From bottom to top.
918  cp_seeds_.sort(&SortCPByBottom);
919  }
920 
922  for (int i = 0; i < cp_seeds_.size(); ++i) {
923  ColPartition* part = cp_seeds_[i];
924  // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
925  // for its top neighbors, so that if two/more inline regions are connected
926  // to each other, then we will identify the top one, and then use it to
927  // identify the bottom one.
// IsInline()'s first argument is search_bottom, hence the negation.
928  if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
929  part->set_type(PT_INLINE_EQUATION);
930  } else {
931  new_seeds.push_back(part);
932  }
933  }
934  cp_seeds_ = new_seeds;
935 }
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
int push_back(T object)
GenericVector< ColPartition * > cp_seeds_

◆ IdentifySeedParts()

void tesseract::EquationDetect::IdentifySeedParts ( )
protected

Definition at line 540 of file equationdetect.cpp.

// Walks every partition returned by gsearch's full search and selects the
// equation seeds that end up in cp_seeds_.
//  * Partitions that are not text/equation type are ignored.
//  * Candidates are split into seeds1 and seeds2 (see the NOTE below for the
//    parts of those tests that are not visible here).
//  * Other partitions that are not right-indented and hold more than
//    kTextBlobsTh boxes are treated as text: their foreground density (and
//    their left coordinate when left-indented) is recorded.
//  * seeds1 entries become PT_EQUATION seeds or PT_INLINE_EQUATION; seeds2
//    entries become PT_EQUATION seeds when CheckForSeed2() accepts them.
// NOTE(review): source lines 541, 558, 565 and 595 are absent from this
// listing. Presumably 541 declares gsearch, 558/565 hold the
// CheckSeedDensity() tests and 595 the right-hand side of the
// CountAlignment() comparison -- confirm against equationdetect.cpp.
540  {
542  ColPartition *part = nullptr;
543  gsearch.StartFullSearch();
544 
545  GenericVector<ColPartition*> seeds1, seeds2;
546  // The left coordinates of indented text partitions.
547  GenericVector<int> indented_texts_left;
548  // The foreground density of text partitions.
549  GenericVector<float> texts_foreground_density;
550  while ((part = gsearch.NextFullSearch()) != nullptr) {
551  if (!IsTextOrEquationType(part->type())) {
552  continue;
553  }
554  part->ComputeSpecialBlobsDensity();
555  const bool blobs_check = CheckSeedBlobsCount(part);
556  const int kTextBlobsTh = 20;
557 
559  blobs_check) {
560  // Passed high density threshold test, save into seeds1.
561  seeds1.push_back(part);
562  } else {
563  IndentType indent = IsIndented(part);
564  if (IsLeftIndented(indent) && blobs_check &&
566  // Passed low density threshold test and is indented, save into seeds2.
567  seeds2.push_back(part);
568  } else if (!IsRightIndented(indent) &&
569  part->boxes_count() > kTextBlobsTh) {
570  // This is likely to be a text part, save the features.
571  const TBOX&box = part->bounding_box();
572  if (IsLeftIndented(indent)) {
573  indented_texts_left.push_back(box.left());
574  }
575  texts_foreground_density.push_back(ComputeForegroundDensity(box));
576  }
577  }
578  }
579 
580  // Sort the features collected from text regions.
581  indented_texts_left.sort();
582  texts_foreground_density.sort();
583  float foreground_density_th = 0.15; // Default value.
584  if (!texts_foreground_density.empty()) {
585  // Use the median of the texts_foreground_density.
// The threshold is 0.8 times the element at index size() / 2 of the sorted
// densities.
586  foreground_density_th = 0.8 * texts_foreground_density[
587  texts_foreground_density.size() / 2];
588  }
589 
// A seeds1 entry is promoted to PT_EQUATION only if it passes the foreground
// density check and is not a left-indented part whose left edge matches
// enough of the recorded indented text lines.
590  for (int i = 0; i < seeds1.size(); ++i) {
591  const TBOX& box = seeds1[i]->bounding_box();
592  if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
593  !(IsLeftIndented(IsIndented(seeds1[i])) &&
594  CountAlignment(indented_texts_left, box.left()) >=
596  // Mark as PT_EQUATION type.
597  seeds1[i]->set_type(PT_EQUATION);
598  cp_seeds_.push_back(seeds1[i]);
599  } else { // Mark as PT_INLINE_EQUATION type.
600  seeds1[i]->set_type(PT_INLINE_EQUATION);
601  }
602  }
603 
// seeds2 entries that fail CheckForSeed2() keep their current type.
604  for (int i = 0; i < seeds2.size(); ++i) {
605  if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
606  seeds2[i]->set_type(PT_EQUATION);
607  cp_seeds_.push_back(seeds2[i]);
608  }
609  }
610 }
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
int size() const
Definition: genericvector.h:71
bool IsRightIndented(const EquationDetect::IndentType type)
float ComputeForegroundDensity(const TBOX &tbox)
Definition: rect.h:34
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
bool IsTextOrEquationType(PolyBlockType type)
IndentType IsIndented(ColPartition *part)
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
const float kMathDigitDensityTh2
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
bool empty() const
Definition: genericvector.h:90
ColPartitionGrid * part_grid_
int push_back(T object)
GenericVector< ColPartition * > cp_seeds_
bool CheckSeedBlobsCount(ColPartition *part)
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
const int kLeftIndentAlignmentCountTh
bool IsLeftIndented(const EquationDetect::IndentType type)
const float kMathDigitDensityTh1

◆ IdentifySpecialText() [1/2]

void tesseract::EquationDetect::IdentifySpecialText ( BLOBNBOX * blob,
const int  height_th 
)
protected

Definition at line 152 of file equationdetect.cpp.

// Classifies a single blob and stores the result with
// set_special_text_type(). Blobs whose bounding-box height is below a positive
// height_th are set to BSTT_NONE without running any classifier. Otherwise the
// blob is normalized and classified by both equ_tesseract_ and
// lang_tesseract_, and the certainties of the two top choices decide between
// BSTT_UNCLEAR, BSTT_MATH and the type estimated from the language choice;
// a BSTT_NONE result is upgraded to BSTT_ITALIC when the chosen font is italic.
// NOTE(review): the documented signature calls the first parameter `blob`,
// but this body uses `blobnbox`; source line 200 is also absent from this
// listing and presumably declares `type` (initialised to BSTT_NONE?) --
// confirm against equationdetect.cpp.
153  {
154  ASSERT_HOST(blobnbox != nullptr);
155  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
156  // For small blob, we simply set to BSTT_NONE.
157  blobnbox->set_special_text_type(BSTT_NONE);
158  return;
159  }
160 
161  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
162  C_BLOB* blob = blobnbox->cblob();
163  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
164  // each classifier here, as they may require different PolygonalCopy.
// tblob is released manually at line 178.
165  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
166  const TBOX& box = tblob->bounding_box();
167 
168  // Normalize the blob. Set the origin to the place we want to be the
169  // bottom-middle, and scaling is to make the height the x-height.
// NOTE(review): box.height() is used as a divisor with no zero check visible
// here -- confirm a zero-height polygonal copy cannot reach this point.
170  const float scaling = static_cast<float>(kBlnXHeight) / box.height();
171  const float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
172  std::unique_ptr<TBLOB> normed_blob(new TBLOB(*tblob));
173  normed_blob->Normalize(nullptr, nullptr, nullptr, x_orig, y_orig, scaling, scaling,
174  0.0f, static_cast<float>(kBlnBaselineOffset),
175  false, nullptr);
176  equ_tesseract_.AdaptiveClassifier(normed_blob.get(), &ratings_equ);
177  lang_tesseract_->AdaptiveClassifier(normed_blob.get(), &ratings_lang);
178  delete tblob;
179 
180  // Get the best choice from ratings_lang and rating_equ. As the choice in the
181  // list has already been sorted by the certainty, we simply use the first
182  // choice.
183  BLOB_CHOICE *lang_choice = nullptr, *equ_choice = nullptr;
184  if (ratings_lang.length() > 0) {
185  BLOB_CHOICE_IT choice_it(&ratings_lang);
186  lang_choice = choice_it.data();
187  }
188  if (ratings_equ.length() > 0) {
189  BLOB_CHOICE_IT choice_it(&ratings_equ);
190  equ_choice = choice_it.data();
191  }
192 
// A classifier that returned no choice is scored -FLT_MAX.
193  const float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
194  const float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
195 
196  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
197  // The scores here are negative, so the max/min == fabs(min/max).
198  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
199  const float diff = fabs(lang_score - equ_score);
201 
202  // Classification.
203  if (fmax(lang_score, equ_score) < kConfScoreTh) {
204  // If both score are very small, then mark it as unclear.
205  type = BSTT_UNCLEAR;
206  } else if (diff > kConfDiffTh && equ_score > lang_score) {
207  // If equ_score is significantly higher, then we classify this character as
208  // math symbol.
209  type = BSTT_MATH;
210  } else if (lang_choice) {
211  // For other cases: lang_score is similar or significantly higher.
212  type = EstimateTypeForUnichar(
213  lang_tesseract_->unicharset, lang_choice->unichar_id());
214  }
215 
// NOTE(review): lang_choice is dereferenced below whenever type == BSTT_NONE.
// With a null lang_choice the branches above appear to always yield
// BSTT_UNCLEAR or BSTT_MATH, so the dereference looks safe -- confirm.
216  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
217  lang_choice->fontinfo_id()).is_italic()) {
218  // For text symbol, we still check if it is italic.
219  blobnbox->set_special_text_type(BSTT_ITALIC);
220  } else {
221  blobnbox->set_special_text_type(type);
222  }
223 }
float certainty() const
Definition: ratngs.h:83
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:337
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
Definition: rect.h:34
const int kBlnXHeight
Definition: normalis.h:24
const int kBlnBaselineOffset
Definition: normalis.h:25
BlobSpecialTextType
Definition: blobbox.h:97
int16_t fontinfo_id() const
Definition: ratngs.h:86
int16_t left() const
Definition: rect.h:72
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
UNICHARSET unicharset
Definition: ccutil.h:68
TBOX bounding_box() const
Definition: blobs.cpp:478
int16_t right() const
Definition: rect.h:79
Definition: blobs.h:268
int16_t bottom() const
Definition: rect.h:65
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
int16_t height() const
Definition: rect.h:108
C_BLOB * cblob() const
Definition: blobbox.h:269
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IdentifySpecialText() [2/2]

void tesseract::EquationDetect::IdentifySpecialText ( )
protected

Definition at line 259 of file equationdetect.cpp.

// Runs the per-blob IdentifySpecialText(blob, height_th) over every
// text/equation partition returned by gsearch's full search. For each
// partition, blobs are first passed through IdentifyBlobsToSkip(); a height
// threshold is then derived from the heights of the non-skipped blobs and
// every non-skipped blob is classified. Optionally writes a debug image when
// equationdetect_save_spt_image is set.
// NOTE(review): source lines 267-269, 271, 299 and 301 are absent from this
// listing. Presumably they finish saving the two lang_tesseract_ multipliers,
// zero them, declare gsearch, and start the two restoring calls whose
// arguments are visible at lines 300 and 302 -- confirm against
// equationdetect.cpp.
259  {
260  // Set configuration for Tesseract::AdaptiveClassifier.
261  equ_tesseract_.tess_cn_matching.set_value(1); // turn it on
262  equ_tesseract_.tess_bn_matching.set_value(0);
263 
264  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
265  const int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
266  const int classify_integer_matcher =
270 
272  ColPartition *part = nullptr;
273  gsearch.StartFullSearch();
274  while ((part = gsearch.NextFullSearch()) != nullptr) {
275  if (!IsTextOrEquationType(part->type())) {
276  continue;
277  }
278  IdentifyBlobsToSkip(part);
279  BLOBNBOX_C_IT bbox_it(part->boxes());
280  // Compute the height threshold.
281  GenericVector<int> blob_heights;
282  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
283  bbox_it.forward()) {
284  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
285  blob_heights.push_back(bbox_it.data()->bounding_box().height());
286  }
287  }
288  blob_heights.sort();
// height_th is the element at index size() / 2 of the sorted heights, scaled
// by 2/3 in integer arithmetic (divide by 3 first, then multiply by 2).
// NOTE(review): blob_heights is indexed without a visible emptiness check; if
// every blob of a partition is BSTT_SKIP (or the partition has no blobs) this
// reads element 0 of an empty vector -- confirm whether that can happen.
289  const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
290  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
291  bbox_it.forward()) {
292  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
293  IdentifySpecialText(bbox_it.data(), height_th);
294  }
295  }
296  }
297 
298  // Set the multiplier values back.
300  classify_class_pruner);
302  classify_integer_matcher);
303 
304  if (equationdetect_save_spt_image) { // For debug.
305  STRING outfile;
306  GetOutputTiffName("_spt", &outfile);
307  PaintSpecialTexts(outfile);
308  }
309 }
void GetOutputTiffName(const char *name, STRING *image_name) const
void IdentifyBlobsToSkip(ColPartition *part)
bool equationdetect_save_spt_image
bool IsTextOrEquationType(PolyBlockType type)
int classify_class_pruner_multiplier
Definition: classify.h:506
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
ColPartitionGrid * part_grid_
int push_back(T object)
int classify_integer_matcher_multiplier
Definition: classify.h:510
Definition: strngs.h:45
void PaintSpecialTexts(const STRING &outfile) const

◆ InsertPartAfterAbsorb()

void tesseract::EquationDetect::InsertPartAfterAbsorb ( ColPartition * part)
protected

Definition at line 513 of file equationdetect.cpp.

// Re-inserts `part` into part_grid_ after it has absorbed other partitions.
// SetPartitionType() is called to recompute the partition's attributes, then
// the flow, type and blob type saved beforehand are written back so that the
// recomputation does not change them.
// NOTE(review): source line 526 is absent from this listing; presumably it
// starts the part_grid_->GridCoords( call whose arguments are on line 527 --
// confirm against equationdetect.cpp.
513  {
514  ASSERT_HOST(part);
515 
516  // Before insert part back into part_grid_, we will need re-compute some
517  // of its attributes such as first_column_, last_column_. However, we still
518  // want to preserve its type.
519  BlobTextFlowType flow_type = part->flow();
520  PolyBlockType part_type = part->type();
521  BlobRegionType blob_type = part->blob_type();
522 
523  // Call SetPartitionType to re-compute the attributes of part.
524  const TBOX& part_box(part->bounding_box());
525  int grid_x, grid_y;
// Only grid_y is used afterwards, to pick the column set for this row.
527  part_box.left(), part_box.bottom(), &grid_x, &grid_y);
528  part->SetPartitionType(resolution_, best_columns_[grid_y]);
529 
530  // Reset the types back.
531  part->set_type(part_type);
532  part->set_blob_type(blob_type);
533  part->set_flow(flow_type);
534  part->SetBlobTypes();
535 
536  // Insert into part_grid_.
537  part_grid_->InsertBBox(true, true, part);
538 }
BlobRegionType
Definition: blobbox.h:73
Definition: rect.h:34
PolyBlockType
Definition: publictypes.h:53
BlobTextFlowType
Definition: blobbox.h:115
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
ColPartitionGrid * part_grid_
ColPartitionSet ** best_columns_
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsIndented()

EquationDetect::IndentType tesseract::EquationDetect::IsIndented ( ColPartition * part)
protected

Definition at line 1021 of file equationdetect.cpp.

// Estimates how `part` is indented relative to nearby partitions and returns
// NO_INDENT, LEFT_INDENT, RIGHT_INDENT or BOTH_INDENT.
// A radius search is run from the centre of part's bounding box. A
// text/equation neighbor that overlaps part in x, does not overlap it in y and
// lies within kYGapTh vertically marks part as left-indented when part's left
// edge is more than kXGapTh to the right of the neighbor's, and as
// right-indented when the neighbor's right edge is more than kXGapTh to the
// right of part's.
// All three thresholds scale with resolution_ (0.5x, 3.0x, 0.5x).
// NOTE(review): source line 1024 is absent from this listing; presumably it
// declares `search` over part_grid_ -- confirm against equationdetect.cpp.
1021  {
1022  ASSERT_HOST(part);
1023 
1025  ColPartition *neighbor = nullptr;
1026  const TBOX& part_box(part->bounding_box());
1027  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
1028  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
1029  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
1030 
1031  // Here we use a simple approximation algorithm: from the center of part, We
1032  // perform the radius search, and check if we can find a neighboring partition
1033  // that locates on the top/bottom left of part.
1034  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
1035  (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1036  search.SetUniqueMode(true);
1037  bool left_indented = false, right_indented = false;
// The loop stops early once both flags have been set.
1038  while ((neighbor = search.NextRadSearch()) != nullptr &&
1039  (!left_indented || !right_indented)) {
1040  if (neighbor == part) {
1041  continue;
1042  }
1043  const TBOX& neighbor_box(neighbor->bounding_box());
1044 
// This test runs before the type filter below, so a close side neighbor of
// any type triggers the early NO_INDENT return.
1045  if (part_box.major_y_overlap(neighbor_box) &&
1046  part_box.x_gap(neighbor_box) < kXGapTh) {
1047  // When this happens, it is likely part is a fragment of an
1048  // over-segmented colpartition. So we return false.
1049  return NO_INDENT;
1050  }
1051 
1052  if (!IsTextOrEquationType(neighbor->type())) {
1053  continue;
1054  }
1055 
1056  // The neighbor should be above/below part, and overlap in x direction.
1057  if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1058  continue;
1059  }
1060 
1061  if (part_box.y_gap(neighbor_box) < kYGapTh) {
1062  const int left_gap = part_box.left() - neighbor_box.left();
1063  const int right_gap = neighbor_box.right() - part_box.right();
1064  if (left_gap > kXGapTh) {
1065  left_indented = true;
1066  }
1067  if (right_gap > kXGapTh) {
1068  right_indented = true;
1069  }
1070  }
1071  }
1072 
1073  if (left_indented && right_indented) {
1074  return BOTH_INDENT;
1075  }
1076  if (left_indented) {
1077  return LEFT_INDENT;
1078  }
1079  if (right_indented) {
1080  return RIGHT_INDENT;
1081  }
1082  return NO_INDENT;
1083 }
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsInline()

bool tesseract::EquationDetect::IsInline ( const bool  search_bottom,
const int  textPartsLineSpacing,
ColPartition * part
)
protected

Definition at line 937 of file equationdetect.cpp.

// Returns true when a vertical text neighbor of `part` looks like a line of
// the same paragraph, i.e. part is probably inline with running text.
// The vertical search starts from part's bottom edge when search_bottom is
// true and from its top edge otherwise. It stops at the first neighbor whose
// vertical gap exceeds kYGapRatioTh times the smaller of the two box heights.
// A text-type neighbor qualifies when it overlaps part in x, its vertical gap
// is at most kYGapTh, and the ratio of the smaller to the larger box height
// exceeds kHeightRatioTh.
// NOTE(review): source line 943 is absent from this listing; presumably it
// declares `search` over part_grid_. The body also reads
// `textparts_linespacing`, while the documented signature names the parameter
// `textPartsLineSpacing` -- confirm against equationdetect.cpp.
939  {
940  ASSERT_HOST(part != nullptr);
941  // Look for its nearest vertical neighbor that hardly overlaps in y but
942  // largely overlaps in x.
944  ColPartition *neighbor = nullptr;
945  const TBOX& part_box(part->bounding_box());
946  const float kYGapRatioTh = 1.0;
947 
948  if (search_bottom) {
949  search.StartVerticalSearch(part_box.left(), part_box.right(),
950  part_box.bottom());
951  } else {
952  search.StartVerticalSearch(part_box.left(), part_box.right(),
953  part_box.top());
954  }
955  search.SetUniqueMode(true);
956  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
957  const TBOX& neighbor_box(neighbor->bounding_box());
958  if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
959  std::min(part_box.height(), neighbor_box.height())) {
960  // Finished searching.
961  break;
962  }
963  if (!PTIsTextType(neighbor->type())) {
964  continue;
965  }
966 
967  // Check if neighbor and part is inline similar.
968  const float kHeightRatioTh = 0.5;
// With a positive line spacing the gap limit is that spacing plus
// 0.02 * resolution_; otherwise it falls back to 0.05 * resolution_.
969  const int kYGapTh = textparts_linespacing > 0 ?
970  textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
971  static_cast<int>(roundf(0.05 * resolution_)); // Default value.
972  if (part_box.x_overlap(neighbor_box) && // Location feature.
973  part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
974  // Geo feature.
975  static_cast<float>(std::min(part_box.height(), neighbor_box.height())) /
976  std::max(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
977  return true;
978  }
979  }
980 
981  return false;
982 }
Definition: rect.h:34
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsMathBlockSatellite()

bool tesseract::EquationDetect::IsMathBlockSatellite ( ColPartition * part,
GenericVector< ColPartition *> *  math_blocks 
)
protected

Definition at line 1359 of file equationdetect.cpp.

// Decides whether `part` is a satellite of one or two neighboring math blocks.
// math_blocks is cleared first and, on a true return, holds the accepted
// neighbor(s), nearest first.
// Returns false when part is not horizontally contained in the span covered
// by the neighbors found, or when the nearer neighbor fails
// IsNearMathNeighbor(). The farther neighbor is optional: it is appended only
// if it passes the same check.
1360  {
1361  ASSERT_HOST(part != nullptr && math_blocks != nullptr);
1362  math_blocks->clear();
1363  const TBOX& part_box(part->bounding_box());
1364  // Find the top/bottom nearest neighbor of part.
1365  ColPartition *neighbors[2];
1366  int y_gaps[2] = {std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};
1367  // The horizontal boundary of the neighbors.
1368  int neighbors_left = std::numeric_limits<int>::max(), neighbors_right = 0;
// i == 0 calls SearchNNVertical with search_bottom == false, i == 1 with true.
1369  for (int i = 0; i < 2; ++i) {
1370  neighbors[i] = SearchNNVertical(i != 0, part);
1371  if (neighbors[i]) {
1372  const TBOX& neighbor_box = neighbors[i]->bounding_box();
1373  y_gaps[i] = neighbor_box.y_gap(part_box);
1374  if (neighbor_box.left() < neighbors_left) {
1375  neighbors_left = neighbor_box.left();
1376  }
1377  if (neighbor_box.right() > neighbors_right) {
1378  neighbors_right = neighbor_box.right();
1379  }
1380  }
1381  }
1382  if (neighbors[0] == neighbors[1]) {
1383  // This happens when part is inside neighbor.
1384  neighbors[1] = nullptr;
1385  y_gaps[1] = std::numeric_limits<int>::max();
1386  }
1387 
// If no neighbor was found, neighbors_left is still INT_MAX and
// neighbors_right is still 0, so the test below normally fails and the
// function returns false.
1388  // Check if part is within [neighbors_left, neighbors_right].
1389  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1390  return false;
1391  }
1392 
1393  // Get the index of the near one in neighbors.
1394  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1395 
1396  // Check the near one.
1397  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1398  math_blocks->push_back(neighbors[index]);
1399  } else {
1400  // If the near one failed the check, then we skip checking the far one.
1401  return false;
1402  }
1403 
1404  // Check the far one.
1405  index = 1 - index;
1406  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1407  math_blocks->push_back(neighbors[index]);
1408  }
1409 
1410  return true;
1411 }
int y_gap(const TBOX &box) const
Definition: rect.h:233
Definition: rect.h:34
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)
int16_t left() const
Definition: rect.h:72
int push_back(T object)
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsNearMathNeighbor()

bool tesseract::EquationDetect::IsNearMathNeighbor ( const int  y_gap,
const ColPartition * neighbor
) const
protected

Definition at line 1448 of file equationdetect.cpp.

// Returns true when `neighbor` is non-null, has type PT_EQUATION and y_gap is
// at most 0.1 * resolution_ (rounded to an int).
1449  {
1450  if (!neighbor) {
1451  return false;
1452  }
1453  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
1454  return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
1455 }

◆ IsNearSmallNeighbor()

bool tesseract::EquationDetect::IsNearSmallNeighbor ( const TBOX seed_box,
const TBOX part_box 
) const
protected

Definition at line 1271 of file equationdetect.cpp.

// Returns true when part_box is a small box close to seed_box:
//  * part_box must be neither taller nor wider than seed_box, and
//  * it must either majorly overlap seed_box in x with a vertical gap of at
//    most kYGapTh, or majorly overlap it in y with a horizontal gap of at
//    most kXGapTh.
// Both thresholds scale with resolution_ (0.25x and 0.05x).
1272  {
1273  const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
1274  const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
1275 
1276  // Check geometric feature.
1277  if (part_box.height() > seed_box.height() ||
1278  part_box.width() > seed_box.width()) {
1279  return false;
1280  }
1281 
1282  // Check overlap and distance.
// Rejected only when BOTH the vertical-neighbor test and the
// horizontal-neighbor test fail.
1283  if ((!part_box.major_x_overlap(seed_box) ||
1284  part_box.y_gap(seed_box) > kYGapTh) &&
1285  (!part_box.major_y_overlap(seed_box) ||
1286  part_box.x_gap(seed_box) > kXGapTh)) {
1287  return false;
1288  }
1289 
1290  return true;
1291 }
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
int y_gap(const TBOX &box) const
Definition: rect.h:233
int x_gap(const TBOX &box) const
Definition: rect.h:225
int16_t width() const
Definition: rect.h:115
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
int16_t height() const
Definition: rect.h:108

◆ LabelSpecialText()

int tesseract::EquationDetect::LabelSpecialText ( TO_BLOCK * to_block)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 132 of file equationdetect.cpp.

// Sets the special text type of every blob in to_block->blobs and
// to_block->large_blobs to BSTT_NONE.
// Returns -1 (after printing a warning) when to_block is null, 0 otherwise.
// NOTE(review): source line 138 is absent from this listing; presumably it
// declares blob_lists as a vector of BLOBNBOX_LIST pointers -- confirm against
// equationdetect.cpp.
132  {
133  if (to_block == nullptr) {
134  tprintf("Warning: input to_block is nullptr!\n");
135  return -1;
136  }
137 
139  blob_lists.push_back(&(to_block->blobs));
140  blob_lists.push_back(&(to_block->large_blobs));
141  for (int i = 0; i < blob_lists.size(); ++i) {
142  BLOBNBOX_IT bbox_it(blob_lists[i]);
143  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
144  bbox_it.forward()) {
145  bbox_it.data()->set_special_text_type(BSTT_NONE);
146  }
147  }
148 
149  return 0;
150 }
int size() const
Definition: genericvector.h:71
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int push_back(T object)
BLOBNBOX_LIST blobs
Definition: blobbox.h:785
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:789

◆ MergePartsByLocation()

void tesseract::EquationDetect::MergePartsByLocation ( )
protected

Definition at line 425 of file equationdetect.cpp.

// Repeatedly merges overlapping text/equation partitions until a full pass
// over the grid produces no merge.
// In each pass, every text/equation partition for which SearchByOverlap()
// returns candidates is removed from part_grid_, absorbs those candidates, and
// is remembered in parts_updated; the updated partitions are re-inserted with
// InsertPartAfterAbsorb() once the pass is over.
// NOTE(review): source line 430 is absent from this listing; presumably it
// declares gsearch over part_grid_ -- confirm against equationdetect.cpp.
425  {
426  while (true) {
427  ColPartition* part = nullptr;
428  // partitions that have been updated.
429  GenericVector<ColPartition*> parts_updated;
431  gsearch.StartFullSearch();
432  while ((part = gsearch.NextFullSearch()) != nullptr) {
433  if (!IsTextOrEquationType(part->type())) {
434  continue;
435  }
436  GenericVector<ColPartition*> parts_to_merge;
437  SearchByOverlap(part, &parts_to_merge);
438  if (parts_to_merge.empty()) {
439  continue;
440  }
441 
442  // Merge parts_to_merge with part, and remove them from part_grid_.
443  part_grid_->RemoveBBox(part);
444  for (int i = 0; i < parts_to_merge.size(); ++i) {
445  ASSERT_HOST(parts_to_merge[i] != nullptr && parts_to_merge[i] != part);
446  part->Absorb(parts_to_merge[i], nullptr);
447  }
// The grid was modified above, so the iterator is repositioned before the
// search continues.
448  gsearch.RepositionIterator();
449 
450  parts_updated.push_back(part);
451  }
452 
453  if (parts_updated.empty()) { // Exit the loop
454  break;
455  }
456 
457  // Re-insert parts_updated into part_grid_.
458  for (int i = 0; i < parts_updated.size(); ++i) {
459  InsertPartAfterAbsorb(parts_updated[i]);
460  }
461  }
462 }
int size() const
Definition: genericvector.h:71
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
bool IsTextOrEquationType(PolyBlockType type)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:535
bool empty() const
Definition: genericvector.h:90
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PaintColParts()

void tesseract::EquationDetect::PaintColParts ( const STRING outfile) const
protected

Definition at line 1482 of file equationdetect.cpp.

// Debug helper: draws the bounding box of every partition returned by
// gsearch's full search onto a 32-bit copy of lang_tesseract_->BestPix() and
// writes the result to `outfile` as an LZW-compressed TIFF.
// Colour arguments: (255, 0, 0) for PT_EQUATION, (0, 255, 0) for
// PT_INLINE_EQUATION, (0, 0, 255) for everything else; line width 5.
// NOTE(review): source line 1484 is absent from this listing; presumably it
// declares gsearch over part_grid_ -- confirm against equationdetect.cpp.
1482  {
1483  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
1485  gsearch.StartFullSearch();
1486  ColPartition* part = nullptr;
1487  while ((part = gsearch.NextFullSearch()) != nullptr) {
1488  const TBOX& tbox = part->bounding_box();
// The box's y origin is the image height minus tbox.top() -- presumably TBOX
// uses a bottom-up y axis while the image is top-down; confirm.
1489  Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
1490  tbox.width(), tbox.height());
1491  if (part->type() == PT_EQUATION) {
1492  pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1493  } else if (part->type() == PT_INLINE_EQUATION) {
1494  pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1495  } else {
1496  pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1497  }
1498  boxDestroy(&box);
1499  }
1500 
1501  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1502  pixDestroy(&pix);
1503 }
const char * string() const
Definition: strngs.cpp:196
Definition: rect.h:34
Pix * BestPix() const
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
ColPartitionGrid * part_grid_
int16_t height() const
Definition: rect.h:108

◆ PaintSpecialTexts()

void tesseract::EquationDetect::PaintSpecialTexts ( const STRING & outfile) const
protected

Definition at line 1465 of file equationdetect.cpp.

// Debug helper: renders every blob of every partition returned by gsearch's
// full search with RenderSpecialText() onto a 32-bit copy of the binary image,
// then writes the result to `outfile` as an LZW-compressed TIFF.
// Only the converted copy (pix) is destroyed here; pixBi is not.
// NOTE(review): source line 1468 is absent from this listing; presumably it
// declares gsearch over part_grid_ -- confirm against equationdetect.cpp.
1465  {
1466  Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
1467  pix = pixConvertTo32(pixBi);
1469  ColPartition* part = nullptr;
1470  gsearch.StartFullSearch();
1471  while ((part = gsearch.NextFullSearch()) != nullptr) {
1472  BLOBNBOX_C_IT blob_it(part->boxes());
1473  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1474  RenderSpecialText(pix, blob_it.data());
1475  }
1476  }
1477 
1478  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1479  pixDestroy(&pix);
1480 }
const char * string() const
Definition: strngs.cpp:196
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
Pix * pix_binary() const
ColPartitionGrid * part_grid_

◆ PrintSpecialBlobsDensity()

void tesseract::EquationDetect::PrintSpecialBlobsDensity ( const ColPartition * part) const
protected

Definition at line 1505 of file equationdetect.cpp.

// Debug helper: prints, via tprintf, the partition's top/bottom (expressed as
// image height minus the box coordinate), its bounding box, its blob count and
// its SpecialBlobsDensity() for every BlobSpecialTextType index below
// BSTT_COUNT.
1505  {
1506  ASSERT_HOST(part);
1507  TBOX box(part->bounding_box());
1508  int h = pixGetHeight(lang_tesseract_->BestPix());
1509  tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
1510  h - box.top(), h - box.bottom());
1511  box.print();
1512  tprintf("blobs count = %d, density = ", part->boxes_count());
1513  for (int i = 0; i < BSTT_COUNT; ++i) {
1514  BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i);
1515  tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
1516  }
1517  tprintf("\n");
1518 }
Definition: rect.h:34
BlobSpecialTextType
Definition: blobbox.h:97
Pix * BestPix() const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ProcessMathBlockSatelliteParts()

void tesseract::EquationDetect::ProcessMathBlockSatelliteParts ( )
protected

Definition at line 1310 of file equationdetect.cpp.

// Merges small text partitions that sit next to math blocks into equations.
// Collects every PT_FLOWING_TEXT / PT_HEADING_TEXT partition, computes the
// median height of those partitions, and for each one that is no taller than
// the median and for which IsMathBlockSatellite() succeeds: removes it from
// part_grid_, retypes it to PT_EQUATION, absorbs the returned math blocks
// (each removed from the grid first) and re-inserts it with
// InsertPartAfterAbsorb().
// NOTE(review): source line 1315 is absent from this listing; presumably it
// declares gsearch over part_grid_ -- confirm against equationdetect.cpp.
1310  {
1311  // Iterate over part_grid_, and find all parts that are text type but not
1312  // equation type.
1313  ColPartition *part = nullptr;
1314  GenericVector<ColPartition*> text_parts;
1316  gsearch.StartFullSearch();
1317  while ((part = gsearch.NextFullSearch()) != nullptr) {
1318  if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
1319  text_parts.push_back(part);
1320  }
1321  }
1322  if (text_parts.empty()) {
1323  return;
1324  }
1325 
1326  // Compute the medium height of the text_parts.
1327  text_parts.sort(&SortCPByHeight);
1328  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
1329  int med_height = text_box.height();
// For an even count the two middle heights are averaged; the inner text_box
// deliberately shadows the outer one.
1330  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
1331  const TBOX& text_box =
1332  text_parts[text_parts.size() / 2 - 1]->bounding_box();
1333  med_height = static_cast<int>(roundf(
1334  0.5 * (text_box.height() + med_height)));
1335  }
1336 
1337  // Iterate every text_parts and check if it is a math block satellite.
1338  for (int i = 0; i < text_parts.size(); ++i) {
1339  const TBOX& text_box(text_parts[i]->bounding_box());
1340  if (text_box.height() > med_height) {
1341  continue;
1342  }
1343  GenericVector<ColPartition*> math_blocks;
1344  if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
1345  continue;
1346  }
1347 
1348  // Found. merge text_parts[i] with math_blocks.
1349  part_grid_->RemoveBBox(text_parts[i]);
1350  text_parts[i]->set_type(PT_EQUATION);
1351  for (int j = 0; j < math_blocks.size(); ++j) {
1352  part_grid_->RemoveBBox(math_blocks[j]);
1353  text_parts[i]->Absorb(math_blocks[j], nullptr);
1354  }
1355  InsertPartAfterAbsorb(text_parts[i]);
1356  }
1357 }
int size() const
Definition: genericvector.h:71
Definition: rect.h:34
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:535
bool empty() const
Definition: genericvector.h:90
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_
int push_back(T object)
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition *> *math_blocks)
int16_t height() const
Definition: rect.h:108

◆ SearchByOverlap()

void tesseract::EquationDetect::SearchByOverlap ( ColPartition * seed,
GenericVector< ColPartition *> *  parts_overlap 
)
protected

Definition at line 464 of file equationdetect.cpp.

// Collects into parts_overlap the text/equation partitions around `seed` that
// should be merged with it, removing each of them from the search as it is
// collected. Returns without searching when seed is not text/equation type.
// A candidate is selected when both its x and y overlap fractions with the
// seed reach kLargeOverlapTh, or -- only for a PT_EQUATION seed -- when one
// overlap fraction exceeds its kEqu*Overlap threshold and the other is
// positive.
// NOTE(review): source lines 471 and 481 are absent from this listing;
// presumably 471 declares `search` over part_grid_ -- confirm against
// equationdetect.cpp.
466  {
467  ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
468  if (!IsTextOrEquationType(seed->type())) {
469  return;
470  }
472  const TBOX& seed_box(seed->bounding_box());
473  const int kRadNeighborCells = 30;
474  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
475  (seed_box.top() + seed_box.bottom()) / 2,
476  kRadNeighborCells);
477  search.SetUniqueMode(true);
478 
479  // Search iteratively.
480  ColPartition *part;
482  const float kLargeOverlapTh = 0.95;
483  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
484  while ((part = search.NextRadSearch()) != nullptr) {
485  if (part == seed || !IsTextOrEquationType(part->type())) {
486  continue;
487  }
488  const TBOX& part_box(part->bounding_box());
489  bool merge = false;
490 
491  const float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
492  y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
493 
494  // If part is large overlapped with seed, then set merge to true.
495  if (x_overlap_fraction >= kLargeOverlapTh &&
496  y_overlap_fraction >= kLargeOverlapTh) {
497  merge = true;
// The IsTextOrEquationType(part->type()) test below is always true here,
// because non-matching parts were already skipped at line 485.
498  } else if (seed->type() == PT_EQUATION &&
499  IsTextOrEquationType(part->type())) {
500  if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
501  (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
502  merge = true;
503  }
504  }
505 
506  if (merge) { // Remove the part from search and put it into parts.
507  search.RemoveBBox();
508  parts_overlap->push_back(part);
509  }
510  }
511 }
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
double x_overlap_fraction(const TBOX &box) const
Definition: rect.h:457
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SearchNNVertical()

ColPartition * tesseract::EquationDetect::SearchNNVertical ( const bool  search_bottom,
const ColPartition *  part 
)
protected

Definition at line 1413 of file equationdetect.cpp.

1414  {
1415  ASSERT_HOST(part);
1416  ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
1417  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
1418 
1419  ColPartitionGridSearch search(part_grid_);
1420  search.SetUniqueMode(true);
1421  const TBOX& part_box(part->bounding_box());
1422  int y = search_bottom ? part_box.bottom() : part_box.top();
1423  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
1424  int min_y_gap = std::numeric_limits<int>::max();
1425  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
1426  if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
1427  continue;
1428  }
1429  const TBOX& neighbor_box(neighbor->bounding_box());
1430  int y_gap = neighbor_box.y_gap(part_box);
1431  if (y_gap > kYGapTh) { // Out of scope.
1432  break;
1433  }
1434  if (!neighbor_box.major_x_overlap(part_box) ||
1435  (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1436  (!search_bottom && neighbor_box.top() < part_box.top())) {
1437  continue;
1438  }
1439  if (y_gap < min_y_gap) {
1440  min_y_gap = y_gap;
1441  nearest_neighbor = neighbor;
1442  }
1443  }
1444 
1445  return nearest_neighbor;
1446 }
int y_gap(const TBOX &box) const
Definition: rect.h:233
Definition: rect.h:34
bool IsTextOrEquationType(PolyBlockType type)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:366
ColPartitionGrid * part_grid_
int16_t bottom() const
Definition: rect.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SetLangTesseract()

void tesseract::EquationDetect::SetLangTesseract ( Tesseract *  lang_tesseract)

Definition at line 124 of file equationdetect.cpp.

124  {
125  lang_tesseract_ = lang_tesseract;
126 }

◆ SetResolution()

void tesseract::EquationDetect::SetResolution ( const int  resolution)

Definition at line 128 of file equationdetect.cpp.

128  {
129  resolution_ = resolution;
130 }

◆ SplitCPHor()

void tesseract::EquationDetect::SplitCPHor ( ColPartition *  part,
GenericVector< ColPartition *> *  parts_splitted 
)
protected

Definition at line 648 of file equationdetect.cpp.

649  {
650  ASSERT_HOST(part && parts_splitted);
651  if (part->median_width() == 0 || part->boxes_count() == 0) {
652  return;
653  }
654 
655  // Make a copy of part, and reset parts_splitted.
656  ColPartition* right_part = part->CopyButDontOwnBlobs();
657  parts_splitted->delete_data_pointers();
658  parts_splitted->clear();
659 
660  const double kThreshold = part->median_width() * 3.0;
661  bool found_split = true;
662  while (found_split) {
663  found_split = false;
664  BLOBNBOX_C_IT box_it(right_part->boxes());
665  // Blobs are sorted left side first. If blobs overlap,
666  // the previous blob may have a "more right" right side.
667  // Account for this by always keeping the largest "right"
668  // so far.
669  int previous_right = INT32_MIN;
670 
671  // Look for the next split in the partition.
672  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
673  const TBOX& box = box_it.data()->bounding_box();
674  if (previous_right != INT32_MIN &&
675  box.left() - previous_right > kThreshold) {
676  // We have a split position. Split the partition in two pieces.
677  // Insert the left piece in the grid and keep processing the right.
678  const int mid_x = (box.left() + previous_right) / 2;
679  ColPartition* left_part = right_part;
680  right_part = left_part->SplitAt(mid_x);
681 
682  parts_splitted->push_back(left_part);
683  left_part->ComputeSpecialBlobsDensity();
684  found_split = true;
685  break;
686  }
687 
688  // The right side of the previous blobs.
689  previous_right = std::max(previous_right, static_cast<int>(box.right()));
690  }
691  }
692 
693  // Add the last piece.
694  right_part->ComputeSpecialBlobsDensity();
695  parts_splitted->push_back(right_part);
696 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int push_back(T object)
void delete_data_pointers()
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SplitCPHorLite()

void tesseract::EquationDetect::SplitCPHorLite ( ColPartition *  part,
GenericVector< TBOX > *  splitted_boxes 
)
protected

Definition at line 698 of file equationdetect.cpp.

699  {
700  ASSERT_HOST(part && splitted_boxes);
701  splitted_boxes->clear();
702  if (part->median_width() == 0) {
703  return;
704  }
705 
706  const double kThreshold = part->median_width() * 3.0;
707 
708  // Blobs are sorted left side first. If blobs overlap,
709  // the previous blob may have a "more right" right side.
710  // Account for this by always keeping the largest "right"
711  // so far.
712  TBOX union_box;
713  int previous_right = INT32_MIN;
714  BLOBNBOX_C_IT box_it(part->boxes());
715  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
716  const TBOX& box = box_it.data()->bounding_box();
717  if (previous_right != INT32_MIN &&
718  box.left() - previous_right > kThreshold) {
719  // We have a split position.
720  splitted_boxes->push_back(union_box);
721  previous_right = INT32_MIN;
722  }
723  if (previous_right == INT32_MIN) {
724  union_box = box;
725  } else {
726  union_box += box;
727  }
728  // The right side of the previous blobs.
729  previous_right = std::max(previous_right, static_cast<int>(box.right()));
730  }
731 
732  // Add the last piece.
733  if (previous_right != INT32_MIN) {
734  splitted_boxes->push_back(union_box);
735  }
736 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int push_back(T object)
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

Member Data Documentation

◆ best_columns_

ColPartitionSet** tesseract::EquationDetect::best_columns_
protected

Definition at line 257 of file equationdetect.h.

◆ cp_seeds_

GenericVector<ColPartition*> tesseract::EquationDetect::cp_seeds_
protected

Definition at line 263 of file equationdetect.h.

◆ cps_super_bbox_

TBOX* tesseract::EquationDetect::cps_super_bbox_
protected

Definition at line 260 of file equationdetect.h.

◆ equ_tesseract_

Tesseract tesseract::EquationDetect::equ_tesseract_
protected

Definition at line 244 of file equationdetect.h.

◆ lang_tesseract_

Tesseract* tesseract::EquationDetect::lang_tesseract_
protected

Definition at line 248 of file equationdetect.h.

◆ page_count_

int tesseract::EquationDetect::page_count_
protected

Definition at line 269 of file equationdetect.h.

◆ part_grid_

ColPartitionGrid* tesseract::EquationDetect::part_grid_
protected

Definition at line 252 of file equationdetect.h.

◆ resolution_

int tesseract::EquationDetect::resolution_
protected

Definition at line 266 of file equationdetect.h.


The documentation for this class was generated from the following files: