21 #pragma warning(disable:4244) // Conversion warnings
33 #include "config_auto.h"
58 static int SortCPByTopReverse(
const void* p1,
const void* p2) {
59 const ColPartition* cp1 = *
reinterpret_cast<ColPartition* const*
>(p1);
60 const ColPartition* cp2 = *
reinterpret_cast<ColPartition* const*
>(p2);
62 const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
63 return box2.
top() - box1.top();
66 static int SortCPByBottom(
const void* p1,
const void* p2) {
67 const ColPartition* cp1 = *
reinterpret_cast<ColPartition* const*
>(p1);
68 const ColPartition* cp2 = *
reinterpret_cast<ColPartition* const*
>(p2);
70 const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
71 return box1.
bottom() - box2.bottom();
74 static int SortCPByHeight(
const void* p1,
const void* p2) {
75 const ColPartition* cp1 = *
reinterpret_cast<ColPartition* const*
>(p1);
76 const ColPartition* cp2 = *
reinterpret_cast<ColPartition* const*
>(p2);
78 const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
79 return box1.
height() - box2.height();
106 const char* equ_name) {
107 const char* default_name =
"equ";
108 if (equ_name ==
NULL) {
109 equ_name = default_name;
119 tprintf(
"Warning: equation region detection requested,"
120 " but %s failed to load from %s\n", equ_name, equ_datapath);
146 if (to_block ==
NULL) {
147 tprintf(
"Warning: input to_block is NULL!\n");
154 for (
int i = 0; i < blob_lists.
size(); ++i) {
155 BLOBNBOX_IT bbox_it(blob_lists[i]);
156 for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
158 bbox_it.data()->set_special_text_type(
BSTT_NONE);
166 BLOBNBOX *blobnbox,
const int height_th) {
174 BLOB_CHOICE_LIST ratings_equ, ratings_lang;
184 float x_orig = (box.
left() + box.
right()) / 2.0f, y_orig = box.
bottom();
198 if (ratings_lang.length() > 0) {
199 BLOB_CHOICE_IT choice_it(&ratings_lang);
200 lang_choice = choice_it.data();
202 if (ratings_equ.length() > 0) {
203 BLOB_CHOICE_IT choice_it(&ratings_equ);
204 equ_choice = choice_it.data();
207 float lang_score = lang_choice ? lang_choice->
certainty() : -FLT_MAX;
208 float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
210 const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
213 float diff = fabs(lang_score - equ_score);
217 if (
fmax(lang_score, equ_score) < kConfScoreTh) {
220 }
else if (diff > kConfDiffTh && equ_score > lang_score) {
224 }
else if (lang_choice) {
249 if (ids_to_exclude.
empty()) {
250 static const STRING kCharsToEx[] = {
"'",
"`",
"\"",
"\\",
",",
".",
251 "〈",
"〉",
"《",
"》",
"」",
"「",
""};
253 while (kCharsToEx[i] !=
"") {
257 ids_to_exclude.
sort();
264 static const STRING kDigitsChars =
"|";
280 int classify_integer_matcher =
293 BLOBNBOX_C_IT bbox_it(part->
boxes());
296 for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
298 if (bbox_it.data()->special_text_type() !=
BSTT_SKIP) {
299 blob_heights.
push_back(bbox_it.data()->bounding_box().height());
303 int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
304 for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
306 if (bbox_it.data()->special_text_type() !=
BSTT_SKIP) {
314 classify_class_pruner);
316 classify_integer_matcher);
327 BLOBNBOX_C_IT blob_it(part->
boxes());
329 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
333 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
342 BLOBNBOX_C_IT blob_it2 = blob_it;
344 while (!blob_it2.at_last()) {
345 BLOBNBOX* nextblob = blob_it2.forward();
347 if (nextblob_box.
left() >= blob_box.
right()) {
350 const float kWidthR = 0.4, kHeightR = 0.3;
352 yoverlap = blob_box.
y_overlap(nextblob_box);
353 float widthR =
static_cast<float>(
356 float heightR =
static_cast<float>(
360 if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
364 blob_box += nextblob_box;
376 tprintf(
"Warning: equ_tesseract_/lang_tesseract_ is NULL!\n");
379 if (!part_grid || !best_columns) {
380 tprintf(
"part_grid/best_columns is NULL!!\n");
414 for (
int i = 0; i <
cp_seeds_.size(); ++i) {
422 for (
int i = 0; i < seeds_expanded.
size(); ++i) {
452 if (parts_to_merge.
empty()) {
458 for (
int i = 0; i < parts_to_merge.
size(); ++i) {
467 if (parts_updated.
empty()) {
472 for (
int i = 0; i < parts_updated.
size(); ++i) {
487 const int kRadNeighborCells = 30;
489 (seed_box.top() + seed_box.bottom()) / 2,
496 const float kLargeOverlapTh = 0.95;
497 const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
506 y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
509 if (x_overlap_fraction >= kLargeOverlapTh &&
510 y_overlap_fraction >= kLargeOverlapTh) {
514 if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
515 (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
541 part_box.left(), part_box.bottom(), &grid_x, &grid_y);
570 const int kTextBlobsTh = 20;
595 indented_texts_left.
sort();
596 texts_foreground_density.
sort();
597 float foreground_density_th = 0.15;
598 if (!texts_foreground_density.
empty()) {
600 foreground_density_th = 0.8 * texts_foreground_density[
601 texts_foreground_density.
size() / 2];
604 for (
int i = 0; i < seeds1.
size(); ++i) {
605 const TBOX& box = seeds1[i]->bounding_box();
609 kLeftIndentAlignmentCountTh)) {
618 for (
int i = 0; i < seeds2.
size(); ++i) {
619 if (
CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
627 #if LIBLEPT_MINOR_VERSION < 69 && LIBLEPT_MAJOR_VERSION <= 1
632 int pix_height = pixGetHeight(pix_bi);
633 Box* box = boxCreate(tbox.
left(), pix_height - tbox.
top(),
635 Pix *pix_sub = pixClipRectangle(pix_bi, box,
NULL);
637 pixForegroundFraction(pix_sub, &fract);
638 pixDestroy(&pix_sub);
652 float parts_passed = 0.0;
653 for (
int i = 0; i < sub_boxes.
size(); ++i) {
655 if (density < density_th) {
661 const float kSeedPartRatioTh = 0.3;
662 bool retval = (parts_passed / sub_boxes.
size() >= kSeedPartRatioTh);
677 parts_splitted->
clear();
680 bool found_split =
true;
681 while (found_split) {
683 BLOBNBOX_C_IT box_it(right_part->
boxes());
691 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
692 const TBOX& box = box_it.data()->bounding_box();
694 box.
left() - previous_right > kThreshold) {
697 int mid_x = (box.
left() + previous_right) / 2;
699 right_part = left_part->
SplitAt(mid_x);
708 previous_right =
MAX(previous_right, box.
right());
720 splitted_boxes->
clear();
733 BLOBNBOX_C_IT box_it(part->
boxes());
734 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
735 const TBOX& box = box_it.data()->bounding_box();
737 box.
left() - previous_right > kThreshold) {
748 previous_right =
MAX(previous_right, box.
right());
759 const float foreground_density_th,
765 if (!indented_texts_left.
empty() &&
767 kLeftIndentAlignmentCountTh) {
781 if (sorted_vec.
empty()) {
789 while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
795 while (index < sorted_vec.
size() && sorted_vec[index++] - val < kDistTh) {
828 const int kGapTh =
static_cast<int>(
roundf(
834 for (
int i = 0; i <
cp_seeds_.size(); ++i) {
840 if (left_margin + kMarginDiffTh < right_margin &&
841 left_margin < kMarginDiffTh) {
844 part_box.right(), part_box.top(), part_box.bottom());
845 right_to_left =
false;
846 }
else if (left_margin > cps_cx) {
850 part_box.left(), part_box.top(), part_box.bottom());
851 right_to_left =
true;
857 bool side_neighbor_found =
false;
861 part_box.x_gap(neighbor_box) > kGapTh ||
862 !part_box.major_y_overlap(neighbor_box) ||
863 part_box.major_x_overlap(neighbor_box)) {
867 side_neighbor_found =
true;
870 if (!side_neighbor_found) {
875 if (neighbor_box.width() > part_box.width() &&
901 const TBOX &prev_box = prev->bounding_box();
905 int gap = current_box.
y_gap(prev_box);
915 if (ygaps.
size() < 8) {
921 int spacing = 0,
count;
923 spacing += ygaps[
count];
925 return spacing /
count;
929 const bool top_to_bottom,
const int textparts_linespacing) {
942 for (
int i = 0; i <
cp_seeds_.size(); ++i) {
948 if (
IsInline(!top_to_bottom, textparts_linespacing, part)) {
958 const int textparts_linespacing,
966 const float kYGapRatioTh = 1.0;
978 if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
979 MIN(part_box.height(), neighbor_box.height())) {
988 const float kHeightRatioTh = 0.5;
989 const int kYGapTh = textparts_linespacing > 0 ?
992 if (part_box.x_overlap(neighbor_box) &&
993 part_box.y_gap(neighbor_box) <= kYGapTh &&
995 static_cast<float>(
MIN(part_box.height(), neighbor_box.height())) /
996 MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
1008 const int kSeedMathBlobsCount = 2;
1009 const int kSeedMathDigitBlobsCount = 5;
1014 if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
1015 math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
1023 const float math_density_high,
1024 const float math_density_low,
1030 if (math_digit_density > math_density_high) {
1033 if (math_digit_density + italic_density > kMathItalicDensityTh &&
1034 math_digit_density > math_density_low) {
1055 (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1057 bool left_indented =
false, right_indented =
false;
1059 (!left_indented || !right_indented)) {
1060 if (neighbor == part) {
1065 if (part_box.major_y_overlap(neighbor_box) &&
1066 part_box.x_gap(neighbor_box) < kXGapTh) {
1077 if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1081 if (part_box.y_gap(neighbor_box) < kYGapTh) {
1082 int left_gap = part_box.left() - neighbor_box.left();
1083 int right_gap = neighbor_box.right() - part_box.right();
1084 if (left_gap > kXGapTh) {
1085 left_indented =
true;
1087 if (right_gap > kXGapTh) {
1088 right_indented =
true;
1093 if (left_indented && right_indented) {
1096 if (left_indented) {
1099 if (right_indented) {
1119 if (parts_to_merge.
empty()) {
1127 for (
int i = 0; i < parts_to_merge.
size(); ++i) {
1132 for (
int j = 0; j <
cp_seeds_.size(); ++j) {
1149 const bool search_left,
1153 const float kYOverlapTh = 0.6;
1158 int x = search_left ? seed_box.
left() : seed_box.right();
1169 if (part_box.x_gap(seed_box) > kXGapTh) {
1174 if ((part_box.left() >= seed_box.left() && search_left) ||
1175 (part_box.right() <= seed_box.right() && !search_left)) {
1192 if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1193 seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1205 const bool search_bottom,
1210 const float kXOverlapTh = 0.4;
1215 int y = search_bottom ? seed_box.
bottom() : seed_box.top();
1223 int skipped_min_top = INT_MAX, skipped_max_bottom = -1;
1230 if (part_box.y_gap(seed_box) > kYGapTh) {
1235 if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1236 (part_box.top() <= seed_box.top() && !search_bottom)) {
1240 bool skip_part =
false;
1253 if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1254 seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1260 if (skipped_min_top > part_box.top()) {
1261 skipped_min_top = part_box.
top();
1263 if (skipped_max_bottom < part_box.bottom()) {
1264 skipped_max_bottom = part_box.bottom();
1279 for (
int i = 0; i < parts.
size(); i++) {
1280 const TBOX& part_box(parts[i]->bounding_box());
1281 if ((search_bottom && part_box.
top() <= skipped_max_bottom) ||
1282 (!search_bottom && part_box.
bottom() >= skipped_min_top)) {
1292 const TBOX& part_box)
const {
1304 part_box.
y_gap(seed_box) > kYGapTh) &&
1306 part_box.
x_gap(seed_box) > kXGapTh)) {
1342 if (text_parts.
empty()) {
1347 text_parts.
sort(&SortCPByHeight);
1348 const TBOX& text_box = text_parts[text_parts.
size() / 2]->bounding_box();
1349 int med_height = text_box.
height();
1350 if (text_parts.
size() % 2 == 0 && text_parts.
size() > 1) {
1351 const TBOX& text_box =
1352 text_parts[text_parts.
size() / 2 - 1]->bounding_box();
1353 med_height =
static_cast<int>(
roundf(
1354 0.5 * (text_box.
height() + med_height)));
1358 for (
int i = 0; i < text_parts.
size(); ++i) {
1359 const TBOX& text_box(text_parts[i]->bounding_box());
1360 if (text_box.
height() > med_height) {
1371 for (
int j = 0; j < math_blocks.
size(); ++j) {
1373 text_parts[i]->Absorb(math_blocks[j],
NULL);
1382 math_blocks->
clear();
1386 int y_gaps[2] = {INT_MAX, INT_MAX};
1388 int neighbors_left = INT_MAX, neighbors_right = 0;
1389 for (
int i = 0; i < 2; ++i) {
1392 const TBOX& neighbor_box = neighbors[i]->bounding_box();
1393 y_gaps[i] = neighbor_box.
y_gap(part_box);
1394 if (neighbor_box.
left() < neighbors_left) {
1395 neighbors_left = neighbor_box.
left();
1397 if (neighbor_box.
right() > neighbors_right) {
1398 neighbors_right = neighbor_box.
right();
1402 if (neighbors[0] == neighbors[1]) {
1404 neighbors[1] =
NULL;
1405 y_gaps[1] = INT_MAX;
1409 if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1414 int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1418 math_blocks->
push_back(neighbors[index]);
1427 math_blocks->
push_back(neighbors[index]);
1442 int y = search_bottom ? part_box.
bottom() : part_box.top();
1444 int min_y_gap = INT_MAX;
1449 const TBOX& neighbor_box(neighbor->bounding_box());
1450 int y_gap = neighbor_box.
y_gap(part_box);
1451 if (y_gap > kYGapTh) {
1454 if (!neighbor_box.major_x_overlap(part_box) ||
1455 (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1456 (!search_bottom && neighbor_box.top() < part_box.top())) {
1459 if (y_gap < min_y_gap) {
1461 nearest_neighbor = neighbor;
1465 return nearest_neighbor;
1478 STRING* image_name)
const {
1481 snprintf(page,
sizeof(page),
"%04d",
page_count_);
1487 pix = pixConvertTo32(pixBi);
1492 BLOBNBOX_C_IT blob_it(part->
boxes());
1493 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1498 pixWrite(outfile.
string(), pix, IFF_TIFF_LZW);
1505 gsearch.StartFullSearch();
1507 while ((part = gsearch.NextFullSearch()) !=
NULL) {
1509 Box *box = boxCreate(tbox.
left(), pixGetHeight(pix) - tbox.
top(),
1512 pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1514 pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1516 pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1521 pixWrite(outfile.
string(), pix, IFF_TIFF_LZW);
1529 tprintf(
"Printing special blobs density values for ColParition (t=%d,b=%d) ",
1530 h - box.top(), h - box.bottom());
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)
void IdentifyInlinePartsHorizontal()
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
void ProcessMathBlockSatelliteParts()
bool IsVerticalType() const
Tesseract * lang_tesseract_
int classify_integer_matcher_multiplier
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
float SpecialBlobsDensity(const BlobSpecialTextType type) const
ColPartitionSet ** best_columns_
ColPartitionGrid * part_grid_
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)
const float kMathItalicDensityTh
bool equationdetect_save_seed_image
BBC * NextVerticalSearch(bool top_to_bottom)
double x_overlap_fraction(const TBOX &box) const
bool joined_to_prev() const
const TBOX & bounding_box() const
void InsertPartAfterAbsorb(ColPartition *part)
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition * > *math_blocks)
void PaintSpecialTexts(const STRING &outfile) const
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
void RepositionIterator()
void GetOutputTiffName(const char *name, STRING *image_name) const
#define BOOL_VAR(name, val, comment)
BlobRegionType blob_type() const
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
bool equationdetect_save_spt_image
const int kLeftIndentAlignmentCountTh
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
bool IsRightIndented(const EquationDetect::IndentType type)
bool bool_binary_search(const T &target) const
int source_resolution() const
ColPartition * SplitAt(int split_x)
void set_special_text_type(BlobSpecialTextType new_type)
BBC * NextSideSearch(bool right_to_left)
void PaintColParts(const STRING &outfile) const
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
void IdentifyBlobsToSkip(ColPartition *part)
void SetResolution(const int resolution)
bool CheckSeedBlobsCount(ColPartition *part)
IndentType IsIndented(ColPartition *part)
bool equationdetect_save_bi_image
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition * > *parts_to_merge)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
bool get_isdigit(UNICHAR_ID unichar_id) const
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
void SetPartitionType(int resolution, ColPartitionSet *columns)
inT16 fontinfo_id() const
void SetUniqueMode(bool mode)
LIST search(LIST list, void *key, int_compare is_equal)
void StartRadSearch(int x, int y, int max_radius)
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
void delete_data_pointers()
void PrintSpecialBlobsDensity(const ColPartition *part) const
const char *const id_to_unichar(UNICHAR_ID id) const
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition * > *parts_to_merge)
int y_gap(const TBOX &box) const
BlobTextFlowType flow() const
int classify_class_pruner_multiplier
float ComputeForegroundDensity(const TBOX &tbox)
const int kSeedBlobsCountTh
void ComputeCPsSuperBBox()
void SetLangTesseract(Tesseract *lang_tesseract)
int binary_search(const T &target) const
bool y_overlap(const TBOX &box) const
const int kBlnBaselineOffset
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
int LabelSpecialText(TO_BLOCK *to_block)
void SplitCPHor(ColPartition *part, GenericVector< ColPartition * > *parts_splitted)
BlobSpecialTextType special_text_type() const
bool IsTextOrEquationType(PolyBlockType type)
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
void set_blob_type(BlobRegionType t)
void IdentifySpecialText()
bool major_x_overlap(const TBOX &box) const
UnicityTable< FontInfo > & get_fontinfo_table()
const float kMathDigitDensityTh2
bool IsLeftIndented(const EquationDetect::IndentType type)
ColPartition * CopyButDontOwnBlobs()
PolyBlockType type() const
void StartSideSearch(int x, int ymin, int ymax)
const float kMathDigitDensityTh1
void Absorb(ColPartition *other, WidthCallback *cb)
void set_flow(BlobTextFlowType f)
int EstimateTextPartLineSpacing()
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
int x_gap(const TBOX &box) const
bool major_y_overlap(const TBOX &box) const
int IntCastRounded(double x)
bool get_isalpha(UNICHAR_ID unichar_id) const
bool equationdetect_save_merged_image
bool PTIsTextType(PolyBlockType type)
EquationDetect(const char *equ_datapath, const char *equ_language)
int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
int SpecialBlobsCount(const BlobSpecialTextType type)
const float kUnclearDensityTh
GenericVector< ColPartition * > cp_seeds_
void IdentifyInlineParts()
const TBOX & bounding_box() const
TBOX bounding_box() const
void RemoveBBox(BBC *bbox)
const char * string() const
BLOBNBOX_LIST large_blobs
Tesseract * equ_tesseract_
UNICHAR_ID unichar_id() const
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
void ComputeSpecialBlobsDensity()
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition * > *parts_overlap)
bool CheckSeedNeighborDensity(const ColPartition *part) const
void MergePartsByLocation()
BOOL8 contains(const char c) const
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
bool ExpandSeed(ColPartition *seed)
void set_type(PolyBlockType t)
void StartVerticalSearch(int xmin, int xmax, int y)