// Makes __func__ expand to __FUNCTION__, so the AcceptableRowArgs(...,
// __func__, ...) call sites pass the enclosing function's name as the
// function_name string.
// NOTE(review): presumably intended for compilers that lack __func__; any
// surrounding #if guard is not visible in this excerpt -- confirm.
20 #define __func__ __FUNCTION__
52 static int Epsilon(
int space_pix) {
53 return space_pix * 4 / 5;
56 static bool AcceptableRowArgs(
57 int debug_level,
int min_num_rows,
const char *function_name,
59 int row_start,
int row_end) {
60 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
61 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
62 row_start, row_end, rows->
size());
65 if (row_end - row_start < min_num_rows) {
66 if (debug_level > 1) {
67 tprintf(
"# Too few rows[%d, %d) for %s.\n",
68 row_start, row_end, function_name);
78 static STRING StrOf(
int num) {
80 snprintf(buffer,
sizeof(buffer),
"%d", num);
89 for (
int r = 0; r < rows.
size(); r++) {
90 int num_columns = rows[r].
size();
91 for (
int c = 0; c < num_columns; c++) {
93 for (
int i = 0; i < rows[r][c].
size(); i++) {
94 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
96 if (c >= max_col_widths.
size()) {
99 if (num_unicodes > max_col_widths[c])
100 max_col_widths[c] = num_unicodes;
106 for (
int c = 0; c < max_col_widths.
size(); c++) {
108 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
111 for (
int r = 0; r < rows.
size(); r++) {
112 for (
int c = 0; c < rows[r].
size(); c++) {
115 tprintf(col_width_patterns[c].
string(), rows[r][c].
string());
128 static void PrintDetectorState(
const ParagraphTheory &theory,
132 output.
back().push_back(
"#row");
133 output.
back().push_back(
"space");
134 output.
back().push_back(
"..");
135 output.
back().push_back(
"lword[widthSEL]");
136 output.
back().push_back(
"rword[widthSEL]");
138 output.
back().push_back(
"text");
140 for (
int i = 0; i < rows.
size(); i++) {
143 const RowInfo& ri = *rows[i].ri_;
145 row.
push_back(StrOf(ri.average_interword_space));
146 row.
push_back(ri.has_leaders ?
".." :
" ");
148 "[" + StrOf(ri.lword_box.width()) +
149 (ri.lword_likely_starts_idea ?
"S" :
"s") +
150 (ri.lword_likely_ends_idea ?
"E" :
"e") +
151 (ri.lword_indicates_list_item ?
"L" :
"l") +
154 "[" + StrOf(ri.rword_box.width()) +
155 (ri.rword_likely_starts_idea ?
"S" :
"s") +
156 (ri.rword_likely_ends_idea ?
"E" :
"e") +
157 (ri.rword_indicates_list_item ?
"L" :
"l") +
159 rows[i].AppendDebugInfo(theory, &row);
162 PrintTable(output,
" ");
164 tprintf(
"Active Paragraph Models:\n");
165 for (
int m = 0; m < theory.models().size(); m++) {
166 tprintf(
" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
170 static void DebugDump(
173 const ParagraphTheory &theory,
178 PrintDetectorState(theory, rows);
183 int row_start,
int row_end) {
184 tprintf(
"======================================\n");
185 for (
int row = row_start; row < row_end; row++) {
186 tprintf(
"%s\n", rows[row].ri_->text.string());
188 tprintf(
"======================================\n");
194 return (ch >=
'a' && ch <=
'z') || (ch >=
'A' && ch <=
'Z');
198 return ch ==
'o' || ch ==
'O' || ch ==
'l' || ch ==
'I';
202 return strchr(
"'\"({[", ch) !=
NULL;
206 return strchr(
":'\".?!]})", ch) !=
NULL;
210 const char *
SkipChars(
const char *str,
const char *toskip) {
211 while (*str !=
'\0' && strchr(toskip, *str)) { str++; }
215 const char *
SkipChars(
const char *str,
bool (*skip)(
int)) {
216 while (*str !=
'\0' && skip(*str)) { str++; }
220 const char *
SkipOne(
const char *str,
const char *toskip) {
221 if (*str !=
'\0' && strchr(toskip, *str))
return str + 1;
// Character sets used while scanning a word for list-numeral segments.
// kRomans and kDigits are handed to SkipChars() to measure how far a numeral
// run extends. kOpen, kSep and kClose hold bracket and separator punctuation
// (NOTE(review): their use sites are not visible in this excerpt -- confirm).
const char *kRomans = "ivxlmdIVXLMD";
// Must list all ten decimal digits: SkipChars() stops at the first character
// that is not in this set, so a missing digit (such as '6') would cut the
// numeral short and the word would not be treated as a plain digit run.
const char *kDigits = "0123456789";
const char *kOpen = "[{(";
const char *kSep = ":;-.,";
const char *kClose = "]})";
235 int num_segments = 0;
236 const char *pos = word.
string();
237 while (*pos !=
'\0' && num_segments < 3) {
240 const char *numeral_end =
SkipChars(numeral_start, kRomans);
241 if (numeral_end != numeral_start) {
244 numeral_end =
SkipChars(numeral_start, kDigits);
245 if (numeral_end == numeral_start) {
248 if (numeral_end - numeral_start != 1)
256 if (pos == numeral_end)
263 const char *kListMarks =
"0Oo*.,+.";
264 return word.
size() == 1 && strchr(kListMarks, word[0]) !=
NULL;
275 if (!u || !werd || pos > werd->
length())
285 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
303 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
314 const char *kRomans =
"ivxlmdIVXLMD";
315 while (pos < wordlen_) {
317 if (ch >= 0xF0 || strchr(kRomans, ch) == 0)
break;
324 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
362 int num_segments = 0;
364 while (pos < werd->length() && num_segments < 3) {
365 int numeral_start = m.
SkipPunc(pos);
366 if (numeral_start > pos + 1)
break;
367 int numeral_end = m.
SkipRomans(numeral_start);
368 if (numeral_end == numeral_start) {
370 if (numeral_end == numeral_start) {
372 numeral_end = m.
SkipAlpha(numeral_start);
373 if (numeral_end - numeral_start != 1)
381 if (pos == numeral_end)
384 return pos == werd->
length();
396 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
398 *starts_idea =
false;
405 if (unicharset && werd) {
423 int start_letter = utf8[0];
430 if (start_letter >=
'A' && start_letter <=
'Z') {
443 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
445 *starts_idea =
false;
452 if (unicharset && werd) {
466 int last_letter = utf8[utf8.
size() - 1];
477 header->
push_back(
"[lmarg,lind;rind,rmarg]");
484 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
491 int model_numbers = 0;
492 for (
int h = 0; h < hypotheses_.size(); h++) {
493 if (hypotheses_[h].model ==
NULL)
495 if (model_numbers > 0)
498 model_string += StrOf(1 + theory.
IndexOf(hypotheses_[h].model));
499 }
else if (hypotheses_[h].model ==
kCrownLeft) {
500 model_string +=
"CrL";
502 model_string +=
"CrR";
506 if (model_numbers == 0)
521 if (hypotheses_.empty())
523 bool has_start =
false;
524 bool has_body =
false;
525 for (
int i = 0; i < hypotheses_.size(); i++) {
526 switch (hypotheses_[i].ty) {
527 case LT_START: has_start =
true;
break;
528 case LT_BODY: has_body =
true;
break;
530 tprintf(
"Encountered bad value in hypothesis list: %c\n",
535 if (has_start && has_body)
541 if (hypotheses_.empty())
543 bool has_start =
false;
544 bool has_body =
false;
545 for (
int i = 0; i < hypotheses_.size(); i++) {
546 if (hypotheses_[i].model != model)
548 switch (hypotheses_[i].ty) {
549 case LT_START: has_start =
true;
break;
550 case LT_BODY: has_body =
true;
break;
552 tprintf(
"Encountered bad value in hypothesis list: %c\n",
557 if (has_start && has_body)
565 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
575 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
586 hypotheses_.remove(old_idx);
593 hypotheses_.remove(old_idx);
597 for (
int h = 0; h < hypotheses_.size(); h++) {
604 for (
int h = 0; h < hypotheses_.size(); h++) {
611 for (
int h = 0; h < hypotheses_.size(); h++) {
612 if (hypotheses_[h].model !=
NULL)
618 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START)
620 return hypotheses_[0].model;
624 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY)
626 return hypotheses_[0].model;
634 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
635 if (!models.
contains(hypotheses_[h].model)) {
636 hypotheses_.remove(h);
654 : max_cluster_width_(max_cluster_width) {}
660 int max_cluster_width_;
667 for (
int i = 0; i < clusters.
size(); i++) {
668 if (abs(value - clusters[i].center) <
669 abs(value - clusters[best_index].center))
678 for (
int i = 0; i < values_.
size();) {
682 while (++i < values_.
size() && values_[i] <= lo + max_cluster_width_) {
692 int row_start,
int row_end,
696 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
703 for (
int i = row_start; i < row_end; i++) {
704 initial_lefts.
Add((*rows)[i].lindent_);
705 initial_rights.
Add((*rows)[i].rindent_);
723 int infrequent_enough_to_ignore = 0;
724 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
725 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
727 for (
int i = row_start; i < row_end; i++) {
728 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
729 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
730 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
731 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
732 lefts.
Add((*rows)[i].lindent_);
733 rights.
Add((*rows)[i].rindent_);
739 if ((left_tabs->
size() == 1 && right_tabs->
size() >= 4) ||
740 (right_tabs->
size() == 1 && left_tabs->
size() >= 4)) {
745 for (
int i = row_start; i < row_end; i++) {
746 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
747 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
748 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
749 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
750 lefts.
Add((*rows)[i].lindent_);
751 rights.
Add((*rows)[i].rindent_);
760 if (left_tabs->
size() == 3 && right_tabs->
size() >= 4) {
762 for (
int i = left_tabs->
size() - 1; i >= 0; i--) {
764 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
769 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
770 left_tabs->
remove(to_prune);
773 if (right_tabs->
size() == 3 && left_tabs->
size() >= 4) {
775 for (
int i = right_tabs->
size() - 1; i >= 0; i--) {
777 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
782 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783 right_tabs->
remove(to_prune);
808 int row_start,
int row_end,
812 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
814 for (
int row = row_start; row < row_end; row++) {
817 if (valid_first && !valid_body) {
818 (*rows)[row].AddStartLine(model);
819 }
else if (valid_body && !valid_first) {
820 (*rows)[row].AddBodyLine(model);
821 }
else if (valid_body && valid_first) {
822 bool after_eop = (row == row_start);
823 if (row > row_start) {
824 if (eop_threshold > 0) {
826 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
828 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
836 (*rows)[row].AddStartLine(model);
838 (*rows)[row].AddBodyLine(model);
857 int r_start,
int r_end)
858 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
863 if (debug_level >= 3) {
864 tprintf(
"Geometry: TabStop cluster tolerance = %d; "
865 "%d left tabs; %d right tabs\n",
868 ltr = (*r)[r_start].ri_->ltr;
912 (*rows)[row_a], (*rows)[row_b],
just);
915 void PrintRows()
const { PrintRowRange(*rows, row_start, row_end); }
917 void Fail(
int min_debug_level,
const char *why)
const {
918 if (debug_level < min_debug_level)
return;
990 int num_full_rows = 0;
991 int last_row_full = 0;
995 if (i == s.
row_end - 1) last_row_full++;
999 if (num_full_rows < 0.7 * num_rows) {
1000 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1013 if (debug_level > 0) {
1014 tprintf(
"# Not enough variety for clear outline classification. "
1015 "Guessing these are %s aligned based on script.\n",
1016 s.
ltr ?
"left" :
"right");
1024 if (num_rows - 1 == num_full_rows - last_row_full) {
1029 (*s.
rows)[i].AddBodyLine(model);
1079 int row_start,
int row_end,
1081 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1083 if (debug_level > 1) {
1084 tprintf(
"###############################################\n");
1085 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1086 row_start, row_end);
1087 tprintf(
"###############################################\n");
1093 s.
Fail(2,
"Too much variety for simple outline classification.");
1097 s.
Fail(1,
"Not enough variety for simple outline classification.");
1126 int firsts[2] = {0, 0};
1131 bool jam_packed =
true;
1146 int percent0firsts, percent1firsts;
1147 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1148 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1151 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1152 percent0firsts + 30 < percent1firsts) {
1155 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1156 percent1firsts + 30 < percent0firsts) {
1161 if (debug_level > 1) {
1162 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1164 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1165 s.
AlignTabs()[0].center, percent0firsts);
1166 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1167 s.
AlignTabs()[1].center, percent1firsts);
1215 for (
int i = 0; i < models_->
size(); i++) {
1216 if ((*models_)[i]->Comparable(model))
1217 return (*models_)[i];
1226 for (
int i = models_->
size() - 1; i >= 0; i--) {
1241 for (
int m = 0; m < models_->
size(); m++) {
1251 for (
int m = 0; m < models_->
size(); m++) {
1259 for (
int i = 0; i < models_->
size(); i++) {
1260 if ((*models_)[i] == model)
1269 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1273 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1274 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1280 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1284 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1291 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1312 : theory_(theory), rows_(rows), row_start_(row_start),
1314 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1320 for (
int row = row_start - 1; row <= row_end; row++) {
1321 open_models_.push_back(no_models);
1326 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1328 if (row_start < row_start_) row_start = row_start_;
1329 if (row_end > row_end_) row_end = row_end_;
1331 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1333 if ((*rows_)[row].ri_->num_words == 0) {
1334 OpenModels(row + 1) = no_models;
1337 (*rows_)[row].StartHypotheses(&opened);
1341 for (
int m = 0; m < opened.size(); m++) {
1347 still_open.push_back_new(opened[m]);
1350 OpenModels(row + 1) = still_open;
1357 CalculateOpenModels(row_start_, row_end_);
1362 for (
int i = row_start_; i < row_end_; i++) {
1371 bool left_align_open =
false;
1372 bool right_align_open =
false;
1373 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1374 switch (OpenModels(i)[m]->justification()) {
1377 default: left_align_open = right_align_open =
true;
1385 likely_start =
true;
1387 if ((left_align_open && right_align_open) ||
1388 (!left_align_open && !right_align_open)) {
1393 }
else if (left_align_open) {
1408 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1417 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1421 for (
int m = 0; m < last_line_models.
size(); m++) {
1436 for (
int m = 0; m < all_models.
size(); m++) {
1446 CalculateOpenModels(i + 1, row_end_);
1458 for (
int i = 0; i < rows.
size(); i++) {
1459 rows[i].StrongHypotheses(&used_models);
1492 for (
int end = rows->
size(); end > 0; end = start) {
1496 (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
NULL) {
1499 if (end == 0)
break;
1501 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1504 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1530 (*rows)[start].SetUnknown();
1531 (*rows)[start].AddStartLine(crown_model);
1532 for (
int row = start + 1; row < end; row++) {
1533 (*rows)[row].SetUnknown();
1534 (*rows)[row].AddBodyLine(crown_model);
1561 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1564 int lmin, lmax, rmin, rmax;
1565 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1566 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1567 for (
int i = start; i < end; i++) {
1575 STATS lefts(lmin, lmax + 1);
1576 STATS rights(rmin, rmax + 1);
1577 for (
int i = start; i < end; i++) {
1584 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1585 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1586 for (
int i = start; i < end; i++) {
1588 int ldelta = ignorable_left - sr.
lmargin_;
1591 int rdelta = ignorable_right - sr.
rmargin_;
1599 int row_start,
int row_end) {
1600 if (row_end < row_start + 1)
return 1;
1601 int word_height = (rows[row_start].ri_->lword_box.height() +
1602 rows[row_end - 1].ri_->lword_box.height()) / 2;
1603 int word_width = (rows[row_start].ri_->lword_box.width() +
1604 rows[row_end - 1].ri_->lword_box.width()) / 2;
1605 STATS spacing_widths(0, 5 + word_width);
1606 for (
int i = row_start; i < row_end; i++) {
1607 if (rows[i].ri_->num_words > 1) {
1608 spacing_widths.
add(rows[i].ri_->average_interword_space, 1);
1611 int minimum_reasonable_space = word_height / 3;
1612 if (minimum_reasonable_space < 2)
1613 minimum_reasonable_space = 2;
1614 int median = spacing_widths.
median();
1615 return (median > minimum_reasonable_space)
1616 ? median : minimum_reasonable_space;
1628 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1630 int available_space;
1651 int available_space = before.
lindent_;
1652 if (before.
rindent_ > available_space)
1694 int start,
int end,
int tolerance,
bool *consistent) {
1695 int ltr_line_count = 0;
1696 for (
int i = start; i < end; i++) {
1697 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1699 bool ltr = (ltr_line_count >= (end - start) / 2);
1702 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1707 int lmargin = (*rows)[start].lmargin_;
1708 int rmargin = (*rows)[start].rmargin_;
1709 int lmin, lmax, rmin, rmax, cmin, cmax;
1710 lmin = lmax = (*rows)[start + 1].lindent_;
1711 rmin = rmax = (*rows)[start + 1].rindent_;
1713 for (
int i = start + 1; i < end; i++) {
1714 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1715 tprintf(
"Margins don't match! Software error.\n");
1716 *consistent =
false;
1721 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1723 int ldiff = lmax - lmin;
1724 int rdiff = rmax - rmin;
1725 int cdiff = cmax - cmin;
1726 if (rdiff > tolerance && ldiff > tolerance) {
1727 if (cdiff < tolerance * 2) {
1728 if (end - start < 3)
1732 *consistent =
false;
1735 if (end - start < 3)
1740 bool body_admits_left_alignment = ldiff < tolerance;
1741 bool body_admits_right_alignment = rdiff < tolerance;
1745 (lmin + lmax) / 2, tolerance);
1748 (rmin + rmax) / 2, tolerance);
1752 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1753 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1758 if (tolerance < rdiff) {
1759 if (body_admits_left_alignment && text_admits_left_alignment)
1761 *consistent =
false;
1764 if (tolerance < ldiff) {
1765 if (body_admits_right_alignment && text_admits_right_alignment)
1767 *consistent =
false;
1775 int first_left = (*rows)[start].lindent_;
1776 int first_right = (*rows)[start].rindent_;
1778 if (ltr && body_admits_left_alignment &&
1779 (first_left < lmin || first_left > lmax))
1781 if (!ltr && body_admits_right_alignment &&
1782 (first_right < rmin || first_right > rmax))
1785 *consistent =
false;
1796 int start,
int end,
int tolerance) {
1797 bool unused_consistent;
1799 rows, start, end, tolerance, &unused_consistent);
1801 tprintf(
"Could not determine a model for this paragraph:\n");
1802 PrintRowRange(*rows, start, end);
1810 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1813 for (
int i = start + 1 ; i < end; i++) {
1831 int row_start,
int row_end) {
1833 for (
int i = row_start + 1; i < row_end; i++) {
1871 for (
int i = row_start + 1; i < row_end - 1; i++) {
1902 int row_start,
int row_end,
1903 bool allow_flush_models,
1905 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1908 int start = row_start;
1909 while (start < row_end) {
1910 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1912 if (start >= row_end - 1)
1915 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1918 bool next_consistent;
1924 if (end < row_end - 1) {
1927 next_consistent = lt ==
LT_BODY ||
1931 next_consistent =
false;
1933 if (next_consistent) {
1935 rows, start, end + 1, tolerance, &next_consistent);
1936 if (((*rows)[start].ri_->ltr &&
1939 (!(*rows)[start].ri_->ltr &&
1942 next_consistent =
false;
1944 last_model = next_model;
1946 next_consistent =
false;
1948 }
while (next_consistent && end < row_end);
1952 if (end > start + 1) {
1956 debug_level, rows, start, end,
1961 if (end == start + 2) {
1964 }
else if (start == row_start) {
1971 }
else if (allow_flush_models) {
1972 model = theory->
AddModel(new_model);
1975 model = theory->
AddModel(new_model);
1978 (*rows)[start].AddStartLine(model);
1979 for (
int i = start + 1; i < end; i++) {
1980 (*rows)[i].AddBodyLine(model);
1997 int row_start,
int row_end,
1999 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2002 if (debug_level > 1) {
2003 tprintf(
"#############################################\n");
2004 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2005 tprintf(
"#############################################\n");
2011 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2016 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2026 int row_start,
int row_end,
2028 for (
int i = row_start + 1; i < row_end - 1; i++) {
2029 if ((*rows)[i - 1].ri_->has_leaders &&
2030 (*rows)[i].ri_->has_leaders &&
2031 (*rows)[i + 1].ri_->has_leaders) {
2034 (*rows)[i].AddStartLine(model);
2046 int end = rows.
size();
2048 for (; end > 0; end = start) {
2052 bool single_line_paragraph =
false;
2054 rows[start].NonNullHypotheses(&models);
2055 if (models.
size() > 0) {
2057 if (rows[start].GetLineType(model) !=
LT_BODY)
2058 single_line_paragraph =
true;
2060 if (model && !single_line_paragraph) {
2062 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2065 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2069 if (model ==
NULL) {
2079 for (
int row = end; row < rows.
size(); row++) {
2080 if ((*row_owners)[row] &&
2084 model = (*row_owners)[row]->model;
2092 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2097 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2100 rows[start].SetUnknown();
2101 rows[start].AddStartLine(model);
2102 for (
int i = start + 1; i < end; i++) {
2103 rows[i].SetUnknown();
2104 rows[i].AddBodyLine(model);
2110 ? rows[start].ri_->rword_indicates_list_item
2111 : rows[start].ri_->lword_indicates_list_item;
2112 for (
int row = start; row < end; row++) {
2113 if ((*row_owners)[row] !=
NULL) {
2114 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2115 "more than once!\n");
2117 (*row_owners)[row] = p;
2141 rows[row].StrongHypotheses(&row_models);
2143 for (
int m = 0; m < row_models.
size(); m++) {
2144 bool all_starts = rows[row].GetLineType();
2146 bool continues =
true;
2147 for (
int i = row - 1; i >= 0 && continues; i--) {
2149 rows[i].NonNullHypotheses(&models);
2150 switch (rows[i].GetLineType(row_models[m])) {
2151 case LT_START: run_length++;
break;
2153 case LT_BODY: run_length++; all_starts =
false;
break;
2155 default: continues =
false;
2159 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2161 rows[i].NonNullHypotheses(&models);
2162 switch (rows[i].GetLineType(row_models[m])) {
2163 case LT_START: run_length++;
break;
2165 case LT_BODY: run_length++; all_starts =
false;
break;
2167 default: continues =
false;
2170 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2183 int row_start,
int row_end) {
2185 for (
int i = row_start; i < row_end; i++) {
2186 bool needs_fixing =
false;
2190 rows[i].StrongHypotheses(&models);
2191 rows[i].NonNullHypotheses(&models_w_crowns);
2192 if (models.
empty() && models_w_crowns.
size() > 0) {
2194 for (
int end = i + 1; end < rows.
size(); end++) {
2197 rows[end].NonNullHypotheses(&end_models);
2198 rows[end].StrongHypotheses(&strong_end_models);
2199 if (end_models.
size() == 0) {
2200 needs_fixing =
true;
2202 }
else if (strong_end_models.
size() > 0) {
2203 needs_fixing =
false;
2207 }
else if (models.
empty() && rows[i].ri_->num_words > 0) {
2209 needs_fixing =
true;
2212 if (!needs_fixing && !models.
empty()) {
2224 for (
int i = 0; i < to_fix->
size(); i++) {
2225 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2234 PARA_LIST *paragraphs) {
2236 paragraphs->
clear();
2237 PARA_IT out(paragraphs);
2239 for (
int i = 0; i < rows.
size(); i++) {
2240 if (rows[i] ==
NULL) {
2241 if (i == 0 || rows[i - 1] != formerly_null) {
2242 rows[i] = formerly_null =
new PARA();
2244 rows[i] = formerly_null;
2247 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2250 out.add_after_then_move(rows[i]);
2267 PARA_LIST *paragraphs,
2277 for (
int i = 0; i < row_infos->
size(); i++) {
2278 rows[i].Init((*row_infos)[i]);
2288 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2292 for (
int i = 0; i < leftovers.
size(); i++) {
2299 leftovers[i].begin, leftovers[i].end, &theory);
2307 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2308 (leftovers2.
size() == 1 &&
2309 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2310 if (pass2a_was_useful) {
2311 for (
int j = 0; j < leftovers2.
size(); j++) {
2313 leftovers2[j].begin, leftovers2[j].end,
2319 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2326 for (
int i = 0; i < leftovers.
size(); i++) {
2328 leftovers[i].begin, leftovers[i].end, &theory);
2334 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2339 for (
int i = 0; i < leftovers.
size(); i++) {
2340 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2341 rows[j].SetUnknown();
2345 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2351 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2363 PageIterator pit(static_cast<const PageIterator&>(it));
2364 bool first_word =
true;
2379 if (fake_text.
size() == 0)
return;
2382 for (
int i = 0; i < lspaces; i++) {
2385 info->
text += fake_text;
2397 if (!lword) lword = word_res;
2398 if (rword != word_res) info->
num_words++;
2401 word_res = page_res_it.
forward();
2402 }
while (page_res_it.
row() == this_row);
2443 if (!after_recognition) {
2449 int trailing_ws_idx = strlen(text);
2450 while (trailing_ws_idx > 0 &&
2452 ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2453 isspace(text[trailing_ws_idx - 1]))
2455 if (trailing_ws_idx > 0) {
2457 for (
int i = 0; i < lspaces; i++)
2459 for (
int i = 0; i < trailing_ws_idx; i++)
2460 info->
text += text[i];
2472 int num_leaders = 0;
2482 word_res = page_res_it.
forward();
2483 }
while (page_res_it.
row() == this_row);
2484 info->
ltr = ltr >= rtl;
2487 if (werds.
size() > 0) {
2488 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2492 info->
rword_box = rword->word->bounding_box();
2510 bool after_text_recognition,
2540 if (row_infos.
size() > 0) {
2541 int min_lmargin = row_infos[0].pix_ldistance;
2542 int min_rmargin = row_infos[0].pix_rdistance;
2543 for (
int i = 1; i < row_infos.
size(); i++) {
2544 if (row_infos[i].pix_ldistance < min_lmargin)
2545 min_lmargin = row_infos[i].pix_ldistance;
2546 if (row_infos[i].pix_rdistance < min_rmargin)
2547 min_rmargin = row_infos[i].pix_rdistance;
2549 if (min_lmargin > 0 || min_rmargin > 0) {
2550 for (
int i = 0; i < row_infos.
size(); i++) {
2551 row_infos[i].pix_ldistance -= min_lmargin;
2552 row_infos[i].pix_rdistance -= min_rmargin;
2560 if (!is_image_block) {
2570 for (
int i = 0; i < row_owners.
size(); i++) {
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool rword_indicates_list_item
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool IsOpeningPunct(int ch)
void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info)
bool LikelyListNumeral(const STRING &word)
const ParagraphModel * UniqueStartHypothesis() const
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
WERD_CHOICE * best_choice
SimpleClusterer(int max_cluster_width)
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void CalculateTabStops(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
GenericVector< RowScratchRegisters > * rows
void NonNullHypotheses(SetOfModels *models) const
bool lword_likely_ends_idea
void ConvertHypothesizedModelRunsToParagraphs(int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
void ModelStrongEvidence(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
bool get_isupper(UNICHAR_ID unichar_id) const
bool IsTerminalPunct(int ch)
void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info)
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
bool IsFullRow(int i) const
bool lword_likely_starts_idea
ParagraphModel ParagraphModelByOutline(int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
bool Empty(PageIteratorLevel level) const
bool has_drop_cap() const
const ParagraphModel * kCrownRight
void NonCenteredModels(SetOfModels *models)
bool FirstWordWouldHaveFit(int row_a, int row_b)
void add(inT32 value, inT32 count)
virtual bool Next(PageIteratorLevel level)
bool LikelyListMarkUnicode(int ch)
int average_interword_space
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
TBOX bounding_box() const
const GenericVector< Cluster > & AlignTabs() const
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool StrongModel(const ParagraphModel *model)
ParagraphModel Model() const
GenericVector< Cluster > left_tabs
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
GenericVectorEqEq< const ParagraphModel * > SetOfModels
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool rword_likely_ends_idea
bool AnyRtlCharsInWord() const
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
void StrongHypotheses(SetOfModels *models) const
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
void LeftoverSegments(const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
void AddStartLine(const ParagraphModel *model)
void MarkStrongEvidence(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
void Init(const RowInfo &row)
void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
const STRING & unichar_string() const
bool RowIsStranded(const GenericVector< RowScratchRegisters > &rows, int row)
BLOCK_RES * block() const
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
bool lword_indicates_list_item
bool get_isdigit(UNICHAR_ID unichar_id) const
const ParagraphModel * AddModel(const ParagraphModel &model)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool LikelyListMark(const STRING &word)
void StartHypotheses(SetOfModels *models) const
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
void AddBodyLine(const ParagraphModel *model)
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
double ile(double frac) const
void AssumeLeftJustification()
bool NearlyEqual(T x, T y, T tolerance)
int AlignsideTabIndex(int row_idx) const
const UNICHAR_ID unichar_id(int index) const
const char *const id_to_unichar(UNICHAR_ID id) const
void AssumeRightJustification()
const UNICHARSET * uch_set
void init_to_size(int size, T t)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
const char * SkipChars(const char *str, const char *toskip)
const GenericVector< Cluster > & OffsideTabs() const
tesseract::ParagraphJustification justification() const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool rword_likely_starts_idea
const ParagraphModel * UniqueBodyHypothesis() const
bool contains(T object) const
int push_back_new(T object)
void DiscardNonMatchingHypotheses(const SetOfModels &models)
const ParagraphModel * kCrownLeft
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
void GetClusters(GenericVector< Cluster > *clusters)
const PAGE_RES_IT * PageResIt() const
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
void SeparateSimpleLeaderLines(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
bool is_very_first_or_continuation
void Fail(int min_debug_level, const char *why) const
STRING RtlEmbed(const STRING &word, bool rtlify)
BOOL8 flag(WERD_FLAGS mask) const
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void DiscardUnusedModels(const SetOfModels &used_models)
LineType GetLineType() const
bool AnyLtrCharsInWord() const
const char * SkipOne(const char *str, const char *toskip)
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
virtual bool Next(PageIteratorLevel level)
bool AsciiLikelyListItem(const STRING &word)
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
const ParagraphModel * model
void GeometricClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
const char * string() const
int OffsideIndent(tesseract::ParagraphJustification just) const
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
POLY_BLOCK * poly_block() const
int ClosestCluster(const GenericVector< Cluster > &clusters, int value)
int get_index(T object) const
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
void StrongEvidenceClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
tesseract::ParagraphJustification just
bool IsLatinLetter(int ch)
Cluster(int cen, int num)
GenericVector< Cluster > right_tabs
int IndexOf(const ParagraphModel *model) const