// Returns four fifths of space_pix, truncated toward zero by integer division.
// Callers in this file feed it a row's average interword space and use the
// result as an alignment tolerance in pixels.
static int Epsilon(int space_pix) {
  return space_pix * 4 / 5;
}
71 static bool AcceptableRowArgs(
72 int debug_level,
int min_num_rows,
const char *function_name,
74 int row_start,
int row_end) {
75 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
76 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
77 row_start, row_end, rows->
size());
80 if (row_end - row_start < min_num_rows) {
81 if (debug_level > 1) {
82 tprintf(
"# Too few rows[%d, %d) for %s.\n",
83 row_start, row_end, function_name);
93 static STRING StrOf(
int num) {
95 snprintf(buffer,
sizeof(buffer),
"%d", num);
104 for (
int r = 0; r < rows.
size(); r++) {
105 int num_columns = rows[r].
size();
106 for (
int c = 0; c < num_columns; c++) {
107 int num_unicodes = 0;
108 for (
int i = 0; i < rows[r][c].
size(); i++) {
109 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
111 if (c >= max_col_widths.
size()) {
114 if (num_unicodes > max_col_widths[c])
115 max_col_widths[c] = num_unicodes;
121 for (
int c = 0; c < max_col_widths.
size(); c++) {
123 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
126 for (
int r = 0; r < rows.
size(); r++) {
127 for (
int c = 0; c < rows[r].
size(); c++) {
130 tprintf(col_width_patterns[c].
string(), rows[r][c].
string());
136 static STRING RtlEmbed(
const STRING &word,
bool rtlify) {
143 static void PrintDetectorState(
const ParagraphTheory &theory,
147 output.
back().push_back(
"#row");
148 output.
back().push_back(
"space");
149 output.
back().push_back(
"..");
150 output.
back().push_back(
"lword[widthSEL]");
151 output.
back().push_back(
"rword[widthSEL]");
153 output.
back().push_back(
"text");
155 for (
int i = 0; i < rows.
size(); i++) {
158 const RowInfo& ri = *rows[i].ri_;
160 row.
push_back(StrOf(ri.average_interword_space));
161 row.
push_back(ri.has_leaders ?
".." :
" ");
162 row.
push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
163 "[" + StrOf(ri.lword_box.width()) +
164 (ri.lword_likely_starts_idea ?
"S" :
"s") +
165 (ri.lword_likely_ends_idea ?
"E" :
"e") +
166 (ri.lword_indicates_list_item ?
"L" :
"l") +
168 row.
push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
169 "[" + StrOf(ri.rword_box.width()) +
170 (ri.rword_likely_starts_idea ?
"S" :
"s") +
171 (ri.rword_likely_ends_idea ?
"E" :
"e") +
172 (ri.rword_indicates_list_item ?
"L" :
"l") +
174 rows[i].AppendDebugInfo(theory, &row);
175 row.
push_back(RtlEmbed(ri.text, !ri.ltr));
177 PrintTable(output,
" ");
179 tprintf(
"Active Paragraph Models:\n");
180 for (
int m = 0; m < theory.models().size(); m++) {
181 tprintf(
" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
185 static void DebugDump(
188 const ParagraphTheory &theory,
193 PrintDetectorState(theory, rows);
198 int row_start,
int row_end) {
199 tprintf(
"======================================\n");
200 for (
int row = row_start; row < row_end; row++) {
201 tprintf(
"%s\n", rows[row].ri_->text.string());
203 tprintf(
"======================================\n");
// Returns true if ch is an unaccented ASCII letter, 'a'..'z' or 'A'..'Z'.
static bool IsLatinLetter(int ch) {
  const bool is_lower = ch >= 'a' && ch <= 'z';
  const bool is_upper = ch >= 'A' && ch <= 'Z';
  return is_lower || is_upper;
}
// Returns true if ch is one of the letters 'o', 'O', 'l' or 'I'.
// Actual digit characters are NOT matched by this predicate.
static bool IsDigitLike(int ch) {
  switch (ch) {
    case 'o':
    case 'O':
    case 'l':
    case 'I':
      return true;
    default:
      return false;
  }
}
// Returns true if ch is one of the opening punctuation characters
// ' " ( { [ .
static bool IsOpeningPunct(int ch) {
  // strchr() treats the terminating NUL as part of the searched string, so a
  // zero ch must be rejected explicitly or it would count as punctuation.
  return ch != '\0' && strchr("'\"({[", ch) != nullptr;
}
// Returns true if ch is one of the terminal punctuation characters
// : ' " . ? ! ] } ) .
static bool IsTerminalPunct(int ch) {
  // strchr() treats the terminating NUL as part of the searched string, so a
  // zero ch must be rejected explicitly or it would count as punctuation.
  return ch != '\0' && strchr(":'\".?!]})", ch) != nullptr;
}
// Advances past the leading run of characters in str that all occur in the
// NUL-terminated set toskip, and returns a pointer to the first character
// that is not in the set (or to str's terminating NUL).
static const char *SkipChars(const char *str, const char *toskip) {
  // The explicit NUL test is required: strchr() would otherwise match the
  // terminator of toskip and walk past the end of str.
  while (*str != '\0' && strchr(toskip, *str) != nullptr) {
    ++str;
  }
  return str;
}
// Advances past the leading run of characters in str for which the predicate
// skip returns true, and returns a pointer to the first character rejected by
// the predicate (or to str's terminating NUL, which is never passed to skip).
static const char *SkipChars(const char *str, bool (*skip)(int)) {
  while (*str != '\0' && skip(*str)) {
    ++str;
  }
  return str;
}
// Skips at most one character: returns str + 1 if the first character of str
// occurs in the NUL-terminated set toskip, otherwise returns str unchanged.
// An empty str is always returned unchanged.
static const char *SkipOne(const char *str, const char *toskip) {
  const bool at_end = (*str == '\0');
  if (!at_end && strchr(toskip, *str) != nullptr) {
    return str + 1;
  }
  return str;
}
243 static bool LikelyListNumeral(
const STRING &word) {
244 const char *kRomans =
"ivxlmdIVXLMD";
245 const char *kDigits =
"012345789";
246 const char *kOpen =
"[{(";
247 const char *kSep =
":;-.,";
248 const char *kClose =
"]})";
250 int num_segments = 0;
251 const char *pos = word.
string();
252 while (*pos !=
'\0' && num_segments < 3) {
254 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
255 const char *numeral_end = SkipChars(numeral_start, kRomans);
256 if (numeral_end != numeral_start) {
259 numeral_end = SkipChars(numeral_start, kDigits);
260 if (numeral_end == numeral_start) {
262 numeral_end = SkipChars(numeral_start, IsLatinLetter);
263 if (numeral_end - numeral_start != 1)
270 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
271 if (pos == numeral_end)
277 static bool LikelyListMark(
const STRING &word) {
278 const char *kListMarks =
"0Oo*.,+.";
279 return word.
size() == 1 && strchr(kListMarks, word[0]) !=
nullptr;
283 return LikelyListMark(word) || LikelyListNumeral(word);
290 if (!u || !werd || pos > werd->
length())
300 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
318 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
324 IsDigitLike(
UnicodeFor(u_, word_, pos)))) pos++;
329 const char *kRomans =
"ivxlmdIVXLMD";
330 while (pos < wordlen_) {
332 if (ch >= 0xF0 || strchr(kRomans, ch) ==
nullptr)
break;
339 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
343 static bool LikelyListMarkUnicode(
int ch) {
347 return LikelyListMark(single_ch);
376 UnicodeSpanSkipper m(u, werd);
377 int num_segments = 0;
379 while (pos < werd->length() && num_segments < 3) {
380 int numeral_start = m.SkipPunc(pos);
381 if (numeral_start > pos + 1)
break;
382 int numeral_end = m.SkipRomans(numeral_start);
383 if (numeral_end == numeral_start) {
384 numeral_end = m.SkipDigits(numeral_start);
385 if (numeral_end == numeral_start) {
387 numeral_end = m.SkipAlpha(numeral_start);
388 if (numeral_end - numeral_start != 1)
395 pos = m.SkipPunc(numeral_end);
396 if (pos == numeral_end)
399 return pos == werd->
length();
411 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
413 *starts_idea =
false;
415 if (utf8.
size() == 0 || (werd !=
nullptr && werd->
length() == 0)) {
420 if (unicharset && werd) {
421 if (UniLikelyListItem(unicharset, werd)) {
438 int start_letter = utf8[0];
439 if (IsOpeningPunct(start_letter)) {
442 if (IsTerminalPunct(start_letter)) {
445 if (start_letter >=
'A' && start_letter <=
'Z') {
458 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
460 *starts_idea =
false;
462 if (utf8.
size() == 0 || (werd !=
nullptr && werd->
length() == 0)) {
467 if (unicharset && werd) {
468 if (UniLikelyListItem(unicharset, werd)) {
481 int last_letter = utf8[utf8.
size() - 1];
482 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
492 header->
push_back(
"[lmarg,lind;rind,rmarg]");
499 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
506 int model_numbers = 0;
507 for (
int h = 0; h < hypotheses_.size(); h++) {
508 if (hypotheses_[h].model ==
nullptr)
510 if (model_numbers > 0)
513 model_string += StrOf(1 + theory.
IndexOf(hypotheses_[h].model));
514 }
else if (hypotheses_[h].model ==
kCrownLeft) {
515 model_string +=
"CrL";
517 model_string +=
"CrR";
521 if (model_numbers == 0)
536 if (hypotheses_.empty())
538 bool has_start =
false;
539 bool has_body =
false;
540 for (
int i = 0; i < hypotheses_.size(); i++) {
541 switch (hypotheses_[i].ty) {
542 case LT_START: has_start =
true;
break;
543 case LT_BODY: has_body =
true;
break;
545 tprintf(
"Encountered bad value in hypothesis list: %c\n",
550 if (has_start && has_body)
556 if (hypotheses_.empty())
558 bool has_start =
false;
559 bool has_body =
false;
560 for (
int i = 0; i < hypotheses_.size(); i++) {
561 if (hypotheses_[i].model != model)
563 switch (hypotheses_[i].ty) {
564 case LT_START: has_start =
true;
break;
565 case LT_BODY: has_body =
true;
break;
567 tprintf(
"Encountered bad value in hypothesis list: %c\n",
572 if (has_start && has_body)
580 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
590 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
601 hypotheses_.remove(old_idx);
608 hypotheses_.remove(old_idx);
612 for (
int h = 0; h < hypotheses_.size(); h++) {
619 for (
int h = 0; h < hypotheses_.size(); h++) {
626 for (
int h = 0; h < hypotheses_.size(); h++) {
627 if (hypotheses_[h].model !=
nullptr)
633 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START)
635 return hypotheses_[0].model;
639 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY)
641 return hypotheses_[0].model;
649 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
650 if (!models.
contains(hypotheses_[h].model)) {
651 hypotheses_.remove(h);
669 : max_cluster_width_(max_cluster_width) {}
675 int max_cluster_width_;
682 for (
int i = 0; i < clusters.
size(); i++) {
683 if (abs(value - clusters[i].center) <
684 abs(value - clusters[best_index].center))
693 for (
int i = 0; i < values_.
size();) {
697 while (++i < values_.
size() && values_[i] <= lo + max_cluster_width_) {
707 int row_start,
int row_end,
int tolerance,
710 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
717 for (
int i = row_start; i < row_end; i++) {
718 initial_lefts.Add((*rows)[i].lindent_);
719 initial_rights.Add((*rows)[i].rindent_);
721 initial_lefts.GetClusters(&initial_left_tabs);
722 initial_rights.GetClusters(&initial_right_tabs);
730 SimpleClusterer lefts(tolerance);
731 SimpleClusterer rights(tolerance);
737 int infrequent_enough_to_ignore = 0;
738 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
739 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
741 for (
int i = row_start; i < row_end; i++) {
742 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
743 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
744 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
745 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
746 lefts.Add((*rows)[i].lindent_);
747 rights.Add((*rows)[i].rindent_);
750 lefts.GetClusters(left_tabs);
751 rights.GetClusters(right_tabs);
753 if ((left_tabs->
size() == 1 && right_tabs->
size() >= 4) ||
754 (right_tabs->
size() == 1 && left_tabs->
size() >= 4)) {
759 for (
int i = row_start; i < row_end; i++) {
760 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
761 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
762 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
763 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
764 lefts.Add((*rows)[i].lindent_);
765 rights.Add((*rows)[i].rindent_);
769 lefts.GetClusters(left_tabs);
770 rights.GetClusters(right_tabs);
774 if (left_tabs->
size() == 3 && right_tabs->
size() >= 4) {
776 for (
int i = left_tabs->
size() - 1; i >= 0; i--) {
778 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
783 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
784 left_tabs->
remove(to_prune);
787 if (right_tabs->
size() == 3 && left_tabs->
size() >= 4) {
789 for (
int i = right_tabs->
size() - 1; i >= 0; i--) {
791 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
796 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
797 right_tabs->
remove(to_prune);
822 int row_start,
int row_end,
824 bool ltr,
int eop_threshold) {
825 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
827 for (
int row = row_start; row < row_end; row++) {
830 if (valid_first && !valid_body) {
831 (*rows)[row].AddStartLine(model);
832 }
else if (valid_body && !valid_first) {
833 (*rows)[row].AddBodyLine(model);
834 }
else if (valid_body && valid_first) {
835 bool after_eop = (row == row_start);
836 if (row > row_start) {
837 if (eop_threshold > 0) {
839 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
841 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
849 (*rows)[row].AddStartLine(model);
851 (*rows)[row].AddBodyLine(model);
870 int r_start,
int r_end)
874 CalculateTabStops(r, r_start, r_end,
tolerance,
877 tprintf(
"Geometry: TabStop cluster tolerance = %d; " 878 "%d left tabs; %d right tabs\n",
881 ltr = (*r)[r_start].ri_->ltr;
913 return ClosestCluster(
left_tabs, (*
rows)[i].lindent_) == 0 &&
930 void Fail(
int min_debug_level,
const char *why)
const {
998 static void GeometricClassifyThreeTabStopTextBlock(
1003 int num_full_rows = 0;
1004 int last_row_full = 0;
1008 if (i == s.
row_end - 1) last_row_full++;
1012 if (num_full_rows < 0.7 * num_rows) {
1013 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1026 if (debug_level > 0) {
1027 tprintf(
"# Not enough variety for clear outline classification. " 1028 "Guessing these are %s aligned based on script.\n",
1029 s.
ltr ?
"left" :
"right");
1037 if (num_rows - 1 == num_full_rows - last_row_full) {
1042 (*s.
rows)[i].AddBodyLine(model);
1090 static void GeometricClassify(
int debug_level,
1092 int row_start,
int row_end,
1093 ParagraphTheory *theory) {
1094 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1096 if (debug_level > 1) {
1097 tprintf(
"###############################################\n");
1098 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1099 row_start, row_end);
1100 tprintf(
"###############################################\n");
1104 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1106 s.
Fail(2,
"Too much variety for simple outline classification.");
1110 s.
Fail(1,
"Not enough variety for simple outline classification.");
1114 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1139 int firsts[2] = {0, 0};
1144 bool jam_packed =
true;
1159 int percent0firsts, percent1firsts;
1160 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1161 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1164 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1165 percent0firsts + 30 < percent1firsts) {
1168 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1169 percent1firsts + 30 < percent0firsts) {
1174 if (debug_level > 1) {
1175 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1177 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1178 s.
AlignTabs()[0].center, percent0firsts);
1179 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1180 s.
AlignTabs()[1].center, percent1firsts);
1222 MarkRowsWithModel(rows, row_start, row_end, model, s.
ltr, s.
eop_threshold);
1228 for (
int i = 0; i < models_->
size(); i++) {
1229 if ((*models_)[i]->Comparable(model))
1230 return (*models_)[i];
1239 for (
int i = models_->
size() - 1; i >= 0; i--) {
1254 for (
int m = 0; m < models_->
size(); m++) {
1264 for (
int m = 0; m < models_->
size(); m++) {
1272 for (
int i = 0; i < models_->
size(); i++) {
1273 if ((*models_)[i] == model)
1282 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1286 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1287 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1293 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1297 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1298 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1304 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1325 : theory_(theory), rows_(rows), row_start_(row_start),
1327 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1333 for (
int row = row_start - 1; row <= row_end; row++) {
1334 open_models_.push_back(no_models);
1339 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1341 if (row_start < row_start_) row_start = row_start_;
1342 if (row_end > row_end_) row_end = row_end_;
1344 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1346 if ((*rows_)[row].ri_->num_words == 0) {
1347 OpenModels(row + 1) = no_models;
1350 (*rows_)[row].StartHypotheses(&opened);
1354 for (
int m = 0; m < opened.size(); m++) {
1360 still_open.push_back_new(opened[m]);
1363 OpenModels(row + 1) = still_open;
1370 CalculateOpenModels(row_start_, row_end_);
1375 for (
int i = row_start_; i < row_end_; i++) {
1384 bool left_align_open =
false;
1385 bool right_align_open =
false;
1386 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1387 switch (OpenModels(i)[m]->justification()) {
1390 default: left_align_open = right_align_open =
true;
1398 likely_start =
true;
1400 if ((left_align_open && right_align_open) ||
1401 (!left_align_open && !right_align_open)) {
1402 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1404 LikelyParagraphStart((*rows_)[i - 1], row,
1406 }
else if (left_align_open) {
1407 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1410 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1421 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1430 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1434 for (
int m = 0; m < last_line_models.
size(); m++) {
1449 for (
int m = 0; m < all_models.
size(); m++) {
1459 CalculateOpenModels(i + 1, row_end_);
1471 for (
int i = 0; i < rows.
size(); i++) {
1472 rows[i].StrongHypotheses(&used_models);
1501 static void DowngradeWeakestToCrowns(
int debug_level, ParagraphTheory *theory,
1504 for (
int end = rows->
size(); end > 0; end = start) {
1508 (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
nullptr) {
1511 if (end == 0)
break;
1513 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1516 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1542 (*rows)[start].SetUnknown();
1543 (*rows)[start].AddStartLine(crown_model);
1544 for (
int row = start + 1; row < end; row++) {
1545 (*rows)[row].SetUnknown();
1546 (*rows)[row].AddBodyLine(crown_model);
1550 DiscardUnusedModels(*rows, theory);
1573 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1576 int lmin, lmax, rmin, rmax;
1577 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1578 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1579 for (
int i = start; i < end; i++) {
1587 STATS lefts(lmin, lmax + 1);
1588 STATS rights(rmin, rmax + 1);
1589 for (
int i = start; i < end; i++) {
1596 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1597 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1598 for (
int i = start; i < end; i++) {
1600 int ldelta = ignorable_left - sr.
lmargin_;
1603 int rdelta = ignorable_right - sr.
rmargin_;
1611 int row_start,
int row_end) {
1612 if (row_end < row_start + 1)
return 1;
1613 int word_height = (rows[row_start].ri_->lword_box.height() +
1614 rows[row_end - 1].ri_->lword_box.height()) / 2;
1615 int word_width = (rows[row_start].ri_->lword_box.width() +
1616 rows[row_end - 1].ri_->lword_box.width()) / 2;
1617 STATS spacing_widths(0, 5 + word_width);
1618 for (
int i = row_start; i < row_end; i++) {
1619 if (rows[i].ri_->num_words > 1) {
1620 spacing_widths.
add(rows[i].ri_->average_interword_space, 1);
1623 int minimum_reasonable_space = word_height / 3;
1624 if (minimum_reasonable_space < 2)
1625 minimum_reasonable_space = 2;
1626 int median = spacing_widths.
median();
1627 return (median > minimum_reasonable_space)
1628 ? median : minimum_reasonable_space;
1640 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1642 int available_space;
1663 int available_space = before.
lindent_;
1664 if (before.
rindent_ > available_space)
1673 static bool TextSupportsBreak(
const RowScratchRegisters &before,
1674 const RowScratchRegisters &after) {
1675 if (before.ri_->ltr) {
1676 return before.ri_->rword_likely_ends_idea &&
1677 after.ri_->lword_likely_starts_idea;
1679 return before.ri_->lword_likely_ends_idea &&
1680 after.ri_->rword_likely_starts_idea;
1684 static bool LikelyParagraphStart(
const RowScratchRegisters &before,
1685 const RowScratchRegisters &after,
1687 return before.ri_->num_words == 0 ||
1689 TextSupportsBreak(before, after));
1699 int start,
int end,
int tolerance,
bool *consistent) {
1700 int ltr_line_count = 0;
1701 for (
int i = start; i < end; i++) {
1702 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1704 bool ltr = (ltr_line_count >= (end - start) / 2);
1707 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1712 int lmargin = (*rows)[start].lmargin_;
1713 int rmargin = (*rows)[start].rmargin_;
1714 int lmin, lmax, rmin, rmax, cmin, cmax;
1715 lmin = lmax = (*rows)[start + 1].lindent_;
1716 rmin = rmax = (*rows)[start + 1].rindent_;
1718 for (
int i = start + 1; i < end; i++) {
1719 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1720 tprintf(
"Margins don't match! Software error.\n");
1721 *consistent =
false;
1726 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1728 int ldiff = lmax - lmin;
1729 int rdiff = rmax - rmin;
1730 int cdiff = cmax - cmin;
1731 if (rdiff > tolerance && ldiff > tolerance) {
1732 if (cdiff < tolerance * 2) {
1733 if (end - start < 3)
1737 *consistent =
false;
1740 if (end - start < 3)
1745 bool body_admits_left_alignment = ldiff < tolerance;
1746 bool body_admits_right_alignment = rdiff < tolerance;
1750 (lmin + lmax) / 2, tolerance);
1753 (rmin + rmax) / 2, tolerance);
1757 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1758 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1763 if (tolerance < rdiff) {
1764 if (body_admits_left_alignment && text_admits_left_alignment)
1766 *consistent =
false;
1769 if (tolerance < ldiff) {
1770 if (body_admits_right_alignment && text_admits_right_alignment)
1772 *consistent =
false;
1780 int first_left = (*rows)[start].lindent_;
1781 int first_right = (*rows)[start].rindent_;
1783 if (ltr && body_admits_left_alignment &&
1784 (first_left < lmin || first_left > lmax))
1786 if (!ltr && body_admits_right_alignment &&
1787 (first_right < rmin || first_right > rmax))
1790 *consistent =
false;
1801 int start,
int end,
int tolerance) {
1802 bool unused_consistent;
1804 rows, start, end, tolerance, &unused_consistent);
1806 tprintf(
"Could not determine a model for this paragraph:\n");
1807 PrintRowRange(*rows, start, end);
1815 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1818 for (
int i = start + 1 ; i < end; i++) {
1836 int row_start,
int row_end) {
1838 for (
int i = row_start + 1; i < row_end; i++) {
1839 const RowScratchRegisters &prev = (*rows)[i - 1];
1840 RowScratchRegisters &curr = (*rows)[i];
1843 if (!curr.ri_->rword_likely_starts_idea &&
1844 !curr.ri_->lword_likely_starts_idea &&
1864 RowScratchRegisters &curr = (*rows)[row_start];
1865 RowScratchRegisters &next = (*rows)[row_start + 1];
1870 (curr.ri_->lword_likely_starts_idea ||
1871 curr.ri_->rword_likely_starts_idea)) {
1872 curr.SetStartLine();
1876 for (
int i = row_start + 1; i < row_end - 1; i++) {
1877 RowScratchRegisters &prev = (*rows)[i - 1];
1878 RowScratchRegisters &curr = (*rows)[i];
1879 RowScratchRegisters &next = (*rows)[i + 1];
1884 LikelyParagraphStart(prev, curr, j)) {
1885 curr.SetStartLine();
1890 RowScratchRegisters &prev = (*rows)[row_end - 2];
1891 RowScratchRegisters &curr = (*rows)[row_end - 1];
1896 LikelyParagraphStart(prev, curr, j)) {
1897 curr.SetStartLine();
1905 static void ModelStrongEvidence(
int debug_level,
1907 int row_start,
int row_end,
1908 bool allow_flush_models,
1909 ParagraphTheory *theory) {
1910 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1913 int start = row_start;
1914 while (start < row_end) {
1915 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1917 if (start >= row_end - 1)
1920 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1923 bool next_consistent;
1929 if (end < row_end - 1) {
1930 RowScratchRegisters &next = (*rows)[end];
1932 next_consistent = lt ==
LT_BODY ||
1936 next_consistent =
false;
1938 if (next_consistent) {
1940 rows, start, end + 1, tolerance, &next_consistent);
1941 if (((*rows)[start].ri_->ltr &&
1944 (!(*rows)[start].ri_->ltr &&
1947 next_consistent =
false;
1949 last_model = next_model;
1951 next_consistent =
false;
1953 }
while (next_consistent && end < row_end);
1957 if (end > start + 1) {
1961 debug_level, rows, start, end,
1966 if (end == start + 2) {
1969 }
else if (start == row_start) {
1976 }
else if (allow_flush_models) {
1977 model = theory->AddModel(new_model);
1980 model = theory->AddModel(new_model);
1983 (*rows)[start].AddStartLine(model);
1984 for (
int i = start + 1; i < end; i++) {
1985 (*rows)[i].AddBodyLine(model);
2000 static void StrongEvidenceClassify(
int debug_level,
2002 int row_start,
int row_end,
2003 ParagraphTheory *theory) {
2004 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2007 if (debug_level > 1) {
2008 tprintf(
"#############################################\n");
2009 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2010 tprintf(
"#############################################\n");
2014 MarkStrongEvidence(rows, row_start, row_end);
2016 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2019 ModelStrongEvidence(debug_level, rows, row_start, row_end,
false, theory);
2021 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2026 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2031 int row_start,
int row_end,
2032 ParagraphTheory *theory) {
2033 for (
int i = row_start + 1; i < row_end - 1; i++) {
2034 if ((*rows)[i - 1].ri_->has_leaders &&
2035 (*rows)[i].ri_->has_leaders &&
2036 (*rows)[i + 1].ri_->has_leaders) {
2039 (*rows)[i].AddStartLine(model);
2046 static void ConvertHypothesizedModelRunsToParagraphs(
2050 ParagraphTheory *theory) {
2051 int end = rows.
size();
2053 for (; end > 0; end = start) {
2057 bool single_line_paragraph =
false;
2059 rows[start].NonNullHypotheses(&models);
2060 if (!models.empty()) {
2062 if (rows[start].GetLineType(model) !=
LT_BODY)
2063 single_line_paragraph =
true;
2065 if (model && !single_line_paragraph) {
2067 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2070 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2074 if (model ==
nullptr) {
2084 for (
int row = end; row < rows.
size(); row++) {
2085 if ((*row_owners)[row] &&
2089 model = (*row_owners)[row]->model;
2097 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2102 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2105 rows[start].SetUnknown();
2106 rows[start].AddStartLine(model);
2107 for (
int i = start + 1; i < end; i++) {
2108 rows[i].SetUnknown();
2109 rows[i].AddBodyLine(model);
2115 ? rows[start].ri_->rword_indicates_list_item
2116 : rows[start].ri_->lword_indicates_list_item;
2117 for (
int row = start; row < end; row++) {
2118 if ((*row_owners)[row] !=
nullptr) {
2119 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " 2120 "more than once!\n");
2121 delete (*row_owners)[row];
2123 (*row_owners)[row] = p;
2148 rows[row].StrongHypotheses(&row_models);
2150 for (
int m = 0; m < row_models.
size(); m++) {
2151 bool all_starts = rows[row].GetLineType();
2153 bool continues =
true;
2154 for (
int i = row - 1; i >= 0 && continues; i--) {
2156 rows[i].NonNullHypotheses(&models);
2157 switch (rows[i].GetLineType(row_models[m])) {
2158 case LT_START: run_length++;
break;
2160 case LT_BODY: run_length++; all_starts =
false;
break;
2162 default: continues =
false;
2166 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2168 rows[i].NonNullHypotheses(&models);
2169 switch (rows[i].GetLineType(row_models[m])) {
2170 case LT_START: run_length++;
break;
2172 case LT_BODY: run_length++; all_starts =
false;
break;
2174 default: continues =
false;
2177 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2190 int row_start,
int row_end) {
2192 for (
int i = row_start; i < row_end; i++) {
2193 bool needs_fixing =
false;
2197 rows[i].StrongHypotheses(&models);
2198 rows[i].NonNullHypotheses(&models_w_crowns);
2199 if (models.empty() && !models_w_crowns.empty()) {
2201 for (
int end = i + 1; end < rows.
size(); end++) {
2204 rows[end].NonNullHypotheses(&end_models);
2205 rows[end].StrongHypotheses(&strong_end_models);
2206 if (end_models.empty()) {
2207 needs_fixing =
true;
2209 }
else if (!strong_end_models.empty()) {
2210 needs_fixing =
false;
2214 }
else if (models.empty() && rows[i].ri_->num_words > 0) {
2216 needs_fixing =
true;
2219 if (!needs_fixing && !models.empty()) {
2220 needs_fixing = RowIsStranded(rows, i);
2231 for (
int i = 0; i < to_fix->
size(); i++) {
2232 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2241 PARA_LIST *paragraphs) {
2243 paragraphs->
clear();
2244 PARA_IT out(paragraphs);
2245 PARA *formerly_null =
nullptr;
2246 for (
int i = 0; i < rows.
size(); i++) {
2247 if (rows[i] ==
nullptr) {
2248 if (i == 0 || rows[i - 1] != formerly_null) {
2249 rows[i] = formerly_null =
new PARA();
2251 rows[i] = formerly_null;
2254 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2257 out.add_after_then_move(rows[i]);
2274 PARA_LIST *paragraphs,
2284 for (
int i = 0; i < row_infos->
size(); i++) {
2285 rows[i].Init((*row_infos)[i]);
2293 SeparateSimpleLeaderLines(&rows, 0, rows.
size(), &theory);
2295 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2298 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2299 for (
int i = 0; i < leftovers.
size(); i++) {
2305 StrongEvidenceClassify(debug_level, &rows,
2306 leftovers[i].begin, leftovers[i].end, &theory);
2313 LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2314 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2315 (leftovers2.
size() == 1 &&
2316 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2317 if (pass2a_was_useful) {
2318 for (
int j = 0; j < leftovers2.
size(); j++) {
2319 StrongEvidenceClassify(debug_level, &rows,
2320 leftovers2[j].begin, leftovers2[j].end,
2326 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2332 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2333 for (
int i = 0; i < leftovers.
size(); i++) {
2334 GeometricClassify(debug_level, &rows,
2335 leftovers[i].begin, leftovers[i].end, &theory);
2339 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2341 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2345 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2346 for (
int i = 0; i < leftovers.
size(); i++) {
2347 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2348 rows[j].SetUnknown();
2352 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2355 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2358 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2366 static void InitializeTextAndBoxesPreRecognition(
const MutableIterator &it,
2370 PageIterator pit(static_cast<const PageIterator&>(it));
2371 bool first_word =
true;
2375 if (first_word) info->lword_text +=
"x";
2376 info->rword_text +=
"x";
2380 info->rword_text =
"";
2386 if (fake_text.
size() == 0)
return;
2388 int lspaces = info->pix_ldistance / info->average_interword_space;
2389 for (
int i = 0; i < lspaces; i++) {
2392 info->text += fake_text;
2401 info->num_words = 0;
2404 if (!lword) lword = word_res;
2405 if (rword != word_res) info->num_words++;
2408 word_res = page_res_it.
forward();
2409 }
while (page_res_it.
row() == this_row);
// Populates *info for the text line that `it` currently points at.
//
// Parameters:
//   after_recognition - when false, the text fields are filled in by
//                       InitializeTextAndBoxesPreRecognition instead of from
//                       the UTF-8 text of the line.
//   it                - iterator positioned on the row to describe.
//   info              - output record; its geometry fields, word flags, text
//                       and word statistics are written here.
//
// NOTE(review): this listing omits some of the original lines (the number at
// the start of each line is the line number of the original file, and those
// numbers skip). Comments below cover only what is visible; anything that
// depends on a missing line is marked as an assumption.
2418 static void InitializeRowInfo(
bool after_recognition,
2419 const MutableIterator &it, RowInfo *info) {
// When the iterator has a row, take the geometry from the underlying ROW.
2420 if (it.PageResIt()->row() !=
nullptr) {
2421 ROW *row = it.PageResIt()->row()->row;
2422 info->pix_ldistance = row->
lmargin();
2423 info->pix_rdistance = row->
rmargin();
// The right-hand side of this assignment is not visible in this excerpt.
2424 info->average_interword_space =
2426 info->pix_xheight = row->
x_height();
2427 info->has_leaders =
false;
// Fallback values. Presumably the else branch for a missing row; the branch
// header is not visible. average_interword_space is set to 1 rather than 0,
// and it is later used as a divisor.
2431 info->pix_ldistance = info->pix_rdistance = 0;
2432 info->average_interword_space = 1;
2433 info->pix_xheight = 1.0;
2434 info->has_leaders =
false;
2435 info->has_drop_cap =
false;
// Clear the word count and every left-word / right-word flag before the
// text is examined.
2439 info->num_words = 0;
2440 info->lword_indicates_list_item =
false;
2441 info->lword_likely_starts_idea =
false;
2442 info->lword_likely_ends_idea =
false;
2443 info->rword_indicates_list_item =
false;
2444 info->rword_likely_starts_idea =
false;
2445 info->rword_likely_ends_idea =
false;
2446 info->has_leaders =
false;
// Without recognition results, delegate to the pre-recognition helper.
// Presumably followed by a return; that line is not visible.
2449 if (!after_recognition) {
2450 InitializeTextAndBoxesPreRecognition(it, info);
// Fetch the UTF-8 text of the whole text line. The unique_ptr releases the
// returned array when it goes out of scope.
2454 const std::unique_ptr<const char[]> text(it.GetUTF8Text(
RIL_TEXTLINE));
// Start at the end of the string and test the preceding character for
// ASCII whitespace. The loop body is not visible; presumably it decrements
// trailing_ws_idx so that trailing whitespace is excluded.
2455 int trailing_ws_idx = strlen(text.get());
2456 while (trailing_ws_idx > 0 &&
2458 isascii(text[trailing_ws_idx - 1]) &&
2459 isspace(text[trailing_ws_idx - 1]))
// Only build info->text when some characters remain.
2461 if (trailing_ws_idx > 0) {
// Left distance expressed in units of the average inter-word space, stored
// as an int. The body of the first loop is not visible; presumably it
// prepends that many spaces.
2462 int lspaces = info->pix_ldistance / info->average_interword_space;
2463 for (
int i = 0; i < lspaces; i++)
// Copy the first trailing_ws_idx characters of the line into info->text.
2465 for (
int i = 0; i < trailing_ws_idx; i++)
2466 info->text += text[i];
// Handling for an empty row text. The body is not visible in this excerpt.
2469 if (info->text.size() == 0) {
2477 int num_leaders = 0;
// Advance word by word while the iterator still reports the same row.
2487 word_res = page_res_it.
forward();
2488 }
while (page_res_it.
row() == this_row);
// The row counts as left-to-right when ltr is at least rtl, so a tie
// resolves to left-to-right.
2489 info->ltr = ltr >= rtl;
// The row is flagged as having leaders only when num_leaders exceeds 3.
2490 info->has_leaders = num_leaders > 3;
2491 info->num_words = werds.
size();
// With at least one word collected, the first and last entries of werds
// serve as the left and right words of the row.
2492 if (!werds.
empty()) {
2493 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2495 info->rword_text = rword->best_choice->unichar_string().string();
2497 info->rword_box = rword->word->bounding_box();
// Output flags for the left word. The call receiving them is not visible;
// presumably LeftWordAttributes - confirm.
2500 &info->lword_indicates_list_item,
2501 &info->lword_likely_starts_idea,
2502 &info->lword_likely_ends_idea);
// Output flags for the right word. The call receiving them is not visible;
// presumably RightWordAttributes - confirm.
2505 &info->rword_indicates_list_item,
2506 &info->rword_likely_starts_idea,
2507 &info->rword_likely_ends_idea);
// Fragment of a function that takes an after_text_recognition flag, fills a
// RowInfo per row through InitializeRowInfo and then normalises the margins.
// NOTE(review): the line carrying the function name is not part of this
// excerpt; presumably this is the iterator-based DetectParagraphs overload -
// confirm. The number at the start of each line is the line number of the
// original file, and the gaps between those numbers are lines missing here.
2515 bool after_text_recognition,
// Populate ri for the current row, forwarding the recognition flag.
2539 InitializeRowInfo(after_text_recognition, row, &ri);
// Margin normalisation: find the smallest left and right distances over
// all rows, seeded from row 0.
2545 if (!row_infos.
empty()) {
2546 int min_lmargin = row_infos[0].pix_ldistance;
2547 int min_rmargin = row_infos[0].pix_rdistance;
2548 for (
int i = 1; i < row_infos.
size(); i++) {
2549 if (row_infos[i].pix_ldistance < min_lmargin)
2550 min_lmargin = row_infos[i].pix_ldistance;
2551 if (row_infos[i].pix_rdistance < min_rmargin)
2552 min_rmargin = row_infos[i].pix_rdistance;
// Subtract the minima from every row so the tightest row has distance 0 on
// that side. Skipped when both minima are not positive.
2554 if (min_lmargin > 0 || min_rmargin > 0) {
2555 for (
int i = 0; i < row_infos.
size(); i++) {
2556 row_infos[i].pix_ldistance -= min_lmargin;
2557 row_infos[i].pix_rdistance -= min_rmargin;
// The work guarded by this check is not visible in this excerpt.
2565 if (!is_image_block) {
// Walk every entry of row_owners. The loop body is not visible here.
2575 for (
int i = 0; i < row_owners.
size(); i++) {
void AssumeLeftJustification()
BLOCK_RES * block() const
GenericVector< ParagraphModel * > & models()
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool IsFullRow(int i) const
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
tesseract::ParagraphJustification justification() const
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Cluster(int cen, int num)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
ParagraphModel Model() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
const PAGE_RES_IT * PageResIt() const
const char * string() const
const ParagraphModel * kCrownRight
void GetClusters(GenericVector< Cluster > *clusters)
TBOX bounding_box() const
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void DiscardUnusedModels(const SetOfModels &used_models)
bool AnyRtlCharsInWord() const
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
int AlignsideTabIndex(int row_idx) const
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
const ParagraphModel * AddModel(const ParagraphModel &model)
void NonCenteredModels(SetOfModels *models)
bool contains(const T &object) const
const ParagraphModel * UniqueBodyHypothesis() const
bool has_drop_cap() const
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
int get_index(const T &object) const
void StartHypotheses(SetOfModels *models) const
void AssumeRightJustification()
LineType GetLineType() const
void CanonicalizeDetectionResults(GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs)
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool flag(WERD_FLAGS mask) const
bool NearlyEqual(T x, T y, T tolerance)
const ParagraphModel * kCrownLeft
GenericVector< Cluster > right_tabs
bool get_isdigit(UNICHAR_ID unichar_id) const
const GenericVector< Cluster > & AlignTabs() const
tesseract::ParagraphJustification just
void init_to_size(int size, const T &t)
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
void AddStartLine(const ParagraphModel *model)
double ile(double frac) const
int push_back_new(const T &object)
int IndexOf(const ParagraphModel *model) const
POLY_BLOCK * poly_block() const
UNICHAR_ID unichar_id(int index) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
DLLSYM void tprintf(const char *format,...)
GenericVector< RowScratchRegisters > * rows
int OffsideIndent(tesseract::ParagraphJustification just) const
bool AnyLtrCharsInWord() const
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void add(int32_t value, int32_t count)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool AsciiLikelyListItem(const STRING &word)
SimpleClusterer(int max_cluster_width)
void Init(const RowInfo &row)
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
GenericVectorEqEq< const ParagraphModel * > SetOfModels
const ParagraphModel * model
virtual bool Next(PageIteratorLevel level)
const UNICHARSET * uch_set
const char * id_to_unichar(UNICHAR_ID id) const
bool StrongModel(const ParagraphModel *model)
const STRING & unichar_string() const
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void AddBodyLine(const ParagraphModel *model)
const GenericVector< Cluster > & OffsideTabs() const
int average_interword_space
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
GenericVector< Cluster > left_tabs
bool Empty(PageIteratorLevel level) const
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void StrongHypotheses(SetOfModels *models) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool FirstWordWouldHaveFit(int row_a, int row_b)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
void NonNullHypotheses(SetOfModels *models) const
void Fail(int min_debug_level, const char *why) const
WERD_CHOICE * best_choice
bool is_very_first_or_continuation
const ParagraphModel * UniqueStartHypothesis() const
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const