54 = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
56 = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
59 static bool LikelyParagraphStart(
const RowScratchRegisters &before,
60 const RowScratchRegisters &after,
66 static int Epsilon(
int space_pix) {
67 return space_pix * 4 / 5;
70 static bool AcceptableRowArgs(
71 int debug_level,
int min_num_rows,
const char *function_name,
73 int row_start,
int row_end) {
74 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
75 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
76 row_start, row_end, rows->
size());
79 if (row_end - row_start < min_num_rows) {
80 if (debug_level > 1) {
81 tprintf(
"# Too few rows[%d, %d) for %s.\n",
82 row_start, row_end, function_name);
92 static STRING StrOf(
int num) {
94 snprintf(buffer,
sizeof(buffer),
"%d", num);
103 for (
int r = 0; r < rows.size(); r++) {
104 int num_columns = rows[r].
size();
105 for (
int c = 0; c < num_columns; c++) {
106 int num_unicodes = 0;
107 for (
int i = 0; i < rows[r][c].size(); i++) {
108 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
110 if (c >= max_col_widths.
size()) {
113 if (num_unicodes > max_col_widths[c])
114 max_col_widths[c] = num_unicodes;
120 for (
int c = 0; c < max_col_widths.
size(); c++) {
122 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
125 for (
int r = 0; r < rows.size(); r++) {
126 for (
int c = 0; c < rows[r].size(); c++) {
129 tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
135 static STRING RtlEmbed(
const STRING &word,
bool rtlify) {
142 static void PrintDetectorState(
const ParagraphTheory &theory,
146 output.
back().push_back(
"#row");
147 output.
back().push_back(
"space");
148 output.
back().push_back(
"..");
149 output.
back().push_back(
"lword[widthSEL]");
150 output.
back().push_back(
"rword[widthSEL]");
152 output.
back().push_back(
"text");
154 for (
int i = 0; i < rows.
size(); i++) {
157 const RowInfo& ri = *rows[i].ri_;
159 row.
push_back(StrOf(ri.average_interword_space));
160 row.
push_back(ri.has_leaders ?
".." :
" ");
161 row.
push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
162 "[" + StrOf(ri.lword_box.width()) +
163 (ri.lword_likely_starts_idea ?
"S" :
"s") +
164 (ri.lword_likely_ends_idea ?
"E" :
"e") +
165 (ri.lword_indicates_list_item ?
"L" :
"l") +
167 row.
push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
168 "[" + StrOf(ri.rword_box.width()) +
169 (ri.rword_likely_starts_idea ?
"S" :
"s") +
170 (ri.rword_likely_ends_idea ?
"E" :
"e") +
171 (ri.rword_indicates_list_item ?
"L" :
"l") +
173 rows[i].AppendDebugInfo(theory, &row);
174 row.
push_back(RtlEmbed(ri.text, !ri.ltr));
176 PrintTable(output,
" ");
178 tprintf(
"Active Paragraph Models:\n");
179 for (
int m = 0; m < theory.models().size(); m++) {
180 tprintf(
" %d: %s\n", m + 1, theory.models()[m]->ToString().c_str());
184 static void DebugDump(
187 const ParagraphTheory &theory,
192 PrintDetectorState(theory, rows);
197 int row_start,
int row_end) {
198 tprintf(
"======================================\n");
199 for (
int row = row_start; row < row_end; row++) {
200 tprintf(
"%s\n", rows[row].ri_->text.c_str());
202 tprintf(
"======================================\n");
207 static bool IsLatinLetter(
int ch) {
208 return (ch >=
'a' && ch <=
'z') || (ch >=
'A' && ch <=
'Z');
211 static bool IsDigitLike(
int ch) {
212 return ch ==
'o' || ch ==
'O' || ch ==
'l' || ch ==
'I';
215 static bool IsOpeningPunct(
int ch) {
216 return strchr(
"'\"({[", ch) !=
nullptr;
219 static bool IsTerminalPunct(
int ch) {
220 return strchr(
":'\".?!]})", ch) !=
nullptr;
224 static const char *SkipChars(
const char *str,
const char *toskip) {
225 while (*str !=
'\0' && strchr(toskip, *str)) { str++; }
229 static const char *SkipChars(
const char *str,
bool (*skip)(
int)) {
230 while (*str !=
'\0' && skip(*str)) { str++; }
234 static const char *SkipOne(
const char *str,
const char *toskip) {
235 if (*str !=
'\0' && strchr(toskip, *str))
return str + 1;
242 static bool LikelyListNumeral(
const STRING &word) {
243 const char *kRomans =
"ivxlmdIVXLMD";
244 const char *kDigits =
"012345789";
245 const char *kOpen =
"[{(";
246 const char *kSep =
":;-.,";
247 const char *kClose =
"]})";
249 int num_segments = 0;
250 const char *pos = word.
c_str();
251 while (*pos !=
'\0' && num_segments < 3) {
253 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
254 const char *numeral_end = SkipChars(numeral_start, kRomans);
255 if (numeral_end != numeral_start) {
258 numeral_end = SkipChars(numeral_start, kDigits);
259 if (numeral_end == numeral_start) {
261 numeral_end = SkipChars(numeral_start, IsLatinLetter);
262 if (numeral_end - numeral_start != 1)
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270 if (pos == numeral_end)
276 static bool LikelyListMark(
const STRING &word) {
277 const char *kListMarks =
"0Oo*.,+.";
278 return word.
size() == 1 && strchr(kListMarks, word[0]) !=
nullptr;
282 return LikelyListMark(word) || LikelyListNumeral(word);
289 if (!u || !werd || pos > werd->
length())
299 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
317 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
323 IsDigitLike(
UnicodeFor(u_, word_, pos)))) pos++;
328 const char *kRomans =
"ivxlmdIVXLMD";
329 while (pos < wordlen_) {
331 if (ch >= 0xF0 || strchr(kRomans, ch) ==
nullptr)
break;
338 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
342 static bool LikelyListMarkUnicode(
int ch) {
346 return LikelyListMark(single_ch);
375 UnicodeSpanSkipper m(u, werd);
376 int num_segments = 0;
378 while (pos < werd->length() && num_segments < 3) {
379 int numeral_start = m.SkipPunc(pos);
380 if (numeral_start > pos + 1)
break;
381 int numeral_end = m.SkipRomans(numeral_start);
382 if (numeral_end == numeral_start) {
383 numeral_end = m.SkipDigits(numeral_start);
384 if (numeral_end == numeral_start) {
386 numeral_end = m.SkipAlpha(numeral_start);
387 if (numeral_end - numeral_start != 1)
394 pos = m.SkipPunc(numeral_end);
395 if (pos == numeral_end)
398 return pos == werd->
length();
410 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
412 *starts_idea =
false;
414 if (utf8.
size() == 0 || (werd !=
nullptr && werd->
length() == 0)) {
419 if (unicharset && werd) {
420 if (UniLikelyListItem(unicharset, werd)) {
437 int start_letter = utf8[0];
438 if (IsOpeningPunct(start_letter)) {
441 if (IsTerminalPunct(start_letter)) {
444 if (start_letter >=
'A' && start_letter <=
'Z') {
457 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
459 *starts_idea =
false;
461 if (utf8.
size() == 0 || (werd !=
nullptr && werd->
length() == 0)) {
466 if (unicharset && werd) {
467 if (UniLikelyListItem(unicharset, werd)) {
480 int last_letter = utf8[utf8.
size() - 1];
481 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
491 header->
push_back(
"[lmarg,lind;rind,rmarg]");
498 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
505 int model_numbers = 0;
506 for (
int h = 0; h < hypotheses_.size(); h++) {
507 if (hypotheses_[h].model ==
nullptr)
509 if (model_numbers > 0)
512 model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
513 }
else if (hypotheses_[h].model ==
kCrownLeft) {
514 model_string +=
"CrL";
516 model_string +=
"CrR";
520 if (model_numbers == 0)
535 if (hypotheses_.empty())
537 bool has_start =
false;
538 bool has_body =
false;
539 for (
int i = 0; i < hypotheses_.size(); i++) {
540 switch (hypotheses_[i].ty) {
542 case LT_BODY: has_body =
true;
break;
544 tprintf(
"Encountered bad value in hypothesis list: %c\n",
549 if (has_start && has_body)
555 if (hypotheses_.empty())
557 bool has_start =
false;
558 bool has_body =
false;
559 for (
int i = 0; i < hypotheses_.size(); i++) {
560 if (hypotheses_[i].model != model)
562 switch (hypotheses_[i].ty) {
563 case LT_START: has_start =
true;
break;
564 case LT_BODY: has_body =
true;
break;
566 tprintf(
"Encountered bad value in hypothesis list: %c\n",
571 if (has_start && has_body)
579 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
582 hypotheses_.push_back_new(LineHypothesis(
LT_START,
nullptr));
589 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
597 hypotheses_.push_back_new(LineHypothesis(
LT_START, model));
598 int old_idx = hypotheses_.get_index(LineHypothesis(
LT_START,
nullptr));
600 hypotheses_.remove(old_idx);
607 hypotheses_.remove(old_idx);
611 for (
int h = 0; h < hypotheses_.size(); h++) {
613 models->push_back_new(hypotheses_[h].model);
618 for (
int h = 0; h < hypotheses_.size(); h++) {
620 models->push_back_new(hypotheses_[h].model);
625 for (
int h = 0; h < hypotheses_.size(); h++) {
626 if (hypotheses_[h].model !=
nullptr)
627 models->push_back_new(hypotheses_[h].model);
632 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START)
634 return hypotheses_[0].model;
638 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY)
640 return hypotheses_[0].model;
648 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
649 if (!models.
contains(hypotheses_[h].model)) {
650 hypotheses_.remove(h);
665 class SimpleClusterer {
668 : max_cluster_width_(max_cluster_width) {}
670 int size()
const {
return values_.
size(); }
674 int max_cluster_width_;
681 for (
int i = 0; i < clusters.
size(); i++) {
682 if (abs(value - clusters[i].center) <
683 abs(value - clusters[best_index].center))
692 for (
int i = 0; i < values_.
size();) {
696 while (++i < values_.
size() && values_[i] <= lo + max_cluster_width_) {
699 clusters->
push_back(Cluster((hi + lo) / 2, i - orig_i));
706 int row_start,
int row_end,
int tolerance,
709 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
716 for (
int i = row_start; i < row_end; i++) {
717 initial_lefts.Add((*rows)[i].lindent_);
718 initial_rights.Add((*rows)[i].rindent_);
720 initial_lefts.GetClusters(&initial_left_tabs);
721 initial_rights.GetClusters(&initial_right_tabs);
736 int infrequent_enough_to_ignore = 0;
737 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
738 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
740 for (
int i = row_start; i < row_end; i++) {
741 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
742 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
743 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
744 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
745 lefts.Add((*rows)[i].lindent_);
746 rights.Add((*rows)[i].rindent_);
749 lefts.GetClusters(left_tabs);
750 rights.GetClusters(right_tabs);
752 if ((left_tabs->
size() == 1 && right_tabs->
size() >= 4) ||
753 (right_tabs->
size() == 1 && left_tabs->
size() >= 4)) {
758 for (
int i = row_start; i < row_end; i++) {
759 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
760 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
761 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
762 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
763 lefts.Add((*rows)[i].lindent_);
764 rights.Add((*rows)[i].rindent_);
768 lefts.GetClusters(left_tabs);
769 rights.GetClusters(right_tabs);
773 if (left_tabs->
size() == 3 && right_tabs->
size() >= 4) {
775 for (
int i = left_tabs->
size() - 1; i >= 0; i--) {
777 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
782 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783 left_tabs->
remove(to_prune);
786 if (right_tabs->
size() == 3 && left_tabs->
size() >= 4) {
788 for (
int i = right_tabs->
size() - 1; i >= 0; i--) {
790 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
795 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
796 right_tabs->
remove(to_prune);
821 int row_start,
int row_end,
823 bool ltr,
int eop_threshold) {
824 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
826 for (
int row = row_start; row < row_end; row++) {
829 if (valid_first && !valid_body) {
830 (*rows)[row].AddStartLine(model);
831 }
else if (valid_body && !valid_first) {
832 (*rows)[row].AddBodyLine(model);
833 }
else if (valid_body && valid_first) {
834 bool after_eop = (row == row_start);
835 if (row > row_start) {
836 if (eop_threshold > 0) {
838 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
840 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
848 (*rows)[row].AddStartLine(model);
850 (*rows)[row].AddBodyLine(model);
866 struct GeometricClassifierState {
869 int r_start,
int r_end)
872 CalculateTabStops(r, r_start, r_end,
tolerance,
875 tprintf(
"Geometry: TabStop cluster tolerance = %d; "
876 "%d left tabs; %d right tabs\n",
879 ltr = (*r)[r_start].ri_->ltr;
911 return ClosestCluster(
left_tabs, (*
rows)[i].lindent_) == 0 &&
928 void Fail(
int min_debug_level,
const char *why)
const {
996 static void GeometricClassifyThreeTabStopTextBlock(
1000 int num_rows = s.row_end - s.row_start;
1001 int num_full_rows = 0;
1002 int last_row_full = 0;
1003 for (
int i = s.row_start; i < s.row_end; i++) {
1004 if (s.IsFullRow(i)) {
1006 if (i == s.row_end - 1) last_row_full++;
1010 if (num_full_rows < 0.7 * num_rows) {
1011 s.Fail(1,
"Not enough full lines to know which lines start paras.");
1016 s.eop_threshold = 0;
1019 s.AssumeLeftJustification();
1021 s.AssumeRightJustification();
1024 if (debug_level > 0) {
1025 tprintf(
"# Not enough variety for clear outline classification. "
1026 "Guessing these are %s aligned based on script.\n",
1027 s.ltr ?
"left" :
"right");
1031 if (s.AlignTabs().size() == 2) {
1032 s.first_indent = s.AlignTabs()[1].center;
1033 s.body_indent = s.AlignTabs()[0].center;
1035 if (num_rows - 1 == num_full_rows - last_row_full) {
1038 (*s.rows)[s.row_start].AddStartLine(model);
1039 for (
int i = s.row_start + 1; i < s.row_end; i++) {
1040 (*s.rows)[i].AddBodyLine(model);
1045 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1046 s.eop_threshold = (s.OffsideTabs()[0].center +
1047 s.OffsideTabs()[1].center) / 2;
1051 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1052 s.ltr, s.eop_threshold);
1088 static void GeometricClassify(
int debug_level,
1090 int row_start,
int row_end,
1092 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1094 if (debug_level > 1) {
1095 tprintf(
"###############################################\n");
1096 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1097 row_start, row_end);
1098 tprintf(
"###############################################\n");
1102 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1103 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1104 s.Fail(2,
"Too much variety for simple outline classification.");
1107 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1108 s.Fail(1,
"Not enough variety for simple outline classification.");
1111 if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1112 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1124 if (s.right_tabs.size() > 2) {
1125 s.AssumeLeftJustification();
1126 }
else if (s.left_tabs.size() > 2) {
1127 s.AssumeRightJustification();
1129 s.AssumeLeftJustification();
1131 s.AssumeRightJustification();
1134 if (s.AlignTabs().size() == 2) {
1137 int firsts[2] = {0, 0};
1139 firsts[s.AlignsideTabIndex(s.row_start)]++;
1142 bool jam_packed =
true;
1143 for (
int i = s.row_start + 1; i < s.row_end; i++) {
1144 if (s.FirstWordWouldHaveFit(i - 1, i)) {
1145 firsts[s.AlignsideTabIndex(i)]++;
1153 if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1154 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1157 int percent0firsts, percent1firsts;
1158 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1159 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1162 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1163 percent0firsts + 30 < percent1firsts) {
1164 s.first_indent = s.AlignTabs()[1].center;
1165 s.body_indent = s.AlignTabs()[0].center;
1166 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1167 percent1firsts + 30 < percent0firsts) {
1168 s.first_indent = s.AlignTabs()[0].center;
1169 s.body_indent = s.AlignTabs()[1].center;
1172 if (debug_level > 1) {
1173 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1175 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1176 s.AlignTabs()[0].center, percent0firsts);
1177 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1178 s.AlignTabs()[1].center, percent1firsts);
1185 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1195 (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1197 if (s.AlignTabs().size() == 2) {
1199 for (
int i = s.row_start; i < s.row_end - 1; i++) {
1202 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1204 s.eop_threshold = 0;
1210 for (
int i = s.row_start; i < s.row_end - 1; i++) {
1211 if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1213 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1215 s.eop_threshold = 0;
1220 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1226 for (
int i = 0; i < models_->
size(); i++) {
1227 if ((*models_)[i]->Comparable(model))
1228 return (*models_)[i];
1237 for (
int i = models_->
size() - 1; i >= 0; i--) {
1239 if (!used_models.contains(m) && models_we_added_.
contains(m)) {
1252 for (
int m = 0; m < models_->
size(); m++) {
1262 for (
int m = 0; m < models_->
size(); m++) {
1270 for (
int i = 0; i < models_->
size(); i++) {
1271 if ((*models_)[i] == model)
1280 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1284 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1291 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1295 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1296 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1302 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1305 RowScratchRegisters &row_a = (*rows)[a];
1306 RowScratchRegisters &row_b = (*rows)[b];
1308 return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1309 row_b.rindent_ + row_b.rmargin_,
1310 Epsilon(row_a.ri_->average_interword_space));
1312 return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1313 row_b.lindent_ + row_b.lmargin_,
1314 Epsilon(row_a.ri_->average_interword_space));
1322 int row_start,
int row_end, ParagraphTheory *theory)
1323 : theory_(theory), rows_(rows), row_start_(row_start),
1325 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1331 for (
int row = row_start - 1; row <= row_end; row++) {
1332 open_models_.push_back(no_models);
1337 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1339 if (row_start < row_start_) row_start = row_start_;
1340 if (row_end > row_end_) row_end = row_end_;
1342 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1344 if ((*rows_)[row].ri_->num_words == 0) {
1345 OpenModels(row + 1) = no_models;
1348 (*rows_)[row].StartHypotheses(&opened);
1352 for (
int m = 0; m < opened.
size(); m++) {
1361 OpenModels(row + 1) = still_open;
1367 void ParagraphModelSmearer::Smear() {
1368 CalculateOpenModels(row_start_, row_end_);
1373 for (
int i = row_start_; i < row_end_; i++) {
1382 bool left_align_open =
false;
1383 bool right_align_open =
false;
1384 for (
int m = 0; m < OpenModels(i).size(); m++) {
1385 switch (OpenModels(i)[m]->justification()) {
1388 default: left_align_open = right_align_open =
true;
1396 likely_start =
true;
1398 if ((left_align_open && right_align_open) ||
1399 (!left_align_open && !right_align_open)) {
1400 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1402 LikelyParagraphStart((*rows_)[i - 1], row,
1404 }
else if (left_align_open) {
1405 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1408 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1419 for (
int m = 0; m < OpenModels(i).size(); m++) {
1428 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1430 theory_->NonCenteredModels(&last_line_models);
1432 for (
int m = 0; m < last_line_models.size(); m++) {
1446 theory_->NonCenteredModels(&all_models);
1447 for (
int m = 0; m < all_models.size(); m++) {
1457 CalculateOpenModels(i + 1, row_end_);
1467 ParagraphTheory *theory) {
1469 for (
int i = 0; i < rows.
size(); i++) {
1470 rows[i].StrongHypotheses(&used_models);
1472 theory->DiscardUnusedModels(used_models);
1499 static void DowngradeWeakestToCrowns(
int debug_level, ParagraphTheory *theory,
1502 for (
int end = rows->
size(); end > 0; end = start) {
1506 (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
nullptr) {
1509 if (end == 0)
break;
1511 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1514 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1540 (*rows)[start].SetUnknown();
1541 (*rows)[start].AddStartLine(crown_model);
1542 for (
int row = start + 1; row < end; row++) {
1543 (*rows)[row].SetUnknown();
1544 (*rows)[row].AddBodyLine(crown_model);
1548 DiscardUnusedModels(*rows, theory);
1571 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1574 int lmin, lmax, rmin, rmax;
1575 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1576 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1577 for (
int i = start; i < end; i++) {
1578 RowScratchRegisters &sr = (*rows)[i];
1580 if (sr.ri_->num_words == 0)
1582 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1585 STATS lefts(lmin, lmax + 1);
1586 STATS rights(rmin, rmax + 1);
1587 for (
int i = start; i < end; i++) {
1588 RowScratchRegisters &sr = (*rows)[i];
1589 if (sr.ri_->num_words == 0)
1591 lefts.add(sr.lmargin_ + sr.lindent_, 1);
1592 rights.add(sr.rmargin_ + sr.rindent_, 1);
1594 int ignorable_left = lefts.ile(
ClipToRange(percentile, 0, 100) / 100.0);
1595 int ignorable_right = rights.ile(
ClipToRange(percentile, 0, 100) / 100.0);
1596 for (
int i = start; i < end; i++) {
1597 RowScratchRegisters &sr = (*rows)[i];
1598 int ldelta = ignorable_left - sr.lmargin_;
1599 sr.lmargin_ += ldelta;
1600 sr.lindent_ -= ldelta;
1601 int rdelta = ignorable_right - sr.rmargin_;
1602 sr.rmargin_ += rdelta;
1603 sr.rindent_ -= rdelta;
1609 int row_start,
int row_end) {
1610 if (row_end < row_start + 1)
return 1;
1611 int word_height = (rows[row_start].ri_->lword_box.height() +
1612 rows[row_end - 1].ri_->lword_box.height()) / 2;
1613 int word_width = (rows[row_start].ri_->lword_box.width() +
1614 rows[row_end - 1].ri_->lword_box.width()) / 2;
1615 STATS spacing_widths(0, 5 + word_width);
1616 for (
int i = row_start; i < row_end; i++) {
1617 if (rows[i].ri_->num_words > 1) {
1618 spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1621 int minimum_reasonable_space = word_height / 3;
1622 if (minimum_reasonable_space < 2)
1623 minimum_reasonable_space = 2;
1624 int median = spacing_widths.median();
1625 return (median > minimum_reasonable_space)
1626 ? median : minimum_reasonable_space;
1632 const RowScratchRegisters &after,
1634 if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1638 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1640 int available_space;
1642 available_space = before.lindent_ + before.rindent_;
1644 available_space = before.OffsideIndent(justification);
1646 available_space -= before.ri_->average_interword_space;
1648 if (before.ri_->ltr)
1649 return after.ri_->lword_box.width() < available_space;
1650 return after.ri_->rword_box.width() < available_space;
1657 const RowScratchRegisters &after) {
1658 if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1661 int available_space = before.lindent_;
1662 if (before.rindent_ > available_space)
1663 available_space = before.rindent_;
1664 available_space -= before.ri_->average_interword_space;
1666 if (before.ri_->ltr)
1667 return after.ri_->lword_box.width() < available_space;
1668 return after.ri_->rword_box.width() < available_space;
1682 static bool LikelyParagraphStart(
const RowScratchRegisters &before,
1683 const RowScratchRegisters &after,
1685 return before.ri_->num_words == 0 ||
1687 TextSupportsBreak(before, after));
1697 int start,
int end,
int tolerance,
bool *consistent) {
1698 int ltr_line_count = 0;
1699 for (
int i = start; i < end; i++) {
1700 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1702 bool ltr = (ltr_line_count >= (end - start) / 2);
1705 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1710 int lmargin = (*rows)[start].lmargin_;
1711 int rmargin = (*rows)[start].rmargin_;
1712 int lmin, lmax, rmin, rmax, cmin, cmax;
1713 lmin = lmax = (*rows)[start + 1].lindent_;
1714 rmin = rmax = (*rows)[start + 1].rindent_;
1716 for (
int i = start + 1; i < end; i++) {
1717 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1718 tprintf(
"Margins don't match! Software error.\n");
1719 *consistent =
false;
1724 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1726 int ldiff = lmax - lmin;
1727 int rdiff = rmax - rmin;
1728 int cdiff = cmax - cmin;
1729 if (rdiff > tolerance && ldiff > tolerance) {
1730 if (cdiff < tolerance * 2) {
1731 if (end - start < 3)
1735 *consistent =
false;
1738 if (end - start < 3)
1743 bool body_admits_left_alignment = ldiff < tolerance;
1744 bool body_admits_right_alignment = rdiff < tolerance;
1748 (lmin + lmax) / 2, tolerance);
1751 (rmin + rmax) / 2, tolerance);
1755 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1756 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1761 if (tolerance < rdiff) {
1762 if (body_admits_left_alignment && text_admits_left_alignment)
1764 *consistent =
false;
1767 if (tolerance < ldiff) {
1768 if (body_admits_right_alignment && text_admits_right_alignment)
1770 *consistent =
false;
1778 int first_left = (*rows)[start].lindent_;
1779 int first_right = (*rows)[start].rindent_;
1781 if (ltr && body_admits_left_alignment &&
1782 (first_left < lmin || first_left > lmax))
1784 if (!ltr && body_admits_right_alignment &&
1785 (first_right < rmin || first_right > rmax))
1788 *consistent =
false;
1799 int start,
int end,
int tolerance) {
1800 bool unused_consistent;
1802 rows, start, end, tolerance, &unused_consistent);
1804 tprintf(
"Could not determine a model for this paragraph:\n");
1805 PrintRowRange(*rows, start, end);
1813 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1816 for (
int i = start + 1 ; i < end; i++) {
1834 int row_start,
int row_end) {
1836 for (
int i = row_start + 1; i < row_end; i++) {
1874 for (
int i = row_start + 1; i < row_end - 1; i++) {
1882 LikelyParagraphStart(prev, curr, j)) {
1894 LikelyParagraphStart(prev, curr, j)) {
1903 static void ModelStrongEvidence(
int debug_level,
1905 int row_start,
int row_end,
1906 bool allow_flush_models,
1907 ParagraphTheory *theory) {
1908 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1911 int start = row_start;
1912 while (start < row_end) {
1913 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1915 if (start >= row_end - 1)
1918 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1921 bool next_consistent;
1927 if (end < row_end - 1) {
1928 RowScratchRegisters &next = (*rows)[end];
1930 next_consistent = lt ==
LT_BODY ||
1934 next_consistent =
false;
1936 if (next_consistent) {
1938 rows, start, end + 1, tolerance, &next_consistent);
1939 if (((*rows)[start].ri_->ltr &&
1942 (!(*rows)[start].ri_->ltr &&
1945 next_consistent =
false;
1947 last_model = next_model;
1949 next_consistent =
false;
1951 }
while (next_consistent && end < row_end);
1955 if (end > start + 1) {
1959 debug_level, rows, start, end,
1964 if (end == start + 2) {
1967 }
else if (start == row_start) {
1974 }
else if (allow_flush_models) {
1975 model = theory->AddModel(new_model);
1978 model = theory->AddModel(new_model);
1981 (*rows)[start].AddStartLine(model);
1982 for (
int i = start + 1; i < end; i++) {
1983 (*rows)[i].AddBodyLine(model);
1998 static void StrongEvidenceClassify(
int debug_level,
2000 int row_start,
int row_end,
2001 ParagraphTheory *theory) {
2002 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2005 if (debug_level > 1) {
2006 tprintf(
"#############################################\n");
2007 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2008 tprintf(
"#############################################\n");
2012 MarkStrongEvidence(rows, row_start, row_end);
2014 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2017 ModelStrongEvidence(debug_level, rows, row_start, row_end,
false, theory);
2019 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2024 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2029 int row_start,
int row_end,
2030 ParagraphTheory *theory) {
2031 for (
int i = row_start + 1; i < row_end - 1; i++) {
2032 if ((*rows)[i - 1].ri_->has_leaders &&
2033 (*rows)[i].ri_->has_leaders &&
2034 (*rows)[i + 1].ri_->has_leaders) {
2037 (*rows)[i].AddStartLine(model);
2044 static void ConvertHypothesizedModelRunsToParagraphs(
2048 ParagraphTheory *theory) {
2049 int end = rows.
size();
2051 for (; end > 0; end = start) {
2055 bool single_line_paragraph =
false;
2057 rows[start].NonNullHypotheses(&models);
2058 if (!models.empty()) {
2060 if (rows[start].GetLineType(model) !=
LT_BODY)
2061 single_line_paragraph =
true;
2063 if (model && !single_line_paragraph) {
2065 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2068 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2072 if (model ==
nullptr) {
2082 for (
int row = end; row < rows.
size(); row++) {
2083 if ((*row_owners)[row] &&
2087 model = (*row_owners)[row]->model;
2095 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2100 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2103 rows[start].SetUnknown();
2104 rows[start].AddStartLine(model);
2105 for (
int i = start + 1; i < end; i++) {
2106 rows[i].SetUnknown();
2107 rows[i].AddBodyLine(model);
2113 ? rows[start].ri_->rword_indicates_list_item
2114 : rows[start].ri_->lword_indicates_list_item;
2115 for (
int row = start; row < end; row++) {
2116 if ((*row_owners)[row] !=
nullptr) {
2117 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2118 "more than once!\n");
2119 delete (*row_owners)[row];
2121 (*row_owners)[row] = p;
2127 Interval() : begin(0), end(0) {}
2128 Interval(
int b,
int e) : begin(b), end(e) {}
2146 rows[row].StrongHypotheses(&row_models);
2148 for (
int m = 0; m < row_models.
size(); m++) {
2149 bool all_starts = rows[row].GetLineType();
2151 bool continues =
true;
2152 for (
int i = row - 1; i >= 0 && continues; i--) {
2154 rows[i].NonNullHypotheses(&models);
2155 switch (rows[i].GetLineType(row_models[m])) {
2156 case LT_START: run_length++;
break;
2158 case LT_BODY: run_length++; all_starts =
false;
break;
2160 default: continues =
false;
2164 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2166 rows[i].NonNullHypotheses(&models);
2167 switch (rows[i].GetLineType(row_models[m])) {
2168 case LT_START: run_length++;
break;
2170 case LT_BODY: run_length++; all_starts =
false;
break;
2172 default: continues =
false;
2175 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2188 int row_start,
int row_end) {
2190 for (
int i = row_start; i < row_end; i++) {
2191 bool needs_fixing =
false;
2195 rows[i].StrongHypotheses(&models);
2196 rows[i].NonNullHypotheses(&models_w_crowns);
2197 if (models.empty() && !models_w_crowns.empty()) {
2199 for (
int end = i + 1; end < rows.
size(); end++) {
2202 rows[end].NonNullHypotheses(&end_models);
2203 rows[end].StrongHypotheses(&strong_end_models);
2204 if (end_models.empty()) {
2205 needs_fixing =
true;
2207 }
else if (!strong_end_models.empty()) {
2208 needs_fixing =
false;
2212 }
else if (models.empty() && rows[i].ri_->num_words > 0) {
2214 needs_fixing =
true;
2217 if (!needs_fixing && !models.empty()) {
2218 needs_fixing = RowIsStranded(rows, i);
2222 if (!to_fix->
empty() && to_fix->
back().end == i - 1)
2223 to_fix->
back().end = i;
2229 for (
int i = 0; i < to_fix->
size(); i++) {
2230 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2239 PARA_LIST *paragraphs) {
2241 paragraphs->
clear();
2242 PARA_IT out(paragraphs);
2243 PARA *formerly_null =
nullptr;
2244 for (
int i = 0; i < rows.
size(); i++) {
2245 if (rows[i] ==
nullptr) {
2246 if (i == 0 || rows[i - 1] != formerly_null) {
2247 rows[i] = formerly_null =
new PARA();
2249 rows[i] = formerly_null;
2252 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2255 out.add_after_then_move(rows[i]);
2272 PARA_LIST *paragraphs,
2275 ParagraphTheory theory(models);
2282 for (
int i = 0; i < row_infos->
size(); i++) {
2283 rows[i].Init((*row_infos)[i]);
2291 SeparateSimpleLeaderLines(&rows, 0, rows.
size(), &theory);
2293 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2296 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2297 for (
int i = 0; i < leftovers.
size(); i++) {
2303 StrongEvidenceClassify(debug_level, &rows,
2304 leftovers[i].begin, leftovers[i].end, &theory);
2311 LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2312 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2313 (leftovers2.
size() == 1 &&
2314 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2315 if (pass2a_was_useful) {
2316 for (
int j = 0; j < leftovers2.
size(); j++) {
2317 StrongEvidenceClassify(debug_level, &rows,
2318 leftovers2[j].begin, leftovers2[j].end,
2324 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2330 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2331 for (
int i = 0; i < leftovers.
size(); i++) {
2332 GeometricClassify(debug_level, &rows,
2333 leftovers[i].begin, leftovers[i].end, &theory);
2337 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2339 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2343 LeftoverSegments(rows, &leftovers, 0, rows.
size());
2344 for (
int i = 0; i < leftovers.
size(); i++) {
2345 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2346 rows[j].SetUnknown();
2350 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2353 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2356 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2364 static void InitializeTextAndBoxesPreRecognition(
const MutableIterator &it,
2368 PageIterator pit(static_cast<const PageIterator&>(it));
2369 bool first_word =
true;
2373 if (first_word) info->lword_text +=
"x";
2374 info->rword_text +=
"x";
2378 info->rword_text =
"";
2384 if (fake_text.
size() == 0)
return;
2386 int lspaces = info->pix_ldistance / info->average_interword_space;
2387 for (
int i = 0; i < lspaces; i++) {
2390 info->text += fake_text;
2399 info->num_words = 0;
2402 if (!lword) lword = word_res;
2403 if (rword != word_res) info->num_words++;
2406 word_res = page_res_it.
forward();
2407 }
while (page_res_it.
row() == this_row);
2416 static void InitializeRowInfo(
bool after_recognition,
2417 const MutableIterator &it, RowInfo *info) {
2418 if (it.PageResIt()->row() !=
nullptr) {
2419 ROW *row = it.PageResIt()->row()->row;
2420 info->pix_ldistance = row->
lmargin();
2421 info->pix_rdistance = row->
rmargin();
2422 info->average_interword_space =
2424 info->pix_xheight = row->
x_height();
2425 info->has_leaders =
false;
2429 info->pix_ldistance = info->pix_rdistance = 0;
2430 info->average_interword_space = 1;
2431 info->pix_xheight = 1.0;
2432 info->has_leaders =
false;
2433 info->has_drop_cap =
false;
2437 info->num_words = 0;
2438 info->lword_indicates_list_item =
false;
2439 info->lword_likely_starts_idea =
false;
2440 info->lword_likely_ends_idea =
false;
2441 info->rword_indicates_list_item =
false;
2442 info->rword_likely_starts_idea =
false;
2443 info->rword_likely_ends_idea =
false;
2444 info->has_leaders =
false;
2447 if (!after_recognition) {
2448 InitializeTextAndBoxesPreRecognition(it, info);
2452 const std::unique_ptr<const char[]> text(it.GetUTF8Text(
RIL_TEXTLINE));
2453 int trailing_ws_idx = strlen(text.get());
2454 while (trailing_ws_idx > 0 &&
2456 isascii(text[trailing_ws_idx - 1]) &&
2457 isspace(text[trailing_ws_idx - 1]))
2459 if (trailing_ws_idx > 0) {
2460 int lspaces = info->pix_ldistance / info->average_interword_space;
2461 for (
int i = 0; i < lspaces; i++)
2463 for (
int i = 0; i < trailing_ws_idx; i++)
2464 info->text += text[i];
2467 if (info->text.size() == 0) {
2475 int num_leaders = 0;
2485 word_res = page_res_it.
forward();
2486 }
while (page_res_it.
row() == this_row);
2487 info->ltr = ltr >= rtl;
2488 info->has_leaders = num_leaders > 3;
2489 info->num_words = werds.
size();
2490 if (!werds.
empty()) {
2491 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2493 info->rword_text = rword->best_choice->unichar_string().c_str();
2495 info->rword_box = rword->word->bounding_box();
2498 &info->lword_indicates_list_item,
2499 &info->lword_likely_starts_idea,
2500 &info->lword_likely_ends_idea);
2503 &info->rword_indicates_list_item,
2504 &info->rword_likely_starts_idea,
2505 &info->rword_likely_ends_idea);
2513 bool after_text_recognition,
2514 const MutableIterator *block_start,
2520 BLOCK *block = block_start->PageResIt()->block()->block;
2526 MutableIterator row(*block_start);
2532 if (!row.PageResIt()->row())
2534 row.PageResIt()->row()->row->
set_para(
nullptr);
2537 InitializeRowInfo(after_text_recognition, row, &ri);
2543 if (!row_infos.
empty()) {
2544 int min_lmargin = row_infos[0].pix_ldistance;
2545 int min_rmargin = row_infos[0].pix_rdistance;
2546 for (
int i = 1; i < row_infos.
size(); i++) {
2547 if (row_infos[i].pix_ldistance < min_lmargin)
2548 min_lmargin = row_infos[i].pix_ldistance;
2549 if (row_infos[i].pix_rdistance < min_rmargin)
2550 min_rmargin = row_infos[i].pix_rdistance;
2552 if (min_lmargin > 0 || min_rmargin > 0) {
2553 for (
int i = 0; i < row_infos.
size(); i++) {
2554 row_infos[i].pix_ldistance -= min_lmargin;
2555 row_infos[i].pix_rdistance -= min_rmargin;
2563 if (!is_image_block) {
2573 for (
int i = 0; i < row_owners.
size(); i++) {
2574 while (!row.PageResIt()->row())
2576 row.PageResIt()->row()->row->
set_para(row_owners[i]);