tesseract  5.0.0-alpha-619-ge9db
paragraphs_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <string> // for std::string
13 
14 #include "absl/strings/str_cat.h" // for absl::StrCat
15 #include "absl/strings/str_join.h" // for absl::StrJoin
16 #include "absl/strings/str_split.h" // for absl::StrSplit
17 
18 #include "include_gunit.h" // for TEST
19 #include "log.h" // for LOG
20 
22 // ccmain
23 #include "paragraphs.h"
24 #include "paragraphs_internal.h"
25 // ccstruct
26 #include "ocrpara.h"
27 
28 namespace { // anonymous namespace
29 
30 // Functions for making monospace ASCII trial text for the paragraph detector.
35 const tesseract::ParagraphJustification kUnknown =
37 
39  PCONT = 0, // Continuation line of a paragraph (default).
40  PSTART = 1, // First line of a paragraph.
41  PNONE = 2, // Not a paragraph line.
42 };
43 
// One row of hand-written ground truth: the text of a line plus what the
// paragraph detector is expected to report for that line. The fixture tables
// in this file aggregate-initialise it, so member order matters.
44 struct TextAndModel {
 // Text of the row; MakeAsciiRowInfos() hands it to AsciiToRowInfo().
45  const char* ascii;
 // Expected role of the row; the tables use PSTART, PCONT and PNONE.
46  TextModelInputType model_type;
47 
48  // fields corresponding to PARA (see ccstruct/ocrpara.h)
 // Expected model; only compared on rows that are not PCONT.
49  ParagraphModel model;
 // Printed as " crown" in the debug dump of EvaluateParagraphDetection().
50  bool is_very_first_or_continuation;
 // Printed as " li" in the debug dump of EvaluateParagraphDetection().
51  bool is_list_item;
52 };
53 
54 // Imagine that the given text is typewriter ASCII with each character ten
55 // pixels wide and twenty pixels high and return an appropriate row_info.
// `row_number` only feeds the vertical position of the word boxes; every
// other field written here is derived from `text`.
56 void AsciiToRowInfo(const char* text, int row_number,
57  tesseract::RowInfo* info) {
58  const int kCharWidth = 10;
59  const int kLineSpace = 30;
60  info->text = text;
 // Either spelling of leader dots ("..." or ". . .") marks the row.
61  info->has_leaders =
62  strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
63  info->has_drop_cap = false;
64  info->pix_ldistance = info->pix_rdistance = 0;
65  info->average_interword_space = kCharWidth;
66  info->pix_xheight = kCharWidth;
67  info->lword_text = info->rword_text = "";
68  info->ltr = true;
69 
70  std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty());
71  info->num_words = words.size();
 // A row without words keeps the defaults assigned above; the word boxes and
 // everything below are left untouched.
72  if (info->num_words < 1) return;
73 
74  info->lword_text = words[0].c_str();
75  info->rword_text = words[words.size() - 1].c_str();
 // Count the spaces in front of the first word ...
76  int lspace = 0;
77  while (lspace < info->text.size() && text[lspace] == ' ') {
78  lspace++;
79  }
 // ... and the spaces behind the last word.
80  int rspace = 0;
81  while (rspace < info->text.size() &&
82  text[info->text.size() - rspace - 1] == ' ') {
83  rspace++;
84  }
85 
 // Each successive row sits kLineSpace lower (more negative y) than the one
 // before it; row 0 spans y from -kLineSpace to 0.
86  int top = -kLineSpace * row_number;
87  int bottom = top - kLineSpace;
88  int row_right = kCharWidth * info->text.size();
89  int lword_width = kCharWidth * info->lword_text.size();
90  int rword_width = kCharWidth * info->rword_text.size();
91  info->pix_ldistance = lspace * kCharWidth;
92  info->pix_rdistance = rspace * kCharWidth;
 // The left word box starts after the leading spaces; the right word box
 // ends before the trailing spaces.
93  info->lword_box =
94  TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
95  info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,
96  row_right - info->pix_rdistance, top);
 // NOTE(review): the calls that receive the two argument lists below are cut
 // off in this listing; presumably they fill in the list-item indicators for
 // the left and right word - confirm against the complete file.
98  nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
101  nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
103 }
104 
// Empties `output` and then appends one RowInfo per ground-truth row, built
// by AsciiToRowInfo() from the row's text and its index.
// NOTE(review): the signature line that declares `output` is missing from
// this listing - confirm its type against the complete file.
105 void MakeAsciiRowInfos(const TextAndModel* row_infos, int n,
107  output->clear();
 // A single RowInfo is reused for all rows. AsciiToRowInfo() returns early
 // for a row without words, so fields it did not reach keep the values left
 // by the previous row.
108  tesseract::RowInfo info;
109  for (int i = 0; i < n; i++) {
110  AsciiToRowInfo(row_infos[i].ascii, i, &info);
111  output->push_back(info);
112  }
113 }
114 
115 // Given n rows of reference ground truth, evaluate whether the n rows
116 // of PARA * pointers yield the same paragraph breakpoints.
117 void EvaluateParagraphDetection(const TextAndModel* correct, int n,
118  const GenericVector<PARA*>& detector_output) {
119  int incorrect_breaks = 0;
120  int missed_breaks = 0;
121  int poorly_matched_models = 0;
122  int bad_crowns = 0;
123  int bad_list_items = 0;
124  ASSERT_EQ(detector_output.size(), n);
125  for (int i = 1; i < n; i++) {
126  bool has_break = correct[i].model_type != PCONT;
127  bool detected_break = (detector_output[i - 1] != detector_output[i]);
128  if (has_break && !detected_break) missed_breaks++;
129  if (detected_break && !has_break) incorrect_breaks++;
130  if (has_break) {
131  if (correct[i].model_type == PNONE) {
132  if (detector_output[i]->model != nullptr) {
133  poorly_matched_models++;
134  }
135  } else {
136  if (correct[i].model.justification() != kUnknown &&
137  (detector_output[i]->model == nullptr ||
138  !correct[i].model.Comparable(*detector_output[i]->model))) {
139  poorly_matched_models++;
140  }
141  }
142  if (correct[i].is_very_first_or_continuation ^
143  detector_output[i]->is_very_first_or_continuation) {
144  bad_crowns++;
145  }
146  if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {
147  bad_list_items++;
148  }
149  }
150  }
151  EXPECT_EQ(incorrect_breaks, 0);
152  EXPECT_EQ(missed_breaks, 0);
153  EXPECT_EQ(poorly_matched_models, 0);
154  EXPECT_EQ(bad_list_items, 0);
155  EXPECT_EQ(bad_crowns, 0);
156  if (incorrect_breaks || missed_breaks || poorly_matched_models ||
157  bad_list_items || bad_crowns) {
158  std::vector<std::string> dbg_lines;
159  dbg_lines.push_back("# ==========================");
160  dbg_lines.push_back("# Correct paragraph breaks:");
161  dbg_lines.push_back("# ==========================");
162  for (int i = 0; i < n; i++) {
163  if (correct[i].model_type != PCONT) {
164  dbg_lines.push_back(absl::StrCat(
165  correct[i].ascii, " # ", correct[i].model.ToString().c_str(),
166  correct[i].is_very_first_or_continuation ? " crown" : "",
167  correct[i].is_list_item ? " li" : ""));
168  } else {
169  dbg_lines.push_back(correct[i].ascii);
170  }
171  }
172  dbg_lines.push_back("");
173  dbg_lines.push_back("# ==========================");
174  dbg_lines.push_back("# Paragraph detector output:");
175  dbg_lines.push_back("# ==========================");
176  for (int i = 0; i < n; i++) {
177  std::string annotation;
178  if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
179  if (detector_output[i] && detector_output[i]->model) {
180  annotation += absl::StrCat(
181  " # ", detector_output[i]->model->ToString().c_str(),
182  detector_output[i]->is_very_first_or_continuation ? " crown" : "",
183  detector_output[i]->is_list_item ? " li" : "");
184  } else {
185  annotation = " # Unmodeled paragraph.";
186  }
187  }
188  dbg_lines.push_back(absl::StrCat(correct[i].ascii, annotation));
189  }
190  LOG(INFO) << "Discrepency!\n" << absl::StrJoin(dbg_lines, "\n");
191  }
192 }
193 
// Runs the paragraph detector over `num_rows` ground-truth rows and checks
// the result with EvaluateParagraphDetection().
// NOTE(review): the declarations of `row_infos` and `models` are missing from
// this listing - confirm their types against the complete file.
194 void TestParagraphDetection(const TextAndModel* correct, int num_rows) {
196  GenericVector<PARA*> row_owners;
197  PARA_LIST paragraphs;
199 
200  MakeAsciiRowInfos(correct, num_rows, &row_infos);
201  int debug_level(3);
202  tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs,
203  &models);
204  EvaluateParagraphDetection(correct, num_rows, row_owners);
 // The entries of `models` are released explicitly once evaluation is done.
205  models.delete_data_pointers();
206 }
207 
208 TEST(ParagraphsTest, ListItemsIdentified) {
209  EXPECT_TRUE(tesseract::AsciiLikelyListItem("iii"));
210  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A."));
211  EXPECT_TRUE(tesseract::AsciiLikelyListItem("B."));
212  EXPECT_TRUE(tesseract::AsciiLikelyListItem("C."));
213  EXPECT_TRUE(tesseract::AsciiLikelyListItem("1."));
214  EXPECT_TRUE(tesseract::AsciiLikelyListItem("2."));
215  EXPECT_TRUE(tesseract::AsciiLikelyListItem("3."));
216  EXPECT_TRUE(tesseract::AsciiLikelyListItem("1"));
217  EXPECT_TRUE(tesseract::AsciiLikelyListItem("2"));
218  EXPECT_TRUE(tesseract::AsciiLikelyListItem("3"));
219  EXPECT_TRUE(tesseract::AsciiLikelyListItem("[[1]]"));
220  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-1."));
221  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-2"));
222  EXPECT_TRUE(tesseract::AsciiLikelyListItem("(A)(i)"));
223 
224  EXPECT_FALSE(tesseract::AsciiLikelyListItem("The"));
225  EXPECT_FALSE(tesseract::AsciiLikelyListItem("first"));
226  EXPECT_FALSE(tesseract::AsciiLikelyListItem("house"));
227  EXPECT_FALSE(tesseract::AsciiLikelyListItem("Oregonian."));
228  EXPECT_FALSE(tesseract::AsciiLikelyListItem("on."));
229 }
230 
231 typedef ParagraphModel PModel;
232 
// Fixture: two paragraphs. Rows 0 and 3 are labelled PSTART and begin with
// leading whitespace; every other row is a continuation. No row is expected
// to be a crown or a list item.
233 const TextAndModel kTwoSimpleParagraphs[] = {
234  {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
235  {"This paragraph starts at the top", PCONT, PModel(), false, false},
236  {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
237  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
238  {"which indicates that the first ", PCONT, PModel(), false, false},
239  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
240  {"from a previous page, as it is ", PCONT, PModel(), false, false},
241  {"indented just like this second ", PCONT, PModel(), false, false},
242  {"paragraph. ", PCONT, PModel(), false, false},
243 };
244 
// Runs the detector over every row of kTwoSimpleParagraphs.
245 TEST(ParagraphsTest, TestSimpleParagraphDetection) {
246  TestParagraphDetection(kTwoSimpleParagraphs,
247  ABSL_ARRAYSIZE(kTwoSimpleParagraphs));
248 }
249 
// Fixture: the paragraph opening at row 0 has no leading whitespace and is
// labelled is_very_first_or_continuation (a "crown"); a second, indented
// paragraph opens at row 2.
250 const TextAndModel kFewCluesWithCrown[] = {
251  {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
252  true, false},
253  {"of the page and takes two lines.", PCONT, PModel(), false, false},
254  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
255  {"which indicates that the first ", PCONT, PModel(), false, false},
256  {"paragraph is a continuation from", PCONT, PModel(), false, false},
257  {"a previous page, as it is ", PCONT, PModel(), false, false},
258  {"indented just like this second ", PCONT, PModel(), false, false},
259  {"paragraph. ", PCONT, PModel(), false, false},
260 };
261 
// Runs the detector over every row of kFewCluesWithCrown.
262 TEST(ParagraphsTest, TestFewCluesWithCrown) {
263  TestParagraphDetection(kFewCluesWithCrown,
264  ABSL_ARRAYSIZE(kFewCluesWithCrown));
265 }
266 
// Fixture: a crown paragraph at row 0 (is_very_first_or_continuation is
// true) followed by two indented paragraphs that share the same model.
267 const TextAndModel kCrownedParagraph[] = {
268  {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
269  true, false},
270  {"often not indented as the rest ", PCONT, PModel(), false, false},
271  {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
272  {"less it should be counted as the", PCONT, PModel(), false, false},
273  {"same type of paragraph. ", PCONT, PModel(), false, false},
274  {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
275  {"graphs are both indented two ", PCONT, PModel(), false, false},
276  {"spaces. ", PCONT, PModel(), false, false},
277  {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
278  {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false},
279 };
280 
// Runs the detector over every row of kCrownedParagraph.
281 TEST(ParagraphsTest, TestCrownParagraphDetection) {
282  TestParagraphDetection(kCrownedParagraph, ABSL_ARRAYSIZE(kCrownedParagraph));
283 }
284 
// Fixture: two paragraphs (rows 0 and 4) that both use
// PModel(kLeft, 0, 0, 0, 0); neither is labelled as a crown.
285 const TextAndModel kFlushLeftParagraphs[] = {
286  {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
287  {"flush left paragraphs (those", PCONT, PModel(), false, false},
288  {"with no body indent) are not", PCONT, PModel(), false, false},
289  {"actually crowns. ", PCONT, PModel(), false, false},
290  {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
291  {"also flush left aligned. Usual-", PCONT, PModel(), false, false},
292  {"ly, these paragraphs are set", PCONT, PModel(), false, false},
293  {"apart vertically by some white-", PCONT, PModel(), false, false},
294  {"space, but you can also detect", PCONT, PModel(), false, false},
295  {"them by observing the big empty", PCONT, PModel(), false, false},
296  {"space at the ends of the para-", PCONT, PModel(), false, false},
297  {"graphs. ", PCONT, PModel(), false, false},
298 };
299 
300 TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
301  TestParagraphDetection(kFlushLeftParagraphs,
302  ABSL_ARRAYSIZE(kFlushLeftParagraphs));
303 }
304 
// Fixture: a page with a single paragraph. Only row 0 is labelled PSTART and
// it is expected to be a crown; every other row is a continuation.
305 const TextAndModel kSingleFullPageContinuation[] = {
306  {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
307  {"continuation. It flows from", PCONT, PModel(), false, false},
308  {"line to line, using the full", PCONT, PModel(), false, false},
309  {"column width with no clear", PCONT, PModel(), false, false},
310  {"paragraph break, because it", PCONT, PModel(), false, false},
311  {"actually doesn't have one. It", PCONT, PModel(), false, false},
312  {"is the middle of one monster", PCONT, PModel(), false, false},
313  {"paragraph continued from the", PCONT, PModel(), false, false},
314  {"previous page and continuing", PCONT, PModel(), false, false},
315  {"onto the next page. There-", PCONT, PModel(), false, false},
316  {"fore, it ends up getting", PCONT, PModel(), false, false},
317  {"marked as a crown and then", PCONT, PModel(), false, false},
318  {"getting re-marked as any ex-", PCONT, PModel(), false, false},
319  {"isting model. Not great, but", PCONT, PModel(), false, false},
320 };
321 
// Same steps as TestParagraphDetection(), except that `models` already holds
// one ParagraphModel(kLeft, 0, 20, 0, 10) when the detector is called.
// NOTE(review): the declarations of `row_infos` and `models` are missing from
// this listing - confirm their types against the complete file.
322 TEST(ParagraphsTest, TestSingleFullPageContinuation) {
323  const TextAndModel* correct = kSingleFullPageContinuation;
324  int num_rows = ABSL_ARRAYSIZE(kSingleFullPageContinuation);
326  GenericVector<PARA*> row_owners;
327  PARA_LIST paragraphs;
329  models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));
330  MakeAsciiRowInfos(correct, num_rows, &row_infos);
331  tesseract::DetectParagraphs(3, &row_infos, &row_owners, &paragraphs, &models);
332  EvaluateParagraphDetection(correct, num_rows, row_owners);
 // Releases the entries of `models` once evaluation is done.
333  models.delete_data_pointers();
334 }
335 
// Fixture: two paragraphs (rows 0 and 4) whose expected model is
// PModel(kRight, 0, 0, 0, 0).
336 const TextAndModel kRightAligned[] = {
337  {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
338  {" uncommon in Left-to-Right", PCONT, PModel(), false, false},
339  {" languages, but they do", PCONT, PModel(), false, false},
340  {" exist.", PCONT, PModel(), false, false},
341  {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
342  {" horribly tiny paragraphs in", PCONT, PModel(), false, false},
343  {" tables on which we have no", PCONT, PModel(), false, false},
344  {" chance anyways.", PCONT, PModel(), false, false},
345 };
346 
// Runs the detector over every row of kRightAligned.
347 TEST(ParagraphsTest, TestRightAlignedParagraph) {
348  TestParagraphDetection(kRightAligned, ABSL_ARRAYSIZE(kRightAligned));
349 }
350 
// Fixture: ordinary paragraphs with three one-line paragraphs (rows 4, 5 and
// 6) in between; each of those rows is labelled PSTART on its own.
351 const TextAndModel kTinyParagraphs[] = {
352  {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
353  {"obvious paragraph text, you might", PCONT, PModel(), false, false},
354  {"find short exchanges of dialogue ", PCONT, PModel(), false, false},
355  {"between characters. ", PCONT, PModel(), false, false},
356  {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
357  {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
358  {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
359  {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
360  {"mark a new paragraph whenever one", PCONT, PModel(), false, false},
361  {"of the statistics (left, right or", PCONT, PModel(), false, false},
362  {"center) changes from one text-", PCONT, PModel(), false, false},
363  {"line to the next. Such an", PCONT, PModel(), false, false},
364  {"approach would misclassify the", PCONT, PModel(), false, false},
365  {"tiny paragraphs above as a single", PCONT, PModel(), false, false},
366  {"paragraph. ", PCONT, PModel(), false, false},
367 };
368 
// Runs the detector over every row of kTinyParagraphs.
369 TEST(ParagraphsTest, TestTinyParagraphs) {
370  TestParagraphDetection(kTinyParagraphs, ABSL_ARRAYSIZE(kTinyParagraphs));
371 }
372 
// Fixture: a page that mixes several layouts - a title block with a kCenter
// model, a PNONE row, indented body paragraphs, a block quote labelled as a
// crown with PModel(kLeft, 30, 0, 0, 0), and numbered entries labelled
// is_list_item with PModel(kLeft, 0, 0, 30, 0).
373 const TextAndModel kComplexPage1[] = {
374  {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
375  {" Centered Title ", PCONT, PModel(), false, false},
376  {" Paragraph Detection ", PCONT, PModel(), false, false},
377  {" OCR TEAM ", PCONT, PModel(), false, false},
378  {" 10 November 2010 ", PCONT, PModel(), false, false},
379  {" ", PNONE, PModel(), false, false},
380  {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
381  {"This paragraph starts at the top", PCONT, PModel(), false, false},
382  {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
383  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
384  {"which indicates that the first ", PCONT, PModel(), false, false},
385  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
386  {"from a previous page, as it is ", PCONT, PModel(), false, false},
387  {"indented just like this second ", PCONT, PModel(), false, false},
388  {"paragraph. ", PCONT, PModel(), false, false},
389  {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
390  true, false},
391  {" looks like the prior text ", PCONT, PModel(), false, false},
392  {" but it is indented more ", PCONT, PModel(), false, false},
393  {" and is fully justified. ", PCONT, PModel(), false, false},
394  {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
395  {"centered text, block quotes, ", PCONT, PModel(), false, false},
396  {"normal paragraphs, and lists ", PCONT, PModel(), false, false},
397  {"like what follows? ", PCONT, PModel(), false, false},
398  {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
399  false, true},
400  {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
401  false, true},
402  {" looking for lines where the ", PCONT, PModel(), false, false},
403  {" first word of the next line ", PCONT, PModel(), false, false},
404  {" would fit on the previous ", PCONT, PModel(), false, false},
405  {" line. ", PCONT, PModel(), false, false},
406  {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
407  false, true},
408  {" Python and try it out. ", PCONT, PModel(), false, false},
409  {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
410  false, true},
411  {" mistakes. ", PCONT, PModel(), false, false},
412  {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
413  false, true},
414  {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
415  {"you can try to identify source ", PCONT, PModel(), false, false},
416  {"code. Ouch! ", PCONT, PModel(), false, false},
417 };
418 
// Runs the detector over every row of kComplexPage1.
419 TEST(ParagraphsTest, TestComplexPage1) {
420  TestParagraphDetection(kComplexPage1, ABSL_ARRAYSIZE(kComplexPage1));
421 }
422 
423 // The same as above, but wider.
// NOTE(review): the row tagged `// BUG!!` ("1. Make a plan.") is labelled
// PCONT, whereas the matching row in kComplexPage1 is a PSTART list item.
// Presumably the label records what the detector currently produces rather
// than the desired result - confirm before relying on it.
424 const TextAndModel kComplexPage2[] = {
425  {" Awesome ", PSTART,
426  PModel(kCenter, 0, 0, 0, 0), false, false},
427  {" Centered Title ", PCONT, PModel(), false, false},
428  {" Paragraph Detection ", PCONT, PModel(), false, false},
429  {" OCR TEAM ", PCONT, PModel(), false, false},
430  {" 10 November 2010 ", PCONT, PModel(), false, false},
431  {" ", PNONE, PModel(), false, false},
432  {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
433  {"This paragraph starts at the top of", PCONT, PModel(), false, false},
434  {"the page and takes 3 lines. ", PCONT, PModel(), false, false},
435  {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
436  {"which indicates that the first ", PCONT, PModel(), false, false},
437  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
438  {"from a previous page, as it is in- ", PCONT, PModel(), false, false},
439  {"dented just like this second para- ", PCONT, PModel(), false, false},
440  {"graph. ", PCONT, PModel(), false, false},
441  {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
442  true, false},
443  {" looks like the prior text ", PCONT, PModel(), false, false},
444  {" but it is indented more ", PCONT, PModel(), false, false},
445  {" and is fully justified. ", PCONT, PModel(), false, false},
446  {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
447  {"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
448  {"graphs, and lists like what follow?", PCONT, PModel(), false, false},
449  {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
450  {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
451  false, true},
452  {" looking for lines where the ", PCONT, PModel(), false, false},
453  {" first word of the next line ", PCONT, PModel(), false, false},
454  {" would fit on the previous line. ", PCONT, PModel(), false, false},
455  {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
456  false, true},
457  {" Python and try it out. ", PCONT, PModel(), false, false},
458  {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
459  false, true},
460  {" mistakes. ", PCONT, PModel(), false, false},
461  {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
462  false, true},
463  {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
464  {"you can try to identify source ", PCONT, PModel(), false, false},
465  {"code. Ouch! ", PCONT, PModel(), false, false},
466 };
467 
// Runs the detector over every row of kComplexPage2.
468 TEST(ParagraphsTest, TestComplexPage2) {
469  TestParagraphDetection(kComplexPage2, ABSL_ARRAYSIZE(kComplexPage2));
470 }
471 
// Fixture shared by two tests: a crown paragraph, one short indented
// paragraph, and a final PNONE row (" 1235 ").
472 const TextAndModel kSubtleCrown[] = {
473  {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
474  true, false},
475  {"often not indented as the rest ", PCONT, PModel(), false, false},
476  {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
477  {"less it should be counted as the", PCONT, PModel(), false, false},
478  {"same type of paragraph. ", PCONT, PModel(), false, false},
479  {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
480  {"should suffice. ", PCONT, PModel(), false, false},
481  {" 1235 ", PNONE, PModel(), false, false},
482 };
483 
// Passes one row fewer than the table holds, so the final PNONE row is left
// out of this run.
484 TEST(ParagraphsTest, TestSubtleCrown) {
485  TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown) - 1);
486 }
487 
// Same table with every row, including the final PNONE row.
488 TEST(ParagraphsTest, TestStrayLineInBlock) {
489  TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown));
490 }
491 
// Fixture: four paragraphs of wide body text; every PSTART row expects
// PModel(kLeft, 0, 50, 0, 0) and no row is a crown or a list item.
492 const TextAndModel kUnlvRep3AO[] = {
493  {" Defined contribution plans cover employees in Australia, New", PSTART,
494  PModel(kLeft, 0, 50, 0, 0), false, false},
495  {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false},
496  {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false},
497  {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false},
498  {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false},
499  {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false},
500  {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false},
501  {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false},
502  {" In addition to providing pension benefits, the Company pro- ", PSTART,
503  PModel(kLeft, 0, 50, 0, 0), false, false},
504  {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false},
505  {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false},
506  {"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, false},
507  {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false},
508  {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false},
509  {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false},
510  {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false},
511  {" The U.S. plan covering the parent company is the largest plan.",
512  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
513  {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false},
514  {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false},
515  {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false},
516  {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false},
517  {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false},
518  {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false},
519  {"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false},
520  {"benefits at any time. ", PCONT, PModel(), false, false},
521  {" Effective October 1993, the Company amended its health ", PSTART,
522  PModel(kLeft, 0, 50, 0, 0), false, false},
523  {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false},
524  {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false},
525  {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false},
526  {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false},
527  {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false},
528  {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false},
529  {"for 1994 by approximately $83. ", PCONT, PModel(), false, false},
530 };
531 
// Runs the detector over every row of kUnlvRep3AO.
532 TEST(ParagraphsTest, TestUnlvInsurance) {
533  TestParagraphDetection(kUnlvRep3AO, ABSL_ARRAYSIZE(kUnlvRep3AO));
534 }
535 
536 // The basic outcome we want for something with a bunch of leader dots is that
537 // we group each logical entry as a separate item. Without knowledge of
538 // leaders, we would most likely mark the text below as a simple right aligned
539 // paragraph or two.
540 // This example comes from Volume 9886293, Page 5
// Every row is labelled PSTART with a kUnknown justification. For such rows
// EvaluateParagraphDetection() skips the model comparison, so what is
// checked here is the breaks plus the crown and list-item flags.
541 const TextAndModel kTableOfContents[] = {
542  {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
543  {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
544  {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
545  {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
546  {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
547  {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
548  {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
549  {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
550  {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
551  {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
552  {" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
553  {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
554  {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
555 };
556 
// Runs the detector over every row of kTableOfContents.
557 TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
558  TestParagraphDetection(kTableOfContents, ABSL_ARRAYSIZE(kTableOfContents));
559 }
560 
// Fixture: prose paragraphs around a short C listing. A blank row labelled
// PNONE precedes the listing, whose own rows are all labelled PCONT, so no
// paragraph break is expected inside the code.
561 const TextAndModel kTextWithSourceCode[] = {
562  {" A typical page of a programming book may contain", PSTART,
563  PModel(kLeft, 0, 20, 0, 0), false, false},
564  {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
565  {"being described in prose. Such examples should be", PCONT, PModel(), false, false},
566  {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
567  {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false},
568  {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
569  {"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
570  {"when the text is re-flowed. ", PCONT, PModel(), false, false},
571  {" Let's show this by describing the function fact-", PSTART,
572  PModel(kLeft, 0, 20, 0, 0), false, false},
573  {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
574  {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
575  {"that the typical C implementation will only work ", PCONT, PModel(), false, false},
576  {"for values less than about 12: ", PCONT, PModel(), false, false},
577  {" ", PNONE, PModel(), false, false},
578  {" # Naive implementation in C ", PCONT, PModel(), false, false},
579  {" int factorial(int n) { ", PCONT, PModel(), false, false},
580  {" if (n < 2) ", PCONT, PModel(), false, false},
581  {" return 1; ", PCONT, PModel(), false, false},
582  {" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
583  {" } ", PCONT, PModel(), false, false},
584  {" ", PCONT, PModel(), false, false},
585  {" The C programming language does not have built- ", PSTART,
586  PModel(kLeft, 0, 20, 0, 0), false, false},
587  {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
588  {"naive implementation simply returns random values ", PCONT, PModel(), false, false},
589  {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
590 };
591 
// Runs the detector over every row of kTextWithSourceCode.
592 TEST(ParagraphsTest, NotDistractedBySourceCode) {
593  TestParagraphDetection(kTextWithSourceCode,
594  ABSL_ARRAYSIZE(kTextWithSourceCode));
595 }
596 
// Fixture: a page of narrative and dialogue in which many paragraphs are only
// one or two rows long. Every PSTART row expects PModel(kLeft, 0, 50, 0, 0);
// row 0 is labelled PSTART but not as a crown. Several rows are written as
// adjacent string literals, which the compiler joins into one string.
597 const TextAndModel kOldManAndSea[] = {
598  {"royal palm which are called guano and in it there was a bed, a",
599  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
600  {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false},
601  {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false},
602  {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false},
603  {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false},
604  {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false},
605  {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false},
606  {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false},
607  {"shirt. ", PCONT, PModel(), false, false},
608  {" \"What do you have to eat?\" the boy asked. ",
609  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
610  {" \"A pot of yellow rice with fish. Do you want some?\" ",
611  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
612  {" \"No. I will eat at home. Do you want me to make the fire?\" ",
613  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
614  {" \"No. I will make it later on. Or I may eat the rice cold.\" ",
615  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
616  {" \"May I take the cast net?\" ",
617  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
618  {" \"Of course.\" ",
619  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
620  {" There was no cast net and the boy remembered when they had",
621  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
622  {"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false},
623  {"pot of yellow rice and fish and the boy knew this too. "
624  " ", PCONT, PModel(), false, false},
625  {" \"Eighty-five is a lucky number,\" the old man said. \"How",
626  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
627  {"would you like to see me bring one in that dressed out over a "
628  "thou-", PCONT, PModel(), false, false},
629  {"sand pounds? "
630  " ", PCONT, PModel(), false, false},
631  {" \"I'll get the cast net and go for sardines. Will you sit in the "
632  "sun",
633  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
634  {"in the doorway?\" "
635  " ", PCONT, PModel(), false, false},
636  {" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
637  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
638  {" The boy did not know whether yesterday's paper was a fiction",
639  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
640  {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false},
641  {" \"Pedrico gave it to me at the bodega,\" he explained. "
642  " ",
643  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
644  {" \"I'll be back when I have the sardines. I'll keep yours and mine",
645  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
646  {"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false},
647  {"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false},
648  {" \"The Yankees cannot lose.\" ",
649  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
650  {" \"But I fear the Indians of Cleveland.\" ",
651  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
652  {" \"Have faith in the Yankees my son. Think of the great Di-",
653  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
654  {"Maggio.\" ", PCONT, PModel(), false, false},
655  {" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
656  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
657  {"land.\" ", PCONT, PModel(), false, false}
658 };
659 
// Runs TestParagraphDetection over every entry of kOldManAndSea; the element
// count is computed with ABSL_ARRAYSIZE so it tracks the table automatically.
// Judging by the test name, the intent is to check that the run of short
// dialogue lines is not mis-detected as block quotes -- TODO confirm.
660 TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
661  TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea));
662 }
663 
// Labelled sample input for the paragraph detector: lines from a book index.
// Each index entry is expected to start its own paragraph (PSTART) with
// PModel(kLeft, 0, 0, 30, 0); the few lines holding only a wrapped page number
// (" 138 ", " 145 ", " 85 ") are expected continuations (PCONT) and pass a
// default PModel(). is_very_first_or_continuation and is_list_item are false
// for every line.
// NOTE(review): the 30 in the PModel call is presumably the indent of the
// wrapped lines (a hanging indent) -- confirm against PModel.
// NOTE(review): some literals contain non-ASCII characters (an em dash and
// typographic apostrophes); they are left exactly as found.
664 const TextAndModel kNewZealandIndex[] = {
665  {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
666  {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
667  {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
668  {" 138 ", PCONT, PModel(), false, false},
669  {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
670  {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
671  {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
672  {" 145 ", PCONT, PModel(), false, false},
673  {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
674  {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
675  {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
676  {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
677  {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
678  {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
679  {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
680  {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
681  {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
682  {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
683  {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
684  {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
685  {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
686  {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
687  {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
688  {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
689  {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
690  {" 85 ", PCONT, PModel(), false, false},
691  {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
692  {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
693  {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
694  {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
695  {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}
696 };
697 
// Runs TestParagraphDetection over every entry of kNewZealandIndex (a book
// index page); the element count is computed with ABSL_ARRAYSIZE so it tracks
// the table automatically.
698 TEST(ParagraphsTest, IndexPageTest) {
699  TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex));
700 }
701 
702 // TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.
703 
704 } // namespace
TBOX
Definition: cleanapi_test.cc:19
tesseract::RowInfo::has_leaders
bool has_leaders
Definition: paragraphs.h:47
string
std::string string
Definition: equationdetect_test.cc:21
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
tesseract::RowInfo::rword_box
TBOX rword_box
Definition: paragraphs.h:56
INFO
Definition: log.h:29
tesseract::RowInfo::rword_indicates_list_item
bool rword_indicates_list_item
Definition: paragraphs.h:75
tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition: paragraphs.h:72
tesseract::RowInfo::pix_rdistance
int pix_rdistance
Definition: paragraphs.h:50
tesseract::JUSTIFICATION_RIGHT
Definition: publictypes.h:252
tesseract::RowInfo::pix_xheight
float pix_xheight
Definition: paragraphs.h:51
tesseract::LeftWordAttributes
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:423
tesseract::RowInfo::lword_indicates_list_item
bool lword_indicates_list_item
Definition: paragraphs.h:71
tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:248
tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition: paragraphs.h:73
tesseract::RowInfo::average_interword_space
int average_interword_space
Definition: paragraphs.h:52
include_gunit.h
tesseract::RightWordAttributes
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:470
tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition: paragraphs.h:76
tesseract::RowInfo::lword_text
STRING lword_text
Definition: paragraphs.h:58
ParagraphModel
Definition: ocrpara.h:114
tesseract::RowInfo
Definition: paragraphs.h:40
genericvector.h
STRING::size
int32_t size() const
Definition: strngs.h:68
tesseract::JUSTIFICATION_LEFT
Definition: publictypes.h:250
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition: paragraphs.h:77
tesseract::JUSTIFICATION_UNKNOWN
Definition: publictypes.h:249
tesseract::AsciiLikelyListItem
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:296
ocrpara.h
tesseract::RowInfo::num_words
int num_words
Definition: paragraphs.h:54
tesseract::RowInfo::has_drop_cap
bool has_drop_cap
Definition: paragraphs.h:48
paragraphs.h
GenericVector
Definition: baseapi.h:40
tesseract::RowInfo::text
STRING text
Definition: paragraphs.h:43
tesseract::RowInfo::lword_box
TBOX lword_box
Definition: paragraphs.h:55
tesseract::RowInfo::rword_text
STRING rword_text
Definition: paragraphs.h:59
TextModelInputType
TextModelInputType
Definition: paragraphs_test.cc:38
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
Definition: paragraphs.cpp:2284
tesseract::RowInfo::ltr
bool ltr
Definition: paragraphs.h:44
paragraphs_internal.h
log.h
LOG
Definition: cleanapi_test.cc:19
tesseract::RowInfo::pix_ldistance
int pix_ldistance
Definition: paragraphs.h:49
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::JUSTIFICATION_CENTER
Definition: publictypes.h:251