14 #include "absl/strings/str_cat.h"
15 #include "absl/strings/str_join.h"
16 #include "absl/strings/str_split.h"
50 bool is_very_first_or_continuation;
// Derives per-row data from one line of ASCII test text and stores it through
// `info`. NOTE(review): this excerpt omits several lines of the function
// (including the end of the parameter list and the loop bodies), so the
// comments below describe only the statements that are visible.
56 void AsciiToRowInfo(
const char* text,
int row_number,
// Synthetic metrics: kCharWidth scales the text length into row_right below,
// and kLineSpace scales row_number into the top/bottom coordinates.
58 const int kCharWidth = 10;
59 const int kLineSpace = 30;
// Evaluates to true when the text contains either three consecutive dots or
// the spaced form "dot space dot space dot" (strstr returns non-null on a
// match). The variable that receives this value is on a line not shown here.
62 strstr(text,
"...") !=
nullptr || strstr(text,
". . .") !=
nullptr;
// Split on single spaces and drop the empty pieces that runs of spaces
// produce, so `words` holds only the non-blank tokens.
70 std::vector<std::string> words = absl::StrSplit(text,
' ', absl::SkipEmpty());
// Rightmost word of the row. NOTE(review): indexing words.size() - 1 needs a
// non-empty `words`; presumably guarded on a line not shown -- confirm.
75 info->
rword_text = words[words.size() - 1].c_str();
// Loop condition: still inside info->text and the character at index lspace
// of the input is a space (i.e. scanning leading spaces).
77 while (lspace < info->text.size() && text[lspace] ==
' ') {
// Mirror-image condition for trailing spaces: size() - rspace - 1 is the
// index of the rspace-th character counted from the right end.
81 while (rspace < info->text.size() &&
82 text[info->
text.
size() - rspace - 1] ==
' ') {
// `top` becomes more negative as row_number grows; `bottom` is exactly
// kLineSpace less than `top`, so each row spans one kLineSpace.
86 int top = -kLineSpace * row_number;
87 int bottom = top - kLineSpace;
// Right edge of the row when every character is kCharWidth wide.
88 int row_right = kCharWidth * info->
text.
size();
// Converts the `ascii` text of each of the first `n` entries of `row_infos`
// into a row-info value by calling AsciiToRowInfo with the entry's index as
// the row number. NOTE(review): the output parameter and what is done with
// `info` after each call are on lines not shown in this excerpt.
105 void MakeAsciiRowInfos(
const TextAndModel* row_infos,
int n,
109 for (
int i = 0; i < n; i++) {
// The loop index doubles as the row number passed to AsciiToRowInfo.
110 AsciiToRowInfo(row_infos[i].ascii, i, &info);
// Compares the paragraph detector's per-row output against the expected
// annotations in `correct` (n rows), counts each category of mismatch,
// EXPECTs every count to be zero, and on any mismatch logs a side-by-side dump
// of the expected and detected paragraph structure.
// NOTE(review): some lines of this function (the `detector_output` parameter,
// the declaration of `bad_crowns`, several increments and closing braces) are
// not shown in this excerpt; comments describe only the visible statements.
117 void EvaluateParagraphDetection(
const TextAndModel* correct,
int n,
// Mismatch counters, one per category checked below.
119 int incorrect_breaks = 0;
120 int missed_breaks = 0;
121 int poorly_matched_models = 0;
123 int bad_list_items = 0;
// The detector must produce exactly one entry per input row; ASSERT aborts the
// evaluation otherwise, since the indexing below relies on it.
124 ASSERT_EQ(detector_output.
size(), n);
// Paragraph-break check. Starts at 1 because each row is compared with the
// row before it.
125 for (
int i = 1; i < n; i++) {
// Expected: any row whose model_type is not PCONT begins something new.
126 bool has_break = correct[i].model_type != PCONT;
// Detected: adjacent rows with different detector_output entries are treated
// as belonging to different paragraphs.
127 bool detected_break = (detector_output[i - 1] != detector_output[i]);
128 if (has_break && !detected_break) missed_breaks++;
129 if (detected_break && !has_break) incorrect_breaks++;
// Rows expected to be PNONE must not have been given a model.
131 if (correct[i].model_type == PNONE) {
132 if (detector_output[i]->model !=
nullptr) {
133 poorly_matched_models++;
// When the expected model has a known justification, the detected model must
// exist and be Comparable() to it.
136 if (correct[i].model.justification() != kUnknown &&
137 (detector_output[i]->model ==
nullptr ||
138 !correct[i].model.Comparable(*detector_output[i]->model))) {
139 poorly_matched_models++;
// XOR of the two flags is true exactly when expected and detected disagree.
// NOTE(review): the statement executed on a mismatch is not shown here;
// presumably it increments bad_crowns -- confirm.
142 if (correct[i].is_very_first_or_continuation ^
143 detector_output[i]->is_very_first_or_continuation) {
// Same disagreement test for the list-item flag.
146 if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {
// Non-fatal expectations: every category is reported even if an earlier one
// already failed.
151 EXPECT_EQ(incorrect_breaks, 0);
152 EXPECT_EQ(missed_breaks, 0);
153 EXPECT_EQ(poorly_matched_models, 0);
154 EXPECT_EQ(bad_list_items, 0);
155 EXPECT_EQ(bad_crowns, 0);
// Only build the (verbose) debug dump when something actually went wrong.
156 if (incorrect_breaks || missed_breaks || poorly_matched_models ||
157 bad_list_items || bad_crowns) {
158 std::vector<std::string> dbg_lines;
// Section 1 of the dump: the expected annotations.
159 dbg_lines.push_back(
"# ==========================");
160 dbg_lines.push_back(
"# Correct paragraph breaks:");
161 dbg_lines.push_back(
"# ==========================");
162 for (
int i = 0; i < n; i++) {
// Rows that are not PCONT are printed with their model and flag suffixes
// (" crown" for is_very_first_or_continuation, " li" for is_list_item).
163 if (correct[i].model_type != PCONT) {
164 dbg_lines.push_back(absl::StrCat(
165 correct[i].ascii,
" # ", correct[i].model.ToString().c_str(),
166 correct[i].is_very_first_or_continuation ?
" crown" :
"",
167 correct[i].is_list_item ?
" li" :
""));
// This push_back emits the bare text without any annotation.
169 dbg_lines.push_back(correct[i].ascii);
// Blank separator line, then section 2: what the detector produced.
172 dbg_lines.push_back(
"");
173 dbg_lines.push_back(
"# ==========================");
174 dbg_lines.push_back(
"# Paragraph detector output:");
175 dbg_lines.push_back(
"# ==========================");
176 for (
int i = 0; i < n; i++) {
// Annotate only the first row of each detected paragraph: row 0, or any row
// whose detector_output entry differs from the previous row's.
178 if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
// A detected paragraph with a model is annotated like the expected rows above.
179 if (detector_output[i] && detector_output[i]->model) {
180 annotation += absl::StrCat(
181 " # ", detector_output[i]->model->ToString().c_str(),
182 detector_output[i]->is_very_first_or_continuation ?
" crown" :
"",
183 detector_output[i]->is_list_item ?
" li" :
"");
// Annotation used when the condition above does not hold.
185 annotation =
" # Unmodeled paragraph.";
188 dbg_lines.push_back(absl::StrCat(correct[i].ascii, annotation));
// Emit the whole dump as one log message, one dbg_lines entry per line.
// NOTE(review): "Discrepency" is a misspelling of "Discrepancy" in the log
// text; left unchanged here because it is a runtime string.
190 LOG(
INFO) <<
"Discrepency!\n" << absl::StrJoin(dbg_lines,
"\n");
// Shared test driver: converts the first `num_rows` entries of `correct` into
// row infos and then checks the detector's row owners against `correct`.
// NOTE(review): the declarations of row_infos/row_owners and the call that
// actually runs detection are on lines not shown in this excerpt.
194 void TestParagraphDetection(
const TextAndModel* correct,
int num_rows) {
197 PARA_LIST paragraphs;
// Build the detector input from the ASCII fixture rows.
200 MakeAsciiRowInfos(correct, num_rows, &row_infos);
// Compare the detector's per-row result with the expected annotations.
204 EvaluateParagraphDetection(correct, num_rows, row_owners);
208 TEST(ParagraphsTest, ListItemsIdentified) {
// Fixture: two paragraphs, each opened by a PSTART row carrying
// PModel(kLeft, 0, 20, 0, 0) and continued by PCONT rows.
// Each row is {text, expected row type, expected model, flag, flag}; the two
// flags are presumably is_very_first_or_continuation and is_list_item --
// confirm against the TextAndModel declaration.
233 const TextAndModel kTwoSimpleParagraphs[] = {
234 {
" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
235 {
"This paragraph starts at the top", PCONT, PModel(),
false,
false},
236 {
"of the page and takes 3 lines. ", PCONT, PModel(),
false,
false},
237 {
" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
238 {
"which indicates that the first ", PCONT, PModel(),
false,
false},
239 {
"paragraph is not a continuation ", PCONT, PModel(),
false,
false},
240 {
"from a previous page, as it is ", PCONT, PModel(),
false,
false},
241 {
"indented just like this second ", PCONT, PModel(),
false,
false},
242 {
"paragraph. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kTwoSimpleParagraphs.
245 TEST(ParagraphsTest, TestSimpleParagraphDetection) {
246 TestParagraphDetection(kTwoSimpleParagraphs,
247 ABSL_ARRAYSIZE(kTwoSimpleParagraphs));
// Fixture: the first paragraph starts without the leading space that the
// second paragraph's PSTART row has; per the fixture text it is a continuation
// from a previous page. Both PSTART rows expect PModel(kLeft, 0, 20, 0, 0).
// NOTE(review): the trailing fields of the first row are on a line not shown
// in this excerpt.
250 const TextAndModel kFewCluesWithCrown[] = {
251 {
"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
253 {
"of the page and takes two lines.", PCONT, PModel(),
false,
false},
254 {
" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
255 {
"which indicates that the first ", PCONT, PModel(),
false,
false},
256 {
"paragraph is a continuation from", PCONT, PModel(),
false,
false},
257 {
"a previous page, as it is ", PCONT, PModel(),
false,
false},
258 {
"indented just like this second ", PCONT, PModel(),
false,
false},
259 {
"paragraph. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kFewCluesWithCrown.
262 TEST(ParagraphsTest, TestFewCluesWithCrown) {
263 TestParagraphDetection(kFewCluesWithCrown,
264 ABSL_ARRAYSIZE(kFewCluesWithCrown));
// Fixture: three paragraphs sharing PModel(kLeft, 0, 20, 0, 0). The first
// PSTART row has no leading space while the later PSTART rows do; the fixture
// text itself calls this first-paragraph shape a "crown".
// NOTE(review): the trailing fields of the first row are on a line not shown
// in this excerpt.
267 const TextAndModel kCrownedParagraph[] = {
268 {
"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
270 {
"often not indented as the rest ", PCONT, PModel(),
false,
false},
271 {
"of the paragraphs are. Nonethe-", PCONT, PModel(),
false,
false},
272 {
"less it should be counted as the", PCONT, PModel(),
false,
false},
273 {
"same type of paragraph. ", PCONT, PModel(),
false,
false},
274 {
" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
275 {
"graphs are both indented two ", PCONT, PModel(),
false,
false},
276 {
"spaces. ", PCONT, PModel(),
false,
false},
277 {
" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
278 {
"fmt refers to as a 'crown.' ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kCrownedParagraph.
281 TEST(ParagraphsTest, TestCrownParagraphDetection) {
282 TestParagraphDetection(kCrownedParagraph, ABSL_ARRAYSIZE(kCrownedParagraph));
// Fixture: two paragraphs whose PSTART rows have no leading space and expect
// PModel(kLeft, 0, 0, 0, 0); every other row is PCONT. The only cue the
// fixture text mentions is the short last line of each paragraph.
285 const TextAndModel kFlushLeftParagraphs[] = {
286 {
"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0),
false,
false},
287 {
"flush left paragraphs (those", PCONT, PModel(),
false,
false},
288 {
"with no body indent) are not", PCONT, PModel(),
false,
false},
289 {
"actually crowns. ", PCONT, PModel(),
false,
false},
290 {
"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0),
false,
false},
291 {
"also flush left aligned. Usual-", PCONT, PModel(),
false,
false},
292 {
"ly, these paragraphs are set", PCONT, PModel(),
false,
false},
293 {
"apart vertically by some white-", PCONT, PModel(),
false,
false},
294 {
"space, but you can also detect", PCONT, PModel(),
false,
false},
295 {
"them by observing the big empty", PCONT, PModel(),
false,
false},
296 {
"space at the ends of the para-", PCONT, PModel(),
false,
false},
297 {
"graphs. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kFlushLeftParagraphs.
// NOTE(review): the suite name here is spelled ParagraphsText, while the other
// tests visible in this file use ParagraphsTest -- likely a typo; confirm
// before renaming, since it changes the test's full name.
300 TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
301 TestParagraphDetection(kFlushLeftParagraphs,
302 ABSL_ARRAYSIZE(kFlushLeftParagraphs));
// Fixture: a page that is one unbroken paragraph. Only the first row is
// PSTART (PModel(kLeft, 0, 20, 0, 0)) and it is the only row in the visible
// fixtures whose first flag is true; all remaining rows are PCONT.
305 const TextAndModel kSingleFullPageContinuation[] = {
306 {
"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0),
true,
false},
307 {
"continuation. It flows from", PCONT, PModel(),
false,
false},
308 {
"line to line, using the full", PCONT, PModel(),
false,
false},
309 {
"column width with no clear", PCONT, PModel(),
false,
false},
310 {
"paragraph break, because it", PCONT, PModel(),
false,
false},
311 {
"actually doesn't have one. It", PCONT, PModel(),
false,
false},
312 {
"is the middle of one monster", PCONT, PModel(),
false,
false},
313 {
"paragraph continued from the", PCONT, PModel(),
false,
false},
314 {
"previous page and continuing", PCONT, PModel(),
false,
false},
315 {
"onto the next page. There-", PCONT, PModel(),
false,
false},
316 {
"fore, it ends up getting", PCONT, PModel(),
false,
false},
317 {
"marked as a crown and then", PCONT, PModel(),
false,
false},
318 {
"getting re-marked as any ex-", PCONT, PModel(),
false,
false},
319 {
"isting model. Not great, but", PCONT, PModel(),
false,
false},
// Checks kSingleFullPageContinuation. Unlike its siblings, this test does not
// call TestParagraphDetection; it spells the steps out inline.
// NOTE(review): the lines between these statements (including the detection
// call itself) are not shown in this excerpt.
322 TEST(ParagraphsTest, TestSingleFullPageContinuation) {
323 const TextAndModel* correct = kSingleFullPageContinuation;
324 int num_rows = ABSL_ARRAYSIZE(kSingleFullPageContinuation);
327 PARA_LIST paragraphs;
// Build detector input from the fixture rows, then compare the result.
330 MakeAsciiRowInfos(correct, num_rows, &row_infos);
332 EvaluateParagraphDetection(correct, num_rows, row_owners);
// Fixture: two paragraphs whose PSTART rows expect PModel(kRight, 0, 0, 0, 0).
// Most rows begin with a space and none ends with one.
336 const TextAndModel kRightAligned[] = {
337 {
"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0),
false,
false},
338 {
" uncommon in Left-to-Right", PCONT, PModel(),
false,
false},
339 {
" languages, but they do", PCONT, PModel(),
false,
false},
340 {
" exist.", PCONT, PModel(),
false,
false},
341 {
" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0),
false,
false},
342 {
" horribly tiny paragraphs in", PCONT, PModel(),
false,
false},
343 {
" tables on which we have no", PCONT, PModel(),
false,
false},
344 {
" chance anyways.", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kRightAligned.
347 TEST(ParagraphsTest, TestRightAlignedParagraph) {
348 TestParagraphDetection(kRightAligned, ABSL_ARRAYSIZE(kRightAligned));
// Fixture: ordinary paragraphs with three consecutive one-line paragraphs
// (short dialogue lines) in between. Each of those single rows is its own
// PSTART with PModel(kLeft, 0, 20, 0, 0), so they must not be merged.
351 const TextAndModel kTinyParagraphs[] = {
352 {
" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
353 {
"obvious paragraph text, you might", PCONT, PModel(),
false,
false},
354 {
"find short exchanges of dialogue ", PCONT, PModel(),
false,
false},
355 {
"between characters. ", PCONT, PModel(),
false,
false},
356 {
" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
357 {
" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
358 {
" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
359 {
" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
360 {
"mark a new paragraph whenever one", PCONT, PModel(),
false,
false},
361 {
"of the statistics (left, right or", PCONT, PModel(),
false,
false},
362 {
"center) changes from one text-", PCONT, PModel(),
false,
false},
363 {
"line to the next. Such an", PCONT, PModel(),
false,
false},
364 {
"approach would misclassify the", PCONT, PModel(),
false,
false},
365 {
"tiny paragraphs above as a single", PCONT, PModel(),
false,
false},
366 {
"paragraph. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kTinyParagraphs.
369 TEST(ParagraphsTest, TestTinyParagraphs) {
370 TestParagraphDetection(kTinyParagraphs, ABSL_ARRAYSIZE(kTinyParagraphs));
// Fixture: a mixed page. It contains a centered title block
// (PModel(kCenter, 0, 0, 0, 0)), a blank PNONE row, body paragraphs
// (PModel(kLeft, 0, 20, 0, 0)), a block quote (PModel(kLeft, 30, 0, 0, 0)),
// a numbered list whose items expect PModel(kLeft, 0, 0, 30, 0), and a final
// body paragraph.
// NOTE(review): the trailing fields of the block-quote and list-item PSTART
// rows are on lines not shown in this excerpt. The third list item is labelled
// "8." in the fixture text; whether that is deliberate cannot be told from
// here.
373 const TextAndModel kComplexPage1[] = {
374 {
" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0),
false,
false},
375 {
" Centered Title ", PCONT, PModel(),
false,
false},
376 {
" Paragraph Detection ", PCONT, PModel(),
false,
false},
377 {
" OCR TEAM ", PCONT, PModel(),
false,
false},
378 {
" 10 November 2010 ", PCONT, PModel(),
false,
false},
379 {
" ", PNONE, PModel(),
false,
false},
380 {
" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
381 {
"This paragraph starts at the top", PCONT, PModel(),
false,
false},
382 {
"of the page and takes 3 lines. ", PCONT, PModel(),
false,
false},
383 {
" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
384 {
"which indicates that the first ", PCONT, PModel(),
false,
false},
385 {
"paragraph is not a continuation ", PCONT, PModel(),
false,
false},
386 {
"from a previous page, as it is ", PCONT, PModel(),
false,
false},
387 {
"indented just like this second ", PCONT, PModel(),
false,
false},
388 {
"paragraph. ", PCONT, PModel(),
false,
false},
389 {
" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
391 {
" looks like the prior text ", PCONT, PModel(),
false,
false},
392 {
" but it is indented more ", PCONT, PModel(),
false,
false},
393 {
" and is fully justified. ", PCONT, PModel(),
false,
false},
394 {
" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
395 {
"centered text, block quotes, ", PCONT, PModel(),
false,
false},
396 {
"normal paragraphs, and lists ", PCONT, PModel(),
false,
false},
397 {
"like what follows? ", PCONT, PModel(),
false,
false},
398 {
"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
400 {
"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
402 {
" looking for lines where the ", PCONT, PModel(),
false,
false},
403 {
" first word of the next line ", PCONT, PModel(),
false,
false},
404 {
" would fit on the previous ", PCONT, PModel(),
false,
false},
405 {
" line. ", PCONT, PModel(),
false,
false},
406 {
"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
408 {
" Python and try it out. ", PCONT, PModel(),
false,
false},
409 {
"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
411 {
" mistakes. ", PCONT, PModel(),
false,
false},
412 {
"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
414 {
" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
415 {
"you can try to identify source ", PCONT, PModel(),
false,
false},
416 {
"code. Ouch! ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kComplexPage1.
419 TEST(ParagraphsTest, TestComplexPage1) {
420 TestParagraphDetection(kComplexPage1, ABSL_ARRAYSIZE(kComplexPage1));
// Fixture: a variant of the mixed page in kComplexPage1 with different line
// wrapping and hyphenation. In this version the row "1. Make a plan." is
// expected to be PCONT (a continuation of the preceding paragraph) rather than
// the start of a list item.
// NOTE(review): the trailing fields of the block-quote and list-item PSTART
// rows are on lines not shown in this excerpt.
424 const TextAndModel kComplexPage2[] = {
425 {
" Awesome ", PSTART,
426 PModel(kCenter, 0, 0, 0, 0),
false,
false},
427 {
" Centered Title ", PCONT, PModel(),
false,
false},
428 {
" Paragraph Detection ", PCONT, PModel(),
false,
false},
429 {
" OCR TEAM ", PCONT, PModel(),
false,
false},
430 {
" 10 November 2010 ", PCONT, PModel(),
false,
false},
431 {
" ", PNONE, PModel(),
false,
false},
432 {
" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
433 {
"This paragraph starts at the top of", PCONT, PModel(),
false,
false},
434 {
"the page and takes 3 lines. ", PCONT, PModel(),
false,
false},
435 {
" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
436 {
"which indicates that the first ", PCONT, PModel(),
false,
false},
437 {
"paragraph is not a continuation ", PCONT, PModel(),
false,
false},
438 {
"from a previous page, as it is in- ", PCONT, PModel(),
false,
false},
439 {
"dented just like this second para- ", PCONT, PModel(),
false,
false},
440 {
"graph. ", PCONT, PModel(),
false,
false},
441 {
" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
443 {
" looks like the prior text ", PCONT, PModel(),
false,
false},
444 {
" but it is indented more ", PCONT, PModel(),
false,
false},
445 {
" and is fully justified. ", PCONT, PModel(),
false,
false},
446 {
" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
447 {
"ed text, block quotes, normal para-", PCONT, PModel(),
false,
false},
448 {
"graphs, and lists like what follow?", PCONT, PModel(),
false,
false},
449 {
"1. Make a plan. ", PCONT, PModel(),
false,
false},
450 {
"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
452 {
" looking for lines where the ", PCONT, PModel(),
false,
false},
453 {
" first word of the next line ", PCONT, PModel(),
false,
false},
454 {
" would fit on the previous line. ", PCONT, PModel(),
false,
false},
455 {
"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
457 {
" Python and try it out. ", PCONT, PModel(),
false,
false},
458 {
"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
460 {
" mistakes. ", PCONT, PModel(),
false,
false},
461 {
"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
463 {
" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
464 {
"you can try to identify source ", PCONT, PModel(),
false,
false},
465 {
"code. Ouch! ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kComplexPage2.
468 TEST(ParagraphsTest, TestComplexPage2) {
469 TestParagraphDetection(kComplexPage2, ABSL_ARRAYSIZE(kComplexPage2));
// Fixture: a first paragraph with no leading space on its PSTART row, a
// two-line second paragraph, and a final stray PNONE row (" 1235 "). Two tests
// share this table: one passes all rows, the other drops the last row.
// NOTE(review): the trailing fields of the first row are on a line not shown
// in this excerpt.
472 const TextAndModel kSubtleCrown[] = {
473 {
"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
475 {
"often not indented as the rest ", PCONT, PModel(),
false,
false},
476 {
"of the paragraphs are. Nonethe-", PCONT, PModel(),
false,
false},
477 {
"less it should be counted as the", PCONT, PModel(),
false,
false},
478 {
"same type of paragraph. ", PCONT, PModel(),
false,
false},
479 {
" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0),
false,
false},
480 {
"should suffice. ", PCONT, PModel(),
false,
false},
481 {
" 1235 ", PNONE, PModel(),
false,
false},
// Runs the shared driver over kSubtleCrown without its last row: the row count
// passed is ABSL_ARRAYSIZE(kSubtleCrown) - 1, which drops the final PNONE row.
484 TEST(ParagraphsTest, TestSubtleCrown) {
485 TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown) - 1);
// Runs the shared driver over all of kSubtleCrown, including the final PNONE
// row that TestSubtleCrown leaves out.
488 TEST(ParagraphsTest, TestStrayLineInBlock) {
489 TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown));
// Fixture: four prose paragraphs of wider text. Every PSTART row begins with a
// leading space and expects PModel(kLeft, 0, 50, 0, 0); all other rows are
// PCONT. Row widths vary slightly and several rows end in a hyphenated word.
492 const TextAndModel kUnlvRep3AO[] = {
493 {
" Defined contribution plans cover employees in Australia, New", PSTART,
494 PModel(kLeft, 0, 50, 0, 0),
false,
false},
495 {
"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(),
false,
false},
496 {
"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(),
false,
false},
497 {
"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(),
false,
false},
498 {
"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(),
false,
false},
499 {
"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(),
false,
false},
500 {
"charged to income for defined contribution plans were $92 in ", PCONT, PModel(),
false,
false},
501 {
"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(),
false,
false},
502 {
" In addition to providing pension benefits, the Company pro- ", PSTART,
503 PModel(kLeft, 0, 50, 0, 0),
false,
false},
504 {
"vides certain health care and life insurance benefits to retired ", PCONT, PModel(),
false,
false},
505 {
"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(),
false,
false},
506 {
"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(),
false,
false},
507 {
"Company recognized the cost of providing these benefits as the ", PCONT, PModel(),
false,
false},
508 {
"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(),
false,
false},
509 {
"The Company continues to fund most of the cost of these medical ", PCONT, PModel(),
false,
false},
510 {
"and life insurance benefits in the year incurred. ", PCONT, PModel(),
false,
false},
511 {
" The U.S. plan covering the parent company is the largest plan.",
512 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
513 {
"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(),
false,
false},
514 {
"physicians’ services and major medical expense benefits and life ", PCONT, PModel(),
false,
false},
515 {
"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(),
false,
false},
516 {
"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(),
false,
false},
517 {
"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(),
false,
false},
518 {
"Company portion increasing as the retiree has increased years of ", PCONT, PModel(),
false,
false},
519 {
"credited service. The Company has the ability to change these ", PCONT, PModel(),
false,
false},
520 {
"benefits at any time. ", PCONT, PModel(),
false,
false},
521 {
" Effective October 1993, the Company amended its health ", PSTART,
522 PModel(kLeft, 0, 50, 0, 0),
false,
false},
523 {
"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(),
false,
false},
524 {
"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(),
false,
false},
525 {
"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(),
false,
false},
526 {
"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(),
false,
false},
527 {
"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(),
false,
false},
528 {
"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(),
false,
false},
529 {
"for 1994 by approximately $83. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kUnlvRep3AO.
532 TEST(ParagraphsTest, TestUnlvInsurance) {
533 TestParagraphDetection(kUnlvRep3AO, ABSL_ARRAYSIZE(kUnlvRep3AO));
// Fixture: table-of-contents lines in which a run of dots connects a title to
// its page number. Every row is expected to be its own PSTART, and every model
// is PModel(kUnknown, 0, 0, 0, 0). Because the justification is kUnknown, the
// model-comparison branch in EvaluateParagraphDetection is skipped for these
// rows.
541 const TextAndModel kTableOfContents[] = {
542 {
"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
543 {
" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
544 {
" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
545 {
" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
546 {
" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
547 {
" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
548 {
" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
549 {
" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
550 {
" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
551 {
" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
552 {
" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
553 {
" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
554 {
" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0),
false,
false},
// Runs the shared driver over every row of kTableOfContents.
557 TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
558 TestParagraphDetection(kTableOfContents, ABSL_ARRAYSIZE(kTableOfContents));
// Fixture: prose paragraphs (PModel(kLeft, 0, 20, 0, 0)) surrounding a short
// source-code listing. The blank row before the listing is PNONE; the listing
// rows and the blank row after it are all marked PCONT, i.e. no new paragraph
// is expected to start inside the code.
561 const TextAndModel kTextWithSourceCode[] = {
562 {
" A typical page of a programming book may contain", PSTART,
563 PModel(kLeft, 0, 20, 0, 0),
false,
false},
564 {
"examples of source code to exemplify an algorithm ", PCONT, PModel(),
false,
false},
565 {
"being described in prose. Such examples should be", PCONT, PModel(),
false,
false},
566 {
"rendered as lineated text, meaning text with ", PCONT, PModel(),
false,
false},
567 {
"explicit line breaks but without extra inter-line ", PCONT, PModel(),
false,
false},
568 {
"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(),
false,
false},
569 {
"source code would lead to a bad reading experience", PCONT, PModel(),
false,
false},
570 {
"when the text is re-flowed. ", PCONT, PModel(),
false,
false},
571 {
" Let's show this by describing the function fact-", PSTART,
572 PModel(kLeft, 0, 20, 0, 0),
false,
false},
573 {
"orial. Factorial is a simple recursive function ", PCONT, PModel(),
false,
false},
574 {
"which grows very quickly. So quickly, in fact, ", PCONT, PModel(),
false,
false},
575 {
"that the typical C implementation will only work ", PCONT, PModel(),
false,
false},
576 {
"for values less than about 12: ", PCONT, PModel(),
false,
false},
577 {
" ", PNONE, PModel(),
false,
false},
578 {
" # Naive implementation in C ", PCONT, PModel(),
false,
false},
579 {
" int factorial(int n) { ", PCONT, PModel(),
false,
false},
580 {
" if (n < 2) ", PCONT, PModel(),
false,
false},
581 {
" return 1; ", PCONT, PModel(),
false,
false},
582 {
" return n * factorial(n - 1); ", PCONT, PModel(),
false,
false},
583 {
" } ", PCONT, PModel(),
false,
false},
584 {
" ", PCONT, PModel(),
false,
false},
585 {
" The C programming language does not have built- ", PSTART,
586 PModel(kLeft, 0, 20, 0, 0),
false,
false},
587 {
"in support for detecting integer overflow, so this", PCONT, PModel(),
false,
false},
588 {
"naive implementation simply returns random values ", PCONT, PModel(),
false,
false},
589 {
"if even a moderate sized n is provided. ", PCONT, PModel(),
false,
false},
// Runs the shared driver over every row of kTextWithSourceCode.
592 TEST(ParagraphsTest, NotDistractedBySourceCode) {
593 TestParagraphDetection(kTextWithSourceCode,
594 ABSL_ARRAYSIZE(kTextWithSourceCode));
// Fixture: a page of narrative mixed with many short dialogue paragraphs, most
// of them a single line. Every PSTART row expects PModel(kLeft, 0, 50, 0, 0).
// Some text values are written as two adjacent string literals, which the
// compiler concatenates into one string.
// NOTE(review): a few lines of this table are not shown in this excerpt, so
// some rows appear here without their text or without their trailing fields.
597 const TextAndModel kOldManAndSea[] = {
598 {
"royal palm which are called guano and in it there was a bed, a",
599 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
600 {
"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(),
false,
false},
601 {
"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(),
false,
false},
602 {
"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(),
false,
false},
603 {
"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(),
false,
false},
604 {
"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(),
false,
false},
605 {
"wife on the wall but he had taken it down because it made him too", PCONT, PModel(),
false,
false},
606 {
"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(),
false,
false},
607 {
"shirt. ", PCONT, PModel(),
false,
false},
608 {
" \"What do you have to eat?\" the boy asked. ",
609 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
610 {
" \"A pot of yellow rice with fish. Do you want some?\" ",
611 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
612 {
" \"No. I will eat at home. Do you want me to make the fire?\" ",
613 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
614 {
" \"No. I will make it later on. Or I may eat the rice cold.\" ",
615 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
616 {
" \"May I take the cast net?\" ",
617 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
619 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
620 {
" There was no cast net and the boy remembered when they had",
621 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
622 {
"sold it. But they went through this fiction every day. There was no", PCONT, PModel(),
false,
false},
623 {
"pot of yellow rice and fish and the boy knew this too. "
624 " ", PCONT, PModel(),
false,
false},
625 {
" \"Eighty-five is a lucky number,\" the old man said. \"How",
626 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
627 {
"would you like to see me bring one in that dressed out over a "
628 "thou-", PCONT, PModel(),
false,
false},
630 " ", PCONT, PModel(),
false,
false},
631 {
" \"I'll get the cast net and go for sardines. Will you sit in the "
633 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
634 {
"in the doorway?\" "
635 " ", PCONT, PModel(),
false,
false},
636 {
" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
637 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
638 {
" The boy did not know whether yesterday's paper was a fiction",
639 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
640 {
"too. But the old man brought it out from under the bed. ", PCONT, PModel(),
false,
false},
641 {
" \"Pedrico gave it to me at the bodega,\" he explained. "
643 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
644 {
" \"I'll be back when I have the sardines. I'll keep yours and mine",
645 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
646 {
"together on ice and we can share them in the morning. When I", PCONT, PModel(),
false,
false},
647 {
"come back you can tell me about the baseball.\" ", PCONT, PModel(),
false,
false},
648 {
" \"The Yankees cannot lose.\" ",
649 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
650 {
" \"But I fear the Indians of Cleveland.\" ",
651 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
652 {
" \"Have faith in the Yankees my son. Think of the great Di-",
653 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
654 {
"Maggio.\" ", PCONT, PModel(),
false,
false},
655 {
" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
656 PSTART, PModel(kLeft, 0, 50, 0, 0),
false,
false},
657 {
"land.\" ", PCONT, PModel(),
false,
false}
// Runs the shared driver over every row of kOldManAndSea.
660 TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
661 TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea));
// Fixture: a book-index page. Each entry is its own PSTART expecting
// PModel(kLeft, 0, 0, 30, 0); an entry that wraps onto a second line has that
// line (which begins with a space) marked PCONT. Some entries contain
// non-ASCII punctuation (a curly apostrophe and an em dash) in the text.
664 const TextAndModel kNewZealandIndex[] = {
665 {
"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
666 {
"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
667 {
"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
668 {
" 138 ", PCONT, PModel(),
false,
false},
669 {
"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
670 {
"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
671 {
"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
672 {
" 145 ", PCONT, PModel(),
false,
false},
673 {
"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
674 {
"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
675 {
"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
676 {
"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
677 {
"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
678 {
"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
679 {
"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
680 {
"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
681 {
"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
682 {
"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
683 {
"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
684 {
"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
685 {
"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
686 {
"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
687 {
"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
688 {
"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
689 {
"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
690 {
" 85 ", PCONT, PModel(),
false,
false},
691 {
"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
692 {
"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
693 {
"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
694 {
"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false},
695 {
"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0),
false,
false}
// Runs the shared driver over every row of kNewZealandIndex.
698 TEST(ParagraphsTest, IndexPageTest) {
699 TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex));