tesseract  5.0.0-alpha-619-ge9db
PAGE_RES_IT Class Reference

#include <pageres.h>

Public Member Functions

 PAGE_RES_IT ()=default
 
 PAGE_RES_IT (PAGE_RES *the_page_res)
 
bool operator== (const PAGE_RES_IT &other) const
 
bool operator!= (const PAGE_RES_IT &other) const
 
int cmp (const PAGE_RES_IT &other) const
 
WERD_RES * restart_page ()
 
WERD_RES * restart_page_with_empties ()
 
WERD_RES * start_page (bool empty_ok)
 
WERD_RES * restart_row ()
 
WERD_RES * InsertSimpleCloneWord (const WERD_RES &clone_res, WERD *new_word)
 
void ReplaceCurrentWord (tesseract::PointerVector< WERD_RES > *words)
 
void DeleteCurrentWord ()
 
void MakeCurrentWordFuzzy ()
 
WERD_RES * forward ()
 
WERD_RES * forward_with_empties ()
 
WERD_RES * forward_paragraph ()
 
WERD_RES * forward_block ()
 
WERD_RES * prev_word () const
 
ROW_RES * prev_row () const
 
BLOCK_RES * prev_block () const
 
WERD_RES * word () const
 
ROW_RES * row () const
 
BLOCK_RES * block () const
 
WERD_RES * next_word () const
 
ROW_RES * next_row () const
 
BLOCK_RES * next_block () const
 
void rej_stat_word ()
 
void ResetWordIterator ()
 

Public Attributes

PAGE_RES * page_res
 

Detailed Description

Definition at line 668 of file pageres.h.

Constructor & Destructor Documentation

◆ PAGE_RES_IT() [1/2]

PAGE_RES_IT::PAGE_RES_IT ( )
default

◆ PAGE_RES_IT() [2/2]

PAGE_RES_IT::PAGE_RES_IT ( PAGE_RES * the_page_res)
inline

Definition at line 675 of file pageres.h.

675  :
676  PAGE_RES * page_res; // page being iterated
677 
678  PAGE_RES_IT() = default;

Member Function Documentation

◆ block()

BLOCK_RES* PAGE_RES_IT::block ( ) const
inline

Definition at line 754 of file pageres.h.

756  { // row of current word

◆ cmp()

int PAGE_RES_IT::cmp ( const PAGE_RES_IT & other) const

Definition at line 1141 of file pageres.cpp.

1144  {
1145  ASSERT_HOST(page_res == other.page_res);
1146  if (other.block_res == nullptr) {
1147  // other points to the end of the page.
1148  if (block_res == nullptr)
1149  return 0;
1150  return -1;
1151  }
1152  if (block_res == nullptr) {
1153  return 1; // we point to the end of the page.
1154  }
1155  if (block_res == other.block_res) {
1156  if (other.row_res == nullptr || row_res == nullptr) {
1157  // this should only happen if we hit an image block.
1158  return 0;
1159  }
1160  if (row_res == other.row_res) {
1161  // we point to the same block and row.
1162  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1163  if (word_res == other.word_res) {
1164  // we point to the same word!
1165  return 0;
1166  }
1167 
1168  WERD_RES_IT word_res_it(&row_res->word_res_list);
1169  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1170  word_res_it.forward()) {
1171  if (word_res_it.data() == word_res) {
1172  return -1;
1173  } else if (word_res_it.data() == other.word_res) {
1174  return 1;
1175  }
1176  }
1177  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1178  }
1179 
1180  // we both point to the same block, but different rows.
1181  ROW_RES_IT row_res_it(&block_res->row_res_list);
1182  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1183  row_res_it.forward()) {
1184  if (row_res_it.data() == row_res) {
1185  return -1;
1186  } else if (row_res_it.data() == other.row_res) {
1187  return 1;
1188  }
1189  }
1190  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1191  }
1192 
1193  // We point to different blocks.
1194  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1195  for (block_res_it.mark_cycle_pt();
1196  !block_res_it.cycled_list(); block_res_it.forward()) {
1197  if (block_res_it.data() == block_res) {
1198  return -1;
1199  } else if (block_res_it.data() == other.block_res) {
1200  return 1;
1201  }
1202  }
1203  // Shouldn't happen...

◆ DeleteCurrentWord()

void PAGE_RES_IT::DeleteCurrentWord ( )

Definition at line 1436 of file pageres.cpp.

1439  {
1440  // Check that this word is as we expect. part_of_combos are NEVER iterated
1441  // by the normal iterator, so we should never be trying to delete them.
1442  ASSERT_HOST(!word_res->part_of_combo);
1443  if (!word_res->combination) {
1444  // Combinations own their own word, so we won't find the word on the
1445  // row's word_list, but it is legitimate to try to delete them.
1446  // Delete word from the ROW when not a combination.
1447  WERD_IT w_it(row()->row->word_list());
1448  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1449  if (w_it.data() == word_res->word) {
1450  break;
1451  }
1452  }
1453  ASSERT_HOST(!w_it.cycled_list());
1454  delete w_it.extract();
1455  }
1456  // Remove the WERD_RES for the new_word.
1457  // Remove the WORD_RES from the ROW_RES.
1458  WERD_RES_IT wr_it(&row()->word_res_list);
1459  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1460  if (wr_it.data() == word_res) {
1461  word_res = nullptr;
1462  break;
1463  }
1464  }
1465  ASSERT_HOST(!wr_it.cycled_list());

◆ forward()

WERD_RES* PAGE_RES_IT::forward ( )
inline

Definition at line 728 of file pageres.h.

733  { // Get next word.

◆ forward_block()

WERD_RES * PAGE_RES_IT::forward_block ( )

Definition at line 1651 of file pageres.cpp.

◆ forward_paragraph()

WERD_RES * PAGE_RES_IT::forward_paragraph ( )

Definition at line 1637 of file pageres.cpp.

1644  {

◆ forward_with_empties()

WERD_RES* PAGE_RES_IT::forward_with_empties ( )
inline

Definition at line 732 of file pageres.h.

733  { // Get next word.
734  return internal_forward(false, false);

◆ InsertSimpleCloneWord()

WERD_RES * PAGE_RES_IT::InsertSimpleCloneWord ( const WERD_RES & clone_res,
WERD * new_word
)

Definition at line 1209 of file pageres.cpp.

1213  {
1214  // Make a WERD_RES for the new_word.
1215  auto* new_res = new WERD_RES(new_word);
1216  new_res->CopySimpleFields(clone_res);
1217  new_res->combination = true;
1218  // Insert into the appropriate place in the ROW_RES.
1219  WERD_RES_IT wr_it(&row()->word_res_list);
1220  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1221  WERD_RES* word = wr_it.data();
1222  if (word == word_res)
1223  break;
1224  }
1225  ASSERT_HOST(!wr_it.cycled_list());
1226  wr_it.add_before_then_move(new_res);
1227  if (wr_it.at_first()) {
1228  // This is the new first word, so reset the member iterator so it
1229  // detects the cycled_list state correctly.

◆ MakeCurrentWordFuzzy()

void PAGE_RES_IT::MakeCurrentWordFuzzy ( )

Definition at line 1469 of file pageres.cpp.

1472  {
1473  WERD* real_word = word_res->word;
1474  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1475  real_word->set_flag(W_FUZZY_SP, true);
1476  if (word_res->combination) {
1477  // The next word should be the corresponding part of combo, but we have
1478  // already stepped past it, so find it by search.
1479  WERD_RES_IT wr_it(&row()->word_res_list);
1480  for (wr_it.mark_cycle_pt();
1481  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1482  }
1483  wr_it.forward();
1484  ASSERT_HOST(wr_it.data()->part_of_combo);
1485  real_word = wr_it.data()->word;
1486  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1487  !real_word->flag(W_FUZZY_NON));
1488  real_word->set_flag(W_FUZZY_SP, true);

◆ next_block()

BLOCK_RES* PAGE_RES_IT::next_block ( ) const
inline

Definition at line 763 of file pageres.h.

765  { // row of next word

◆ next_row()

ROW_RES* PAGE_RES_IT::next_row ( ) const
inline

Definition at line 760 of file pageres.h.

762  { // next word

◆ next_word()

WERD_RES* PAGE_RES_IT::next_word ( ) const
inline

Definition at line 757 of file pageres.h.

759  { // block of cur. word

◆ operator!=()

bool PAGE_RES_IT::operator!= ( const PAGE_RES_IT & other) const
inline

Definition at line 687 of file pageres.h.

687 {

◆ operator==()

bool PAGE_RES_IT::operator== ( const PAGE_RES_IT & other) const
inline

Definition at line 682 of file pageres.h.

687  {

◆ prev_block()

BLOCK_RES* PAGE_RES_IT::prev_block ( ) const
inline

Definition at line 745 of file pageres.h.

747  { // row of prev word

◆ prev_row()

ROW_RES* PAGE_RES_IT::prev_row ( ) const
inline

Definition at line 742 of file pageres.h.

744  { // previous word

◆ prev_word()

WERD_RES* PAGE_RES_IT::prev_word ( ) const
inline

Definition at line 739 of file pageres.h.

744  { // previous word

◆ rej_stat_word()

void PAGE_RES_IT::rej_stat_word ( )

Definition at line 1658 of file pageres.cpp.

1659  {
1660  while (block_res == next_block_res) {
1661  internal_forward(false, true);
1662  }
1663  return internal_forward(false, true);
1664 }
1665 
1667  int16_t chars_in_word;
1668  int16_t rejects_in_word = 0;
1669 
1670  chars_in_word = word_res->reject_map.length ();
1671  page_res->char_count += chars_in_word;
1672  block_res->char_count += chars_in_word;
1673  row_res->char_count += chars_in_word;
1674 

◆ ReplaceCurrentWord()

void PAGE_RES_IT::ReplaceCurrentWord ( tesseract::PointerVector< WERD_RES > *  words)

Definition at line 1329 of file pageres.cpp.

1333  {
1334  if (words->empty()) {
1336  return;
1337  }
1338  WERD_RES* input_word = word();
1339  // Set the BOL/EOL flags on the words from the input word.
1340  if (input_word->word->flag(W_BOL)) {
1341  (*words)[0]->word->set_flag(W_BOL, true);
1342  } else {
1343  (*words)[0]->word->set_blanks(input_word->word->space());
1344  }
1345  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1346 
1347  // Move the blobs from the input word to the new set of words.
1348  // If the input word_res is a combination, then the replacements will also be
1349  // combinations, and will own their own words. If the input word_res is not a
1350  // combination, then the final replacements will not be either, (although it
1351  // is allowed for the input words to be combinations) and their words
1352  // will get put on the row list. This maintains the ownership rules.
1353  WERD_IT w_it(row()->row->word_list());
1354  if (!input_word->combination) {
1355  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1356  WERD* word = w_it.data();
1357  if (word == input_word->word)
1358  break;
1359  }
1360  // w_it is now set to the input_word's word.
1361  ASSERT_HOST(!w_it.cycled_list());
1362  }
1363  // Insert into the appropriate place in the ROW_RES.
1364  WERD_RES_IT wr_it(&row()->word_res_list);
1365  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1366  WERD_RES* word = wr_it.data();
1367  if (word == input_word)
1368  break;
1369  }
1370  ASSERT_HOST(!wr_it.cycled_list());
1371  // Since we only have an estimate of the bounds between blobs, use the blob
1372  // x-middle as the determiner of where to put the blobs
1373  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1374  src_b_it.sort(&C_BLOB::SortByXMiddle);
1375  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1376  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1377  TBOX clip_box;
1378  for (int w = 0; w < words->size(); ++w) {
1379  WERD_RES* word_w = (*words)[w];
1380  clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1381  // Compute blob boundaries.
1382  GenericVector<int> blob_ends;
1383  C_BLOB_LIST* next_word_blobs =
1384  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1385  ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1386  // Remove the fake blobs on the current word, but keep safe for back-up if
1387  // no blob can be found.
1388  C_BLOB_LIST fake_blobs;
1389  C_BLOB_IT fake_b_it(&fake_blobs);
1390  fake_b_it.add_list_after(word_w->word->cblob_list());
1391  fake_b_it.move_to_first();
1392  word_w->word->cblob_list()->clear();
1393  C_BLOB_IT dest_it(word_w->word->cblob_list());
1394  // Build the box word as we move the blobs.
1395  auto* box_word = new tesseract::BoxWord;
1396  for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1397  int end_x = blob_ends[i];
1398  TBOX blob_box;
1399  // Add the blobs up to end_x.
1400  while (!src_b_it.empty() &&
1401  src_b_it.data()->bounding_box().x_middle() < end_x) {
1402  blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1403  src_b_it.forward();
1404  }
1405  while (!rej_b_it.empty() &&
1406  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1407  blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1408  rej_b_it.forward();
1409  }
1410  if (blob_box.null_box()) {
1411  // Use the original box as a back-up.
1412  blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1413  }
1414  box_word->InsertBox(i, blob_box);
1415  }
1416  delete word_w->box_word;
1417  word_w->box_word = box_word;
1418  if (!input_word->combination) {
1419  // Insert word_w->word into the ROW. It doesn't own its word, so the
1420  // ROW needs to own it.
1421  w_it.add_before_stay_put(word_w->word);
1422  word_w->combination = false;
1423  }
1424  (*words)[w] = nullptr; // We are taking ownership.
1425  wr_it.add_before_stay_put(word_w);
1426  }
1427  // We have taken ownership of the words.
1428  words->clear();
1429  // Delete the current word, which has been replaced. We could just call
1430  // DeleteCurrentWord, but that would iterate both lists again, and we know
1431  // we are already in the right place.
1432  if (!input_word->combination)
1433  delete w_it.extract();

◆ ResetWordIterator()

void PAGE_RES_IT::ResetWordIterator ( )

Definition at line 1518 of file pageres.cpp.

1522  {
1523  if (row_res == next_row_res) {
1524  // Reset the member iterator so it can move forward and detect the
1525  // cycled_list state correctly.
1526  word_res_it.move_to_first();
1527  for (word_res_it.mark_cycle_pt();
1528  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1529  word_res_it.forward()) {
1530  if (!word_res_it.data()->part_of_combo) {
1531  if (prev_row_res == row_res) prev_word_res = word_res;
1532  word_res = word_res_it.data();
1533  }
1534  }
1535  ASSERT_HOST(!word_res_it.cycled_list());
1536  wr_it_of_next_word = word_res_it;
1537  word_res_it.forward();
1538  } else {
1539  // word_res_it is OK, but reset word_res and prev_word_res if needed.
1540  WERD_RES_IT wr_it(&row_res->word_res_list);
1541  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1542  if (!wr_it.data()->part_of_combo) {
1543  if (prev_row_res == row_res) prev_word_res = word_res;
1544  word_res = wr_it.data();

◆ restart_page()

WERD_RES* PAGE_RES_IT::restart_page ( )
inline

Definition at line 695 of file pageres.h.

700  {

◆ restart_page_with_empties()

WERD_RES* PAGE_RES_IT::restart_page_with_empties ( )
inline

Definition at line 698 of file pageres.h.

700  {

◆ restart_row()

WERD_RES * PAGE_RES_IT::restart_row ( )

Definition at line 1623 of file pageres.cpp.

1629  {
1630  ROW_RES *row = this->row();

◆ row()

ROW_RES* PAGE_RES_IT::row ( ) const
inline

Definition at line 751 of file pageres.h.

753  { // current word

◆ start_page()

WERD_RES * PAGE_RES_IT::start_page ( bool  empty_ok)

Definition at line 1495 of file pageres.cpp.

1499  {
1500  block_res_it.set_to_list(&page_res->block_res_list);
1501  block_res_it.mark_cycle_pt();
1502  prev_block_res = nullptr;
1503  prev_row_res = nullptr;
1504  prev_word_res = nullptr;
1505  block_res = nullptr;
1506  row_res = nullptr;
1507  word_res = nullptr;
1508  next_block_res = nullptr;
1509  next_row_res = nullptr;

◆ word()

WERD_RES* PAGE_RES_IT::word ( ) const
inline

Definition at line 748 of file pageres.h.

750  { // block of prev word

Member Data Documentation

◆ page_res

PAGE_RES* PAGE_RES_IT::page_res

Definition at line 671 of file pageres.h.


The documentation for this class was generated from the following files: pageres.h, pageres.cpp
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:114
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
PAGE_RES_IT::PAGE_RES_IT
PAGE_RES_IT()=default
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
WERD_RES::combination
bool combination
Definition: pageres.h:333
WERD_RES
Definition: pageres.h:160
tesseract::PointerVector::clear
void clear()
Definition: genericvector.h:490
C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:78
GenericVector::back
T & back() const
Definition: genericvector.h:728
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
ROW_RES::char_count
int32_t char_count
Definition: pageres.h:137
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:76
WERD::space
uint8_t space()
Definition: werd.h:98
TBOX::null_box
bool null_box() const
Definition: rect.h:49
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
W_EOL
end of line
Definition: werd.h:47
PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1436
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
PAGE_RES
Definition: pageres.h:73
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
GenericVector< int >
BLOCK_RES::row_res_list
ROW_RES_LIST row_res_list
Definition: pageres.h:122
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
ROW_RES
Definition: pageres.h:133
WERD
Definition: werd.h:55
ROW_RES::word_res_list
WERD_RES_LIST word_res_list
Definition: pageres.h:140
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
WERD_RES::word
WERD * word
Definition: pageres.h:180
GenericVector::size
int size() const
Definition: genericvector.h:71
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:89
PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1518
W_BOL
start of line
Definition: werd.h:46
TBOX
Definition: rect.h:33
tesseract::BoxWord
Definition: boxword.h:36