tesseract  5.0.0-alpha-619-ge9db
linerec.cpp
Go to the documentation of this file.
1 // File: linerec.cpp
3 // Description: Top-level line-based recognition module for Tesseract.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2013, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
17 
18 #include "tesseractclass.h"
19 
20 #include "allheaders.h"
21 #include "boxread.h"
22 #include "imagedata.h"
23 #ifndef ANDROID_BUILD
24 #include "lstmrecognizer.h"
25 #include "recodebeam.h"
26 #endif
27 #include "pageres.h"
28 #include "tprintf.h"
29 
30 #include <algorithm>
31 
32 namespace tesseract {
33 
// Scale factor applied to LSTM certainties to make them more comparable to
// the certainties produced by Tesseract.
constexpr float kCertaintyScale = 7.0f;
// Worst acceptable certainty for a dictionary word.
constexpr float kWorstDictCertainty = -25.0f;
38 
39 // Generates training data for training a line recognizer, eg LSTM.
40 // Breaks the page into lines, according to the boxes, and writes them to a
41 // serialized DocumentData based on output_basename.
42 // Return true if successful, false if an error occurred.
43 bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
44  const STRING& output_basename,
45  BLOCK_LIST *block_list) {
46  STRING lstmf_name = output_basename + ".lstmf";
47  DocumentData images(lstmf_name);
48  if (applybox_page > 0) {
49  // Load existing document for the previous pages.
50  if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
51  tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
52  return false;
53  }
54  }
55  GenericVector<TBOX> boxes;
57  // Get the boxes for this page, if there are any.
58  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
59  nullptr) ||
60  boxes.empty()) {
61  tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
62  return false;
63  }
64  TrainFromBoxes(boxes, texts, block_list, &images);
65  if (images.PagesSize() == 0) {
66  tprintf("Failed to read pages from %s\n", input_imagename.c_str());
67  return false;
68  }
69  images.Shuffle();
70  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
71  tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
72  return false;
73  }
74  return true;
75 }
76 
77 // Generates training data for training a line recognizer, eg LSTM.
78 // Breaks the boxes into lines, normalizes them, converts to ImageData and
79 // appends them to the given training_data.
81  const GenericVector<STRING>& texts,
82  BLOCK_LIST *block_list,
83  DocumentData* training_data) {
84  int box_count = boxes.size();
85  // Process all the text lines in this page, as defined by the boxes.
86  int end_box = 0;
87  // Don't let \t, which marks newlines in the box file, get into the line
88  // content, as that makes the line unusable in training.
89  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
90  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
91  // Find the textline of boxes starting at start and their bounding box.
92  TBOX line_box = boxes[start_box];
93  STRING line_str = texts[start_box];
94  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
95  ++end_box) {
96  line_box += boxes[end_box];
97  line_str += texts[end_box];
98  }
99  // Find the most overlapping block.
100  BLOCK* best_block = nullptr;
101  int best_overlap = 0;
102  BLOCK_IT b_it(block_list);
103  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
104  BLOCK* block = b_it.data();
105  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
106  continue; // Not a text block.
107  TBOX block_box = block->pdblk.bounding_box();
108  block_box.rotate(block->re_rotation());
109  if (block_box.major_overlap(line_box)) {
110  TBOX overlap_box = line_box.intersection(block_box);
111  if (overlap_box.area() > best_overlap) {
112  best_overlap = overlap_box.area();
113  best_block = block;
114  }
115  }
116  }
117  ImageData* imagedata = nullptr;
118  if (best_block == nullptr) {
119  tprintf("No block overlapping textline: %s\n", line_str.c_str());
120  } else {
121  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
122  *best_block);
123  }
124  if (imagedata != nullptr)
125  training_data->AddPageToDocument(imagedata);
126  // Don't let \t, which marks newlines in the box file, get into the line
127  // content, as that makes the line unusable in training.
128  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
129  }
130 }
131 
132 // Returns an Imagedata containing the image of the given box,
133 // and ground truth boxes/truth text if available in the input.
134 // The image is not normalized in any way.
136  const GenericVector<TBOX>& boxes,
137  const GenericVector<STRING>& texts,
138  int start_box, int end_box,
139  const BLOCK& block) {
140  TBOX revised_box;
141  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
142  &revised_box);
143  if (image_data == nullptr) return nullptr;
144  image_data->set_page_number(applybox_page);
145  // Copy the boxes and shift them so they are relative to the image.
146  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
147  ICOORD shift = -revised_box.botleft();
148  GenericVector<TBOX> line_boxes;
149  GenericVector<STRING> line_texts;
150  for (int b = start_box; b < end_box; ++b) {
151  TBOX box = boxes[b];
152  box.rotate(block_rotation);
153  box.move(shift);
154  line_boxes.push_back(box);
155  line_texts.push_back(texts[b]);
156  }
157  GenericVector<int> page_numbers;
158  page_numbers.init_to_size(line_boxes.size(), applybox_page);
159  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
160  return image_data;
161 }
162 
163 // Helper gets the image of a rectangle, using the block.re_rotation() if
164 // needed to get to the image, and rotating the result back to horizontal
165 // layout. (CJK characters will be on their left sides) The vertical text flag
166 // is set in the returned ImageData if the text was originally vertical, which
167 // can be used to invoke a different CJK recognition engine. The revised_box
168 // is also returned to enable calculation of output bounding boxes.
169 ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
170  int padding, TBOX* revised_box) const {
171  TBOX wbox = box;
172  wbox.pad(padding, padding);
173  *revised_box = wbox;
174  // Number of clockwise 90 degree rotations needed to get back to tesseract
175  // coords from the clipped image.
176  int num_rotations = 0;
177  if (block.re_rotation().y() > 0.0f)
178  num_rotations = 1;
179  else if (block.re_rotation().x() < 0.0f)
180  num_rotations = 2;
181  else if (block.re_rotation().y() < 0.0f)
182  num_rotations = 3;
183  // Handle two cases automatically: 1 the box came from the block, 2 the box
184  // came from a box file, and refers to the image, which the block may not.
185  if (block.pdblk.bounding_box().major_overlap(*revised_box))
186  revised_box->rotate(block.re_rotation());
187  // Now revised_box always refers to the image.
188  // BestPix is never colormapped, but may be of any depth.
189  Pix* pix = BestPix();
190  int width = pixGetWidth(pix);
191  int height = pixGetHeight(pix);
192  TBOX image_box(0, 0, width, height);
193  // Clip to image bounds;
194  *revised_box &= image_box;
195  if (revised_box->null_box()) return nullptr;
196  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
197  revised_box->width(), revised_box->height());
198  Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
199  if (box_pix == nullptr) return nullptr;
200  boxDestroy(&clip_box);
201  if (num_rotations > 0) {
202  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
203  pixDestroy(&box_pix);
204  box_pix = rot_pix;
205  }
206  // Convert sub-8-bit images to 8 bit.
207  int depth = pixGetDepth(box_pix);
208  if (depth < 8) {
209  Pix* grey;
210  grey = pixConvertTo8(box_pix, false);
211  pixDestroy(&box_pix);
212  box_pix = grey;
213  }
214  bool vertical_text = false;
215  if (num_rotations > 0) {
216  // Rotated the clipped revised box back to internal coordinates.
217  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
218  revised_box->rotate(rotation);
219  if (num_rotations != 2)
220  vertical_text = true;
221  }
222  return new ImageData(vertical_text, box_pix);
223 }
224 
225 #ifndef ANDROID_BUILD
226 // Recognizes a word or group of words, converting to WERD_RES in *words.
227 // Analogous to classify_word_pass1, but can handle a group of words as well.
228 void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
229  PointerVector<WERD_RES>* words) {
230  TBOX word_box = word->word->bounding_box();
231  // Get the word image - no frills.
234  // In single word mode, use the whole image without any other row/word
235  // interpretation.
236  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
237  } else {
238  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
239  if (baseline + row->descenders() < word_box.bottom())
240  word_box.set_bottom(baseline + row->descenders());
241  if (baseline + row->x_height() + row->ascenders() > word_box.top())
242  word_box.set_top(baseline + row->x_height() + row->ascenders());
243  }
244  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
245  if (im_data == nullptr) return;
246 
247  bool do_invert = tessedit_do_invert;
248  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
250  word_box, words, lstm_choice_mode,
252  delete im_data;
253  SearchWords(words);
254 }
255 
256 // Apply segmentation search to the given set of words, within the constraints
257 // of the existing ratings matrix. If there is already a best_choice on a word
258 // leaves it untouched and just sets the done/accepted etc flags.
260  // Run the segmentation search on the network outputs and make a BoxWord
261  // for each of the output words.
262  // If we drop a word as junk, then there is always a space in front of the
263  // next.
264  const Dict* stopper_dict = lstm_recognizer_->GetDict();
265  if (stopper_dict == nullptr) stopper_dict = &getDict();
266  bool any_nonspace_delimited = false;
267  for (int w = 0; w < words->size(); ++w) {
268  WERD_RES* word = (*words)[w];
269  if (word->best_choice != nullptr &&
271  any_nonspace_delimited = true;
272  break;
273  }
274  }
275  for (int w = 0; w < words->size(); ++w) {
276  WERD_RES* word = (*words)[w];
277  if (word->best_choice == nullptr) {
278  // It is a dud.
279  word->SetupFake(lstm_recognizer_->GetUnicharset());
280  } else {
281  // Set the best state.
282  for (int i = 0; i < word->best_choice->length(); ++i) {
283  int length = word->best_choice->state(i);
284  word->best_state.push_back(length);
285  }
286  word->reject_map.initialise(word->best_choice->length());
287  word->tess_failed = false;
288  word->tess_accepted = true;
289  word->tess_would_adapt = false;
290  word->done = true;
291  word->tesseract = this;
292  float word_certainty = std::min(word->space_certainty,
293  word->best_choice->certainty());
294  word_certainty *= kCertaintyScale;
295  if (getDict().stopper_debug_level >= 1) {
296  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
297  word->best_choice->certainty(), word->space_certainty,
298  std::min(word->space_certainty, word->best_choice->certainty()) *
300  word_certainty);
301  word->best_choice->print();
302  }
303  word->best_choice->set_certainty(word_certainty);
304 
305  word->tess_accepted = stopper_dict->AcceptableResult(word);
306  }
307  }
308 }
309 #endif // ANDROID_BUILD
310 
311 } // namespace tesseract.
WERD_RES::done
bool done
Definition: pageres.h:299
TBOX
Definition: cleanapi_test.cc:19
tesseract::DocumentData::Shuffle
void Shuffle()
Definition: imagedata.cpp:495
TBOX::move
void move(const ICOORD vec)
Definition: rect.h:156
pageres.h
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
ROW::base_line
float base_line(float xpos) const
Definition: ocrrow.h:58
ROW::descenders
float descenders() const
Definition: ocrrow.h:84
PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:58
boxread.h
tesseract::Tesseract::lstm_choice_mode
int lstm_choice_mode
Definition: tesseractclass.h:1086
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
tesseract::Tesseract::tessedit_do_invert
bool tessedit_do_invert
Definition: tesseractclass.h:795
tesseractclass.h
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
tesseract::DocumentData::PagesSize
size_t PagesSize() const
Definition: imagedata.h:236
tesseract::PSM_RAW_LINE
Definition: publictypes.h:176
baseline
Definition: mfoutline.h:62
WERD_CHOICE::set_certainty
void set_certainty(float new_val)
Definition: ratngs.h:360
FCOORD::y
float y() const
Definition: points.h:209
ICOORD
integer coordinate
Definition: points.h:30
tesseract::kCertaintyScale
const float kCertaintyScale
Definition: linerec.cpp:35
tesseract::ImageData::AddBoxes
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:311
tesseract::PointerVector< WERD_RES >
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
FCOORD::x
float x() const
Definition: points.h:206
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::area
int32_t area() const
Definition: rect.h:121
STRING
Definition: strngs.h:45
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
recodebeam.h
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
WERD_RES
Definition: pageres.h:160
tesseract::Tesseract::tessedit_pageseg_mode
int tessedit_pageseg_mode
Definition: tesseractclass.h:799
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
tesseract::Tesseract::LSTMRecognizeWord
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:228
WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:307
FCOORD
Definition: points.h:187
tesseract::LSTMRecognizer::GetUnicharset
const UNICHARSET & GetUnicharset() const
Definition: lstmrecognizer.h:132
tesseract::ImageData
Definition: imagedata.h:104
TBOX::rotate
void rotate(const FCOORD &vec)
Definition: rect.h:196
TBOX::height
int16_t height() const
Definition: rect.h:107
WERD_RES::space_certainty
float space_certainty
Definition: pageres.h:315
tesseract::PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:170
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
tesseract::Tesseract::lstm_choice_iterations
int lstm_choice_iterations
Definition: tesseractclass.h:1090
tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:231
tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:116
tesseract::DocumentData::SaveDocument
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:406
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
tesseract::LSTMRecognizer::RecognizeLine
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)
Definition: lstmrecognizer.cpp:187
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
tesseract::kImagePadding
const int kImagePadding
Definition: imagedata.h:38
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Tesseract::GetRectImage
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:169
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
TBOX::null_box
bool null_box() const
Definition: rect.h:49
tesseract::kWorstDictCertainty
const float kWorstDictCertainty
Definition: linerec.cpp:37
ROW::ascenders
float ascenders() const
Definition: ocrrow.h:81
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
TBOX::width
int16_t width() const
Definition: rect.h:114
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:564
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::applybox_page
int applybox_page
Definition: tesseractclass.h:824
tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:253
lstmrecognizer.h
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract
Definition: baseapi.h:65
WERD_CHOICE::ContainsAnyNonSpaceDelimited
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:502
tesseract::DocumentData::AddPageToDocument
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:424
TBOX::botleft
const ICOORD & botleft() const
Definition: rect.h:91
tesseract::DocumentData::LoadDocument
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:387
tprintf.h
ReadAllBoxes
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:71
GenericVector< TBOX >
tesseract::Tesseract::TrainFromBoxes
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:80
WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:298
tesseract::Dict
Definition: dict.h:91
tesseract::Tesseract::TrainLineRecognizer
bool TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:43
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:348
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
TBOX::pad
void pad(int xpad, int ypad)
Definition: rect.h:130
imagedata.h
WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:274
TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:362
TBOX::left
int16_t left() const
Definition: rect.h:71
ROW
Definition: ocrrow.h:35
TBOX::right
int16_t right() const
Definition: rect.h:78
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tesseract::ImageData::set_page_number
void set_page_number(int num)
Definition: imagedata.h:134
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::LSTMRecognizer::GetDict
const Dict * GetDict() const
Definition: lstmrecognizer.h:137
tesseract::DocumentData
Definition: imagedata.h:208
tesseract::Tesseract::SearchWords
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:259
tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:250
WERD_RES::word
WERD * word
Definition: pageres.h:180
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::Tesseract::GetLineData
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:135
BLOCK::re_rotation
FCOORD re_rotation() const
Definition: ocrblock.h:133
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:430
TBOX
Definition: rect.h:33