tesseract  5.0.0-alpha-619-ge9db
imagedata.cpp
Go to the documentation of this file.
1 // File: imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 // training file and its corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "imagedata.h"
25 
26 #include <cinttypes> // for PRId64
27 #include <thread> // for std::thread
28 
29 #include "allheaders.h" // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
30 #include "boxread.h" // for ReadMemBoxes
31 #include "callcpp.h" // for window_wait
32 #include <tesseract/helpers.h> // for IntCastRounded, TRand, ClipToRange, Modulo
33 #include "rect.h" // for TBOX
34 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
35 #include <tesseract/serialis.h> // for TFile
36 #include "tprintf.h" // for tprintf
37 
// Number of documents to read ahead while training. Doesn't need to be very
// large.
constexpr int kMaxReadAhead = 8;
41 
42 namespace tesseract {
43 
44 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
45 }
46 
47 WordFeature::WordFeature(const FCOORD& fcoord, uint8_t dir)
48  : x_(IntCastRounded(fcoord.x())),
49  y_(ClipToRange<int>(IntCastRounded(fcoord.y()), 0, UINT8_MAX)),
50  dir_(dir) {
51 }
52 
53 // Computes the maximum x and y value in the features.
55  int* max_x, int* max_y) {
56  *max_x = 0;
57  *max_y = 0;
58  for (int f = 0; f < features.size(); ++f) {
59  if (features[f].x_ > *max_x) *max_x = features[f].x_;
60  if (features[f].y_ > *max_y) *max_y = features[f].y_;
61  }
62 }
63 
64 // Draws the features in the given window.
66  ScrollView* window) {
67 #ifndef GRAPHICS_DISABLED
68  for (int f = 0; f < features.size(); ++f) {
69  FCOORD pos(features[f].x_, features[f].y_);
70  FCOORD dir;
71  dir.from_direction(features[f].dir_);
72  dir *= 8.0f;
73  window->SetCursor(IntCastRounded(pos.x() - dir.x()),
74  IntCastRounded(pos.y() - dir.y()));
75  window->DrawTo(IntCastRounded(pos.x() + dir.x()),
76  IntCastRounded(pos.y() + dir.y()));
77  }
78 #endif
79 }
80 
81 // Writes to the given file. Returns false in case of error.
82 bool WordFeature::Serialize(FILE* fp) const {
83  return tesseract::Serialize(fp, &x_) &&
84  tesseract::Serialize(fp, &y_) &&
85  tesseract::Serialize(fp, &dir_);
86 }
87 
88 // Reads from the given file. Returns false in case of error.
89 bool WordFeature::DeSerialize(bool swap, FILE* fp) {
90  if (!tesseract::DeSerialize(fp, &x_)) return false;
91  if (swap) ReverseN(&x_, sizeof(x_));
92  return tesseract::DeSerialize(fp, &y_) &&
93  tesseract::DeSerialize(fp, &dir_);
94 }
95 
97  const GenericVector<WordFeature>& word_features,
98  GenericVector<FloatWordFeature>* float_features) {
99  for (int i = 0; i < word_features.size(); ++i) {
101  f.x = word_features[i].x();
102  f.y = word_features[i].y();
103  f.dir = word_features[i].dir();
104  f.x_bucket = 0; // Will set it later.
105  float_features->push_back(f);
106  }
107 }
108 
109 // Sort function to sort first by x-bucket, then by y.
110 /* static */
111 int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) {
112  const auto* f1 = static_cast<const FloatWordFeature*>(v1);
113  const auto* f2 = static_cast<const FloatWordFeature*>(v2);
114  int x_diff = f1->x_bucket - f2->x_bucket;
115  if (x_diff == 0) return f1->y - f2->y;
116  return x_diff;
117 }
118 
119 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {
120 }
121 // Takes ownership of the pix and destroys it.
122 ImageData::ImageData(bool vertical, Pix* pix)
123  : page_number_(0), vertical_text_(vertical) {
124  SetPix(pix);
125 }
127 }
128 
129 // Builds and returns an ImageData from the basic data. Note that imagedata,
130 // truth_text, and box_text are all the actual file data, NOT filenames.
131 ImageData* ImageData::Build(const char* name, int page_number, const char* lang,
132  const char* imagedata, int imagedatasize,
133  const char* truth_text, const char* box_text) {
134  auto* image_data = new ImageData();
135  image_data->imagefilename_ = name;
136  image_data->page_number_ = page_number;
137  image_data->language_ = lang;
138  // Save the imagedata.
139  image_data->image_data_.resize_no_init(imagedatasize);
140  memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
141  if (!image_data->AddBoxes(box_text)) {
142  if (truth_text == nullptr || truth_text[0] == '\0') {
143  tprintf("Error: No text corresponding to page %d from image %s!\n",
144  page_number, name);
145  delete image_data;
146  return nullptr;
147  }
148  image_data->transcription_ = truth_text;
149  // If we have no boxes, the transcription is in the 0th box_texts_.
150  image_data->box_texts_.push_back(truth_text);
151  // We will create a box for the whole image on PreScale, to save unpacking
152  // the image now.
153  } else if (truth_text != nullptr && truth_text[0] != '\0' &&
154  image_data->transcription_ != truth_text) {
155  // Save the truth text as it is present and disagrees with the box text.
156  image_data->transcription_ = truth_text;
157  }
158  return image_data;
159 }
160 
161 // Writes to the given file. Returns false in case of error.
162 bool ImageData::Serialize(TFile* fp) const {
163  if (!imagefilename_.Serialize(fp)) return false;
164  if (!fp->Serialize(&page_number_)) return false;
165  if (!image_data_.Serialize(fp)) return false;
166  if (!language_.Serialize(fp)) return false;
167  if (!transcription_.Serialize(fp)) return false;
168  // WARNING: Will not work across different endian machines.
169  if (!boxes_.Serialize(fp)) return false;
170  if (!box_texts_.SerializeClasses(fp)) return false;
171  int8_t vertical = vertical_text_;
172  return fp->Serialize(&vertical);
173 }
174 
175 // Reads from the given file. Returns false in case of error.
176 // If swap is true, assumes a big/little-endian swap is needed.
178  if (!imagefilename_.DeSerialize(fp)) return false;
179  if (!fp->DeSerialize(&page_number_)) return false;
180  if (!image_data_.DeSerialize(fp)) return false;
181  if (!language_.DeSerialize(fp)) return false;
182  if (!transcription_.DeSerialize(fp)) return false;
183  // WARNING: Will not work across different endian machines.
184  if (!boxes_.DeSerialize(fp)) return false;
185  if (!box_texts_.DeSerializeClasses(fp)) return false;
186  int8_t vertical = 0;
187  if (!fp->DeSerialize(&vertical)) return false;
188  vertical_text_ = vertical != 0;
189  return true;
190 }
191 
192 // As DeSerialize, but only seeks past the data - hence a static method.
194  if (!STRING::SkipDeSerialize(fp)) return false;
195  int32_t page_number;
196  if (!fp->DeSerialize(&page_number)) return false;
197  if (!GenericVector<char>::SkipDeSerialize(fp)) return false;
198  if (!STRING::SkipDeSerialize(fp)) return false;
199  if (!STRING::SkipDeSerialize(fp)) return false;
200  if (!GenericVector<TBOX>::SkipDeSerialize(fp)) return false;
201  if (!GenericVector<STRING>::SkipDeSerializeClasses(fp)) return false;
202  int8_t vertical = 0;
203  return fp->DeSerialize(&vertical);
204 }
205 
206 // Saves the given Pix as a PNG-encoded string and destroys it.
207 // In case of missing PNG support in Leptonica use PNM format,
208 // which requires more memory.
209 void ImageData::SetPix(Pix* pix) {
210  SetPixInternal(pix, &image_data_);
211 }
212 
213 // Returns the Pix image for *this. Must be pixDestroyed after use.
214 Pix* ImageData::GetPix() const {
215  return GetPixInternal(image_data_);
216 }
217 
218 // Gets anything and everything with a non-nullptr pointer, prescaled to a
219 // given target_height (if 0, then the original image height), and aligned.
220 // Also returns (if not nullptr) the width and height of the scaled image.
221 // The return value is the scaled Pix, which must be pixDestroyed after use,
222 // and scale_factor (if not nullptr) is set to the scale factor that was applied
223 // to the image to achieve the target_height.
224 Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
225  int* scaled_width, int* scaled_height,
226  GenericVector<TBOX>* boxes) const {
227  int input_width = 0;
228  int input_height = 0;
229  Pix* src_pix = GetPix();
230  ASSERT_HOST(src_pix != nullptr);
231  input_width = pixGetWidth(src_pix);
232  input_height = pixGetHeight(src_pix);
233  if (target_height == 0) {
234  target_height = std::min(input_height, max_height);
235  }
236  float im_factor = static_cast<float>(target_height) / input_height;
237  if (scaled_width != nullptr)
238  *scaled_width = IntCastRounded(im_factor * input_width);
239  if (scaled_height != nullptr)
240  *scaled_height = target_height;
241  // Get the scaled image.
242  Pix* pix = pixScale(src_pix, im_factor, im_factor);
243  if (pix == nullptr) {
244  tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
245  input_width, input_height, im_factor);
246  }
247  if (scaled_width != nullptr) *scaled_width = pixGetWidth(pix);
248  if (scaled_height != nullptr) *scaled_height = pixGetHeight(pix);
249  pixDestroy(&src_pix);
250  if (boxes != nullptr) {
251  // Get the boxes.
252  boxes->truncate(0);
253  for (int b = 0; b < boxes_.size(); ++b) {
254  TBOX box = boxes_[b];
255  box.scale(im_factor);
256  boxes->push_back(box);
257  }
258  if (boxes->empty()) {
259  // Make a single box for the whole image.
260  TBOX box(0, 0, im_factor * input_width, target_height);
261  boxes->push_back(box);
262  }
263  }
264  if (scale_factor != nullptr) *scale_factor = im_factor;
265  return pix;
266 }
267 
269  return image_data_.size();
270 }
271 
272 // Draws the data in a new window.
273 void ImageData::Display() const {
274 #ifndef GRAPHICS_DISABLED
275  const int kTextSize = 64;
276  // Draw the image.
277  Pix* pix = GetPix();
278  if (pix == nullptr) return;
279  int width = pixGetWidth(pix);
280  int height = pixGetHeight(pix);
281  auto* win = new ScrollView("Imagedata", 100, 100,
282  2 * (width + 2 * kTextSize),
283  2 * (height + 4 * kTextSize),
284  width + 10, height + 3 * kTextSize, true);
285  win->Image(pix, 0, height - 1);
286  pixDestroy(&pix);
287  // Draw the boxes.
288  win->Pen(ScrollView::RED);
289  win->Brush(ScrollView::NONE);
290  int text_size = kTextSize;
291  if (!boxes_.empty() && boxes_[0].height() * 2 < text_size)
292  text_size = boxes_[0].height() * 2;
293  win->TextAttributes("Arial", text_size, false, false, false);
294  if (!boxes_.empty()) {
295  for (int b = 0; b < boxes_.size(); ++b) {
296  boxes_[b].plot(win);
297  win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
298  }
299  } else {
300  // The full transcription.
301  win->Pen(ScrollView::CYAN);
302  win->Text(0, height + kTextSize * 2, transcription_.c_str());
303  }
304  win->Update();
305  window_wait(win);
306 #endif
307 }
308 
309 // Adds the supplied boxes and transcriptions that correspond to the correct
310 // page number.
312  const GenericVector<STRING>& texts,
313  const GenericVector<int>& box_pages) {
314  // Copy the boxes and make the transcription.
315  for (int i = 0; i < box_pages.size(); ++i) {
316  if (page_number_ >= 0 && box_pages[i] != page_number_) continue;
317  transcription_ += texts[i];
318  boxes_.push_back(boxes[i]);
319  box_texts_.push_back(texts[i]);
320  }
321 }
322 
323 // Saves the given Pix as a PNG-encoded string and destroys it.
324 // In case of missing PNG support in Leptonica use PNM format,
325 // which requires more memory.
326 void ImageData::SetPixInternal(Pix* pix, GenericVector<char>* image_data) {
327  l_uint8* data;
328  size_t size;
329  l_int32 ret;
330  ret = pixWriteMem(&data, &size, pix, IFF_PNG);
331  if (ret) {
332  ret = pixWriteMem(&data, &size, pix, IFF_PNM);
333  }
334  pixDestroy(&pix);
335  image_data->resize_no_init(size);
336  memcpy(&(*image_data)[0], data, size);
337  lept_free(data);
338 }
339 
340 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
341 Pix* ImageData::GetPixInternal(const GenericVector<char>& image_data) {
342  Pix* pix = nullptr;
343  if (!image_data.empty()) {
344  // Convert the array to an image.
345  const auto* u_data =
346  reinterpret_cast<const unsigned char*>(&image_data[0]);
347  pix = pixReadMem(u_data, image_data.size());
348  }
349  return pix;
350 }
351 
352 // Parses the text string as a box file and adds any discovered boxes that
353 // match the page number. Returns false on error.
354 bool ImageData::AddBoxes(const char* box_text) {
355  if (box_text != nullptr && box_text[0] != '\0') {
357  GenericVector<STRING> texts;
358  GenericVector<int> box_pages;
359  if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
360  /*continue_on_failure*/ true, &boxes, &texts, nullptr,
361  &box_pages)) {
362  AddBoxes(boxes, texts, box_pages);
363  return true;
364  } else {
365  tprintf("Error: No boxes for page %d from image %s!\n",
366  page_number_, imagefilename_.c_str());
367  }
368  }
369  return false;
370 }
371 
373  : document_name_(name),
374  pages_offset_(-1),
375  total_pages_(-1),
376  memory_used_(0),
377  max_memory_(0),
378  reader_(nullptr) {}
379 
381  std::lock_guard<std::mutex> lock_p(pages_mutex_);
382  std::lock_guard<std::mutex> lock_g(general_mutex_);
383 }
384 
385 // Reads all the pages in the given lstmf filename to the cache. The reader
386 // is used to read the file.
387 bool DocumentData::LoadDocument(const char* filename, int start_page,
388  int64_t max_memory, FileReader reader) {
389  SetDocument(filename, max_memory, reader);
390  pages_offset_ = start_page;
391  return ReCachePages();
392 }
393 
394 // Sets up the document, without actually loading it.
395 void DocumentData::SetDocument(const char* filename, int64_t max_memory,
396  FileReader reader) {
397  std::lock_guard<std::mutex> lock_p(pages_mutex_);
398  std::lock_guard<std::mutex> lock(general_mutex_);
399  document_name_ = filename;
400  pages_offset_ = -1;
401  max_memory_ = max_memory;
402  reader_ = reader;
403 }
404 
405 // Writes all the pages to the given filename. Returns false on error.
406 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
407  std::lock_guard<std::mutex> lock(pages_mutex_);
408  TFile fp;
409  fp.OpenWrite(nullptr);
410  if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
411  tprintf("Serialize failed: %s\n", filename);
412  return false;
413  }
414  return true;
415 }
417  std::lock_guard<std::mutex> lock(pages_mutex_);
418  TFile fp;
419  fp.OpenWrite(buffer);
420  return pages_.Serialize(&fp);
421 }
422 
423 // Adds the given page data to this document, counting up memory.
425  std::lock_guard<std::mutex> lock(pages_mutex_);
426  pages_.push_back(page);
427  set_memory_used(memory_used() + page->MemoryUsed());
428 }
429 
430 // If the given index is not currently loaded, loads it using a separate
431 // thread.
433  ImageData* page = nullptr;
434  if (IsPageAvailable(index, &page)) return;
435  std::lock_guard<std::mutex> lock(pages_mutex_);
436  if (pages_offset_ == index) return;
437  pages_offset_ = index;
438  pages_.clear();
439  std::thread t(&tesseract::DocumentData::ReCachePages, this);
440  t.detach();
441 }
442 
443 // Returns a pointer to the page with the given index, modulo the total
444 // number of pages. Blocks until the background load is completed.
445 const ImageData* DocumentData::GetPage(int index) {
446  ImageData* page = nullptr;
447  while (!IsPageAvailable(index, &page)) {
448  // If there is no background load scheduled, schedule one now.
449  pages_mutex_.lock();
450  bool needs_loading = pages_offset_ != index;
451  pages_mutex_.unlock();
452  if (needs_loading) LoadPageInBackground(index);
453  // We can't directly load the page, or the background load will delete it
454  // while the caller is using it, so give it a chance to work.
455  std::this_thread::yield();
456  }
457  return page;
458 }
459 
460 // Returns true if the requested page is available, and provides a pointer,
461 // which may be nullptr if the document is empty. May block, even though it
462 // doesn't guarantee to return true.
463 bool DocumentData::IsPageAvailable(int index, ImageData** page) {
464  std::lock_guard<std::mutex> lock(pages_mutex_);
465  int num_pages = NumPages();
466  if (num_pages == 0 || index < 0) {
467  *page = nullptr; // Empty Document.
468  return true;
469  }
470  if (num_pages > 0) {
471  index = Modulo(index, num_pages);
472  if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
473  *page = pages_[index - pages_offset_]; // Page is available already.
474  return true;
475  }
476  }
477  return false;
478 }
479 
480 // Removes all pages from memory and frees the memory, but does not forget
481 // the document metadata.
483  std::lock_guard<std::mutex> lock(pages_mutex_);
484  int64_t memory_saved = memory_used();
485  pages_.clear();
486  pages_offset_ = -1;
487  set_total_pages(-1);
488  set_memory_used(0);
489  tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
490  document_name_.c_str(), memory_saved);
491  return memory_saved;
492 }
493 
494 // Shuffles all the pages in the document.
496  TRand random;
497  // Different documents get shuffled differently, but the same for the same
498  // name.
499  random.set_seed(document_name_.c_str());
500  int num_pages = pages_.size();
501  // Execute one random swap for each page in the document.
502  for (int i = 0; i < num_pages; ++i) {
503  int src = random.IntRand() % num_pages;
504  int dest = random.IntRand() % num_pages;
505  std::swap(pages_[src], pages_[dest]);
506  }
507 }
508 
509 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
510 // starting at index pages_offset_.
511 bool DocumentData::ReCachePages() {
512  std::lock_guard<std::mutex> lock(pages_mutex_);
513  // Read the file.
514  set_total_pages(0);
515  set_memory_used(0);
516  int loaded_pages = 0;
517  pages_.truncate(0);
518  TFile fp;
519  if (!fp.Open(document_name_, reader_) ||
520  !PointerVector<ImageData>::DeSerializeSize(&fp, &loaded_pages) ||
521  loaded_pages <= 0) {
522  tprintf("Deserialize header failed: %s\n", document_name_.c_str());
523  return false;
524  }
525  pages_offset_ %= loaded_pages;
526  // Skip pages before the first one we want, and load the rest until max
527  // memory and skip the rest after that.
528  int page;
529  for (page = 0; page < loaded_pages; ++page) {
530  if (page < pages_offset_ ||
531  (max_memory_ > 0 && memory_used() > max_memory_)) {
533  tprintf("Deserializeskip failed\n");
534  break;
535  }
536  } else {
537  if (!pages_.DeSerializeElement(&fp)) break;
538  ImageData* image_data = pages_.back();
539  if (image_data->imagefilename().length() == 0) {
540  image_data->set_imagefilename(document_name_);
541  image_data->set_page_number(page);
542  }
543  set_memory_used(memory_used() + image_data->MemoryUsed());
544  }
545  }
546  if (page < loaded_pages) {
547  tprintf("Deserialize failed: %s read %d/%d lines\n",
548  document_name_.c_str(), page, loaded_pages);
549  pages_.truncate(0);
550  } else {
551  tprintf("Loaded %d/%d lines (%d-%d) of document %s\n", pages_.size(),
552  loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
553  document_name_.c_str());
554  }
555  set_total_pages(loaded_pages);
556  return !pages_.empty();
557 }
558 
559 // A collection of DocumentData that knows roughly how much memory it is using.
560 DocumentCache::DocumentCache(int64_t max_memory)
561  : num_pages_per_doc_(0), max_memory_(max_memory) {}
563 
564 // Adds all the documents in the list of filenames, counting memory.
565 // The reader is used to read the files.
567  CachingStrategy cache_strategy,
568  FileReader reader) {
569  cache_strategy_ = cache_strategy;
570  int64_t fair_share_memory = 0;
571  // In the round-robin case, each DocumentData handles restricting its content
572  // to its fair share of memory. In the sequential case, DocumentCache
573  // determines which DocumentDatas are held entirely in memory.
574  if (cache_strategy_ == CS_ROUND_ROBIN)
575  fair_share_memory = max_memory_ / filenames.size();
576  for (int arg = 0; arg < filenames.size(); ++arg) {
577  STRING filename = filenames[arg];
578  auto* document = new DocumentData(filename);
579  document->SetDocument(filename.c_str(), fair_share_memory, reader);
580  AddToCache(document);
581  }
582  if (!documents_.empty()) {
583  // Try to get the first page now to verify the list of filenames.
584  if (GetPageBySerial(0) != nullptr) return true;
585  tprintf("Load of page 0 failed!\n");
586  }
587  return false;
588 }
589 
590 // Adds document to the cache.
592  documents_.push_back(data);
593  return true;
594 }
595 
596 // Finds and returns a document by name.
597 DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
598  for (int i = 0; i < documents_.size(); ++i) {
599  if (documents_[i]->document_name() == document_name)
600  return documents_[i];
601  }
602  return nullptr;
603 }
604 
605 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
606 // strategy, could take a long time.
608  if (cache_strategy_ == CS_SEQUENTIAL) {
609  // In sequential mode, we assume each doc has the same number of pages
610  // whether it is true or not.
611  if (num_pages_per_doc_ == 0) GetPageSequential(0);
612  return num_pages_per_doc_ * documents_.size();
613  }
614  int total_pages = 0;
615  int num_docs = documents_.size();
616  for (int d = 0; d < num_docs; ++d) {
617  // We have to load a page to make NumPages() valid.
618  documents_[d]->GetPage(0);
619  total_pages += documents_[d]->NumPages();
620  }
621  return total_pages;
622 }
623 
624 // Returns a page by serial number, selecting them in a round-robin fashion
625 // from all the documents. Highly disk-intensive, but doesn't need samples
626 // to be shuffled between files to begin with.
627 const ImageData* DocumentCache::GetPageRoundRobin(int serial) {
628  int num_docs = documents_.size();
629  int doc_index = serial % num_docs;
630  const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs);
631  for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
632  doc_index = (serial + offset) % num_docs;
633  int page = (serial + offset) / num_docs;
634  documents_[doc_index]->LoadPageInBackground(page);
635  }
636  return doc;
637 }
638 
639 // Returns a page by serial number, selecting them in sequence from each file.
640 // Requires the samples to be shuffled between the files to give a random or
641 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
642 const ImageData* DocumentCache::GetPageSequential(int serial) {
643  int num_docs = documents_.size();
644  ASSERT_HOST(num_docs > 0);
645  if (num_pages_per_doc_ == 0) {
646  // Use the pages in the first doc as the number of pages in each doc.
647  documents_[0]->GetPage(0);
648  num_pages_per_doc_ = documents_[0]->NumPages();
649  if (num_pages_per_doc_ == 0) {
650  tprintf("First document cannot be empty!!\n");
651  ASSERT_HOST(num_pages_per_doc_ > 0);
652  }
653  // Get rid of zero now if we don't need it.
654  if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache();
655  }
656  int doc_index = serial / num_pages_per_doc_ % num_docs;
657  const ImageData* doc =
658  documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
659  // Count up total memory. Background loading makes it more complicated to
660  // keep a running count.
661  int64_t total_memory = 0;
662  for (int d = 0; d < num_docs; ++d) {
663  total_memory += documents_[d]->memory_used();
664  }
665  if (total_memory >= max_memory_) {
666  // Find something to un-cache.
667  // If there are more than 3 in front, then serial is from the back reader
668  // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
669  // we create a hole between them and then un-caching the backmost occupied
670  // will work for both.
671  int num_in_front = CountNeighbourDocs(doc_index, 1);
672  for (int offset = num_in_front - 2;
673  offset > 1 && total_memory >= max_memory_; --offset) {
674  int next_index = (doc_index + offset) % num_docs;
675  total_memory -= documents_[next_index]->UnCache();
676  }
677  // If that didn't work, the best solution is to un-cache from the back. If
678  // we take away the document that a 2nd reader is using, it will put it
679  // back and make a hole between.
680  int num_behind = CountNeighbourDocs(doc_index, -1);
681  for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
682  ++offset) {
683  int next_index = (doc_index + offset + num_docs) % num_docs;
684  total_memory -= documents_[next_index]->UnCache();
685  }
686  }
687  int next_index = (doc_index + 1) % num_docs;
688  if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
689  documents_[next_index]->LoadPageInBackground(0);
690  }
691  return doc;
692 }
693 
694 // Helper counts the number of adjacent cached neighbours of index looking in
695 // direction dir, ie index+dir, index+2*dir etc.
696 int DocumentCache::CountNeighbourDocs(int index, int dir) {
697  int num_docs = documents_.size();
698  for (int offset = dir; abs(offset) < num_docs; offset += dir) {
699  int offset_index = (index + offset + num_docs) % num_docs;
700  if (!documents_[offset_index]->IsCached()) return offset - dir;
701  }
702  return num_docs;
703 }
704 
705 } // namespace tesseract.
tesseract::DocumentData::Shuffle
void Shuffle()
Definition: imagedata.cpp:495
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
tesseract::WordFeature::Serialize
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:82
ScrollView
Definition: scrollview.h:97
tesseract::PointerVector::DeSerializeSize
static bool DeSerializeSize(TFile *fp, int32_t *size)
Definition: genericvector.h:592
tesseract::FileWriter
bool(*)(const GenericVector< char > &data, const char *filename) FileWriter
Definition: serialis.h:51
tesseract::DocumentCache::FindDocument
DocumentData * FindDocument(const STRING &document_name) const
Definition: imagedata.cpp:597
tesseract::ImageData::Serialize
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:162
tesseract::ImageData::PreScale
Pix * PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, GenericVector< TBOX > *boxes) const
Definition: imagedata.cpp:224
tesseract::DocumentData::IsPageAvailable
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:463
boxread.h
tesseract::PointerVector::DeSerializeSkip
static bool DeSerializeSkip(TFile *fp)
Definition: genericvector.h:616
tesseract::CS_ROUND_ROBIN
Definition: imagedata.h:53
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::DocumentData::NumPages
int NumPages() const
Definition: imagedata.h:232
FCOORD::y
float y() const
Definition: points.h:209
tesseract::WordFeature::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:89
tesseract::FloatWordFeature::x
float x
Definition: imagedata.h:93
tesseract::DocumentCache::LoadDocuments
bool LoadDocuments(const GenericVector< STRING > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:566
tesseract::ImageData::AddBoxes
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:311
FCOORD::x
float x() const
Definition: points.h:206
tesseract::TRand::IntRand
int32_t IntRand()
Definition: helpers.h:80
tesseract::WordFeature::dir
int dir() const
Definition: imagedata.h:71
STRING
Definition: strngs.h:45
tesseract::DocumentData::UnCache
int64_t UnCache()
Definition: imagedata.cpp:482
ScrollView::CYAN
Definition: scrollview.h:107
tesseract::DocumentData::~DocumentData
~DocumentData()
Definition: imagedata.cpp:380
ScrollView::NONE
Definition: scrollview.h:101
GenericVector::Serialize
bool Serialize(FILE *fp) const
Definition: genericvector.h:929
tesseract::FileReader
bool(*)(const char *filename, GenericVector< char > *data) FileReader
Definition: serialis.h:47
ScrollView::DrawTo
void DrawTo(int x, int y)
Definition: scrollview.cpp:524
tesseract::FloatWordFeature::FromWordFeatures
static void FromWordFeatures(const GenericVector< WordFeature > &word_features, GenericVector< FloatWordFeature > *float_features)
Definition: imagedata.cpp:96
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
tesseract::CS_SEQUENTIAL
Definition: imagedata.h:48
rect.h
FCOORD
Definition: points.h:187
tesseract::ImageData
Definition: imagedata.h:104
tesseract::DocumentData::GetPage
const ImageData * GetPage(int index)
Definition: imagedata.cpp:445
STRING::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:157
STRING::Serialize
bool Serialize(FILE *fp) const
Definition: strngs.cpp:144
tesseract::DocumentData::SetDocument
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:395
tesseract::TFile::Open
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:210
kMaxReadAhead
const int kMaxReadAhead
Definition: imagedata.cpp:40
tesseract::DocumentCache::~DocumentCache
~DocumentCache()
Definition: imagedata.cpp:562
GenericVector::DeSerializeClasses
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: genericvector.h:1038
tesseract::DocumentData::SaveDocument
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:406
tesseract::ImageData::boxes
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:149
tesseract::DocumentCache::GetPageBySerial
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:343
tesseract::ImageData::SkipDeSerialize
static bool SkipDeSerialize(TFile *fp)
Definition: imagedata.cpp:193
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
GenericVector::SerializeClasses
bool SerializeClasses(FILE *fp) const
Definition: genericvector.h:1008
tesseract::TFile::CloseWrite
bool CloseWrite(const STRING &filename, FileWriter writer)
Definition: serialis.cpp:324
tesseract::ImageData::~ImageData
~ImageData()
Definition: imagedata.cpp:126
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::DocumentCache::TotalPages
int TotalPages()
Definition: imagedata.cpp:607
tesseract::TFile::DeSerialize
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:117
GenericVector::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: genericvector.h:954
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
tesseract::DocumentData::memory_used
int64_t memory_used() const
Definition: imagedata.h:239
tesseract::TFile::Serialize
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:161
tesseract::ImageData::box_text
const STRING & box_text(int index) const
Definition: imagedata.h:155
tesseract::DocumentCache::AddToCache
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:591
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65
tesseract::TFile
Definition: serialis.h:75
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::ImageData::image_data
const GenericVector< char > & image_data() const
Definition: imagedata.h:137
tesseract::WordFeature::ComputeSize
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:54
tesseract::FloatWordFeature::x_bucket
int x_bucket
Definition: imagedata.h:96
ReadMemBoxes
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:87
helpers.h
tesseract
Definition: baseapi.h:65
tesseract::ImageData::page_number
int page_number() const
Definition: imagedata.h:131
tesseract::FloatWordFeature::y
float y
Definition: imagedata.h:94
tesseract::WordFeature::WordFeature
WordFeature()
Definition: imagedata.cpp:44
tesseract::ImageData::ImageData
ImageData()
Definition: imagedata.cpp:119
tesseract::DocumentData::AddPageToDocument
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:424
ScrollView::RED
Definition: scrollview.h:104
tesseract::WordFeature::Draw
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:65
tesseract::DocumentData::LoadDocument
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:387
tprintf.h
callcpp.h
GenericVector
Definition: baseapi.h:40
tesseract::ImageData::DeSerialize
bool DeSerialize(TFile *fp)
Definition: imagedata.cpp:177
tesseract::DocumentData::LoadPageInBackground
void LoadPageInBackground(int index)
Definition: imagedata.cpp:432
tesseract::TRand::set_seed
void set_seed(uint64_t seed)
Definition: helpers.h:70
imagedata.h
STRING::SkipDeSerialize
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:177
tesseract::FloatWordFeature::dir
float dir
Definition: imagedata.h:95
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
tesseract::ImageData::MemoryUsed
int MemoryUsed() const
Definition: imagedata.cpp:268
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
tesseract::CachingStrategy
CachingStrategy
Definition: imagedata.h:41
tesseract::ImageData::Build
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:131
tesseract::DocumentData::DocumentData
DocumentData(const STRING &name)
Definition: imagedata.cpp:372
Modulo
int Modulo(int a, int b)
Definition: helpers.h:156
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::DocumentData
Definition: imagedata.h:208
ScrollView::SetCursor
void SetCursor(int x, int y)
Definition: scrollview.cpp:518
serialis.h
ReverseN
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:183
tesseract::ImageData::Display
void Display() const
Definition: imagedata.cpp:273
tesseract::DocumentCache::DocumentCache
DocumentCache(int64_t max_memory)
Definition: imagedata.cpp:560
tesseract::DeSerialize
bool DeSerialize(FILE *fp, char *data, size_t n=1)
Definition: serialis.cpp:41
tesseract::DocumentData::SaveToBuffer
bool SaveToBuffer(GenericVector< char > *buffer)
Definition: imagedata.cpp:416
tesseract::FloatWordFeature
Definition: imagedata.h:87
tesseract::FloatWordFeature::SortByXBucket
static int SortByXBucket(const void *, const void *)
Definition: imagedata.cpp:111
tesseract::TRand
Definition: helpers.h:50
GenericVector::size
int size() const
Definition: genericvector.h:71
window_wait
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
scrollview.h
tesseract::ImageData::GetPix
Pix * GetPix() const
Definition: imagedata.cpp:214
TBOX::scale
void scale(const float f)
Definition: rect.h:174
tesseract::ImageData::SetPix
void SetPix(Pix *pix)
Definition: imagedata.cpp:209
TBOX
Definition: rect.h:33