tesseract  5.0.0-alpha-619-ge9db
imagedata.h
Go to the documentation of this file.
1 // File: imagedata.h
3 // Description: Class to hold information about a single image and its
4 // corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
20 #define TESSERACT_IMAGE_IMAGEDATA_H_
21 
22 #include <mutex> // for std::mutex
23 #include <tesseract/genericvector.h> // for GenericVector, PointerVector, FileReader
24 #include "points.h" // for FCOORD
25 #include <tesseract/strngs.h> // for STRING
26 
27 class ScrollView;
28 class TBOX;
29 struct Pix;
30 
31 namespace tesseract {
32 
33 class TFile;
34 
// Amount of padding to apply in output pixels in feature mode.
constexpr int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
constexpr int kImagePadding = 4;
39 
// Enum to determine the caching and data sequencing strategy.
enum CachingStrategy {
  // Reads all of one file before moving on to the next. Requires samples to be
  // shuffled across files. Uses the count of samples in the first file as
  // the count in all the files to achieve high-speed random access. As a
  // consequence, if subsequent files are smaller, they get entries used more
  // than once, and if subsequent files are larger, some entries are not used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL,
  // Reads one sample from each file in rotation. Does not require shuffled
  // samples, but is extremely disk-intensive. Samples in smaller files also
  // get used more often than samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN,
};
55 
56 class WordFeature {
57  public:
58  WordFeature();
59  WordFeature(const FCOORD& fcoord, uint8_t dir);
60 
61  // Computes the maximum x and y value in the features.
62  static void ComputeSize(const GenericVector<WordFeature>& features,
63  int* max_x, int* max_y);
64  // Draws the features in the given window.
65  static void Draw(const GenericVector<WordFeature>& features,
66  ScrollView* window);
67 
68  // Accessors.
69  int x() const { return x_; }
70  int y() const { return y_; }
71  int dir() const { return dir_; }
72 
73  // Writes to the given file. Returns false in case of error.
74  bool Serialize(FILE* fp) const;
75  // Reads from the given file. Returns false in case of error.
76  // If swap is true, assumes a big/little-endian swap is needed.
77  bool DeSerialize(bool swap, FILE* fp);
78 
79  private:
80  int16_t x_;
81  uint8_t y_;
82  uint8_t dir_;
83 };
84 
85 // A floating-point version of WordFeature, used as an intermediate during
86 // scaling.
88  static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
89  GenericVector<FloatWordFeature>* float_features);
90  // Sort function to sort first by x-bucket, then by y.
91  static int SortByXBucket(const void*, const void*);
92 
93  float x;
94  float y;
95  float dir;
96  int x_bucket;
97 };
98 
99 // Class to hold information on a single image:
100 // Filename, cached image as a Pix*, character boxes, text transcription.
101 // The text transcription is the ground truth UTF-8 text for the image.
102 // Character boxes are optional and indicate the desired segmentation of
103 // the text into recognition units.
104 class ImageData {
105  public:
106  ImageData();
107  // Takes ownership of the pix.
108  ImageData(bool vertical, Pix* pix);
109  ~ImageData();
110 
111  // Builds and returns an ImageData from the basic data. Note that imagedata,
112  // truth_text, and box_text are all the actual file data, NOT filenames.
113  static ImageData* Build(const char* name, int page_number, const char* lang,
114  const char* imagedata, int imagedatasize,
115  const char* truth_text, const char* box_text);
116 
117  // Writes to the given file. Returns false in case of error.
118  bool Serialize(TFile* fp) const;
119  // Reads from the given file. Returns false in case of error.
120  bool DeSerialize(TFile* fp);
121  // As DeSerialize, but only seeks past the data - hence a static method.
122  static bool SkipDeSerialize(TFile* fp);
123 
124  // Other accessors.
125  const STRING& imagefilename() const {
126  return imagefilename_;
127  }
128  void set_imagefilename(const STRING& name) {
129  imagefilename_ = name;
130  }
131  int page_number() const {
132  return page_number_;
133  }
134  void set_page_number(int num) {
135  page_number_ = num;
136  }
138  return image_data_;
139  }
140  const STRING& language() const {
141  return language_;
142  }
143  void set_language(const STRING& lang) {
144  language_ = lang;
145  }
146  const STRING& transcription() const {
147  return transcription_;
148  }
149  const GenericVector<TBOX>& boxes() const {
150  return boxes_;
151  }
153  return box_texts_;
154  }
155  const STRING& box_text(int index) const {
156  return box_texts_[index];
157  }
158  // Saves the given Pix as a PNG-encoded string and destroys it.
159  // In case of missing PNG support in Leptonica use PNM format,
160  // which requires more memory.
161  void SetPix(Pix* pix);
162  // Returns the Pix image for *this. Must be pixDestroyed after use.
163  Pix* GetPix() const;
164  // Gets anything and everything with a non-nullptr pointer, prescaled to a
165  // given target_height (if 0, then the original image height), and aligned.
166  // Also returns (if not nullptr) the width and height of the scaled image.
167  // The return value is the scaled Pix, which must be pixDestroyed after use,
168  // and scale_factor (if not nullptr) is set to the scale factor that was applied
169  // to the image to achieve the target_height.
170  Pix* PreScale(int target_height, int max_height, float* scale_factor,
171  int* scaled_width, int* scaled_height,
172  GenericVector<TBOX>* boxes) const;
173 
174  int MemoryUsed() const;
175 
176  // Draws the data in a new window.
177  void Display() const;
178 
179  // Adds the supplied boxes and transcriptions that correspond to the correct
180  // page number.
181  void AddBoxes(const GenericVector<TBOX>& boxes,
182  const GenericVector<STRING>& texts,
183  const GenericVector<int>& box_pages);
184 
185  private:
186  // Saves the given Pix as a PNG-encoded string and destroys it.
187  // In case of missing PNG support in Leptonica use PNM format,
188  // which requires more memory.
189  static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
190  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
191  static Pix* GetPixInternal(const GenericVector<char>& image_data);
192  // Parses the text string as a box file and adds any discovered boxes that
193  // match the page number. Returns false on error.
194  bool AddBoxes(const char* box_text);
195 
196  private:
197  STRING imagefilename_; // File to read image from.
198  int32_t page_number_; // Page number if multi-page tif or -1.
199  GenericVector<char> image_data_; // PNG/PNM file data.
200  STRING language_; // Language code for image.
201  STRING transcription_; // UTF-8 ground truth of image.
202  GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
203  GenericVector<STRING> box_texts_; // String for text in each box.
204  bool vertical_text_; // Image has been rotated from vertical.
205 };
206 
207 // A collection of ImageData that knows roughly how much memory it is using.
209  friend void* ReCachePagesFunc(void* data);
210 
211  public:
212  explicit DocumentData(const STRING& name);
213  ~DocumentData();
214 
215  // Reads all the pages in the given lstmf filename to the cache. The reader
216  // is used to read the file.
217  bool LoadDocument(const char* filename, int start_page, int64_t max_memory,
218  FileReader reader);
219  // Sets up the document, without actually loading it.
220  void SetDocument(const char* filename, int64_t max_memory, FileReader reader);
221  // Writes all the pages to the given filename. Returns false on error.
222  bool SaveDocument(const char* filename, FileWriter writer);
223  bool SaveToBuffer(GenericVector<char>* buffer);
224 
225  // Adds the given page data to this document, counting up memory.
226  void AddPageToDocument(ImageData* page);
227 
228  const STRING& document_name() const {
229  std::lock_guard<std::mutex> lock(general_mutex_);
230  return document_name_;
231  }
232  int NumPages() const {
233  std::lock_guard<std::mutex> lock(general_mutex_);
234  return total_pages_;
235  }
236  size_t PagesSize() const {
237  return pages_.size();
238  }
239  int64_t memory_used() const {
240  std::lock_guard<std::mutex> lock(general_mutex_);
241  return memory_used_;
242  }
243  // If the given index is not currently loaded, loads it using a separate
244  // thread. Note: there are 4 cases:
245  // Document uncached: IsCached() returns false, total_pages_ < 0.
246  // Required page is available: IsPageAvailable returns true. In this case,
247  // total_pages_ > 0 and
248  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
249  // Pages are loaded, but the required one is not.
250  // The requested page is being loaded by LoadPageInBackground. In this case,
251  // index == pages_offset_. Once the loading starts, the pages lock is held
252  // until it completes, at which point IsPageAvailable will unblock and return
253  // true.
254  void LoadPageInBackground(int index);
255  // Returns a pointer to the page with the given index, modulo the total
256  // number of pages. Blocks until the background load is completed.
257  const ImageData* GetPage(int index);
258  // Returns true if the requested page is available, and provides a pointer,
259  // which may be nullptr if the document is empty. May block, even though it
260  // doesn't guarantee to return true.
261  bool IsPageAvailable(int index, ImageData** page);
262  // Takes ownership of the given page index. The page is made nullptr in *this.
263  ImageData* TakePage(int index) {
264  std::lock_guard<std::mutex> lock(pages_mutex_);
265  ImageData* page = pages_[index];
266  pages_[index] = nullptr;
267  return page;
268  }
269  // Returns true if the document is currently loaded or in the process of
270  // loading.
271  bool IsCached() const { return NumPages() >= 0; }
272  // Removes all pages from memory and frees the memory, but does not forget
273  // the document metadata. Returns the memory saved.
274  int64_t UnCache();
275  // Shuffles all the pages in the document.
276  void Shuffle();
277 
278  private:
279  // Sets the value of total_pages_ behind a mutex.
280  void set_total_pages(int total) {
281  std::lock_guard<std::mutex> lock(general_mutex_);
282  total_pages_ = total;
283  }
284  void set_memory_used(int64_t memory_used) {
285  std::lock_guard<std::mutex> lock(general_mutex_);
286  memory_used_ = memory_used;
287  }
288  // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
289  // starting at index pages_offset_.
290  bool ReCachePages();
291 
292  private:
293  // A name for this document.
294  STRING document_name_;
295  // A group of pages that corresponds in some loose way to a document.
296  PointerVector<ImageData> pages_;
297  // Page number of the first index in pages_.
298  int pages_offset_;
299  // Total number of pages in document (may exceed size of pages_.)
300  int total_pages_;
301  // Total of all pix sizes in the document.
302  int64_t memory_used_;
303  // Max memory to use at any time.
304  int64_t max_memory_;
305  // Saved reader from LoadDocument to allow re-caching.
306  FileReader reader_;
307  // Mutex that protects pages_ and pages_offset_ against multiple parallel
308  // loads, and provides a wait for page.
309  std::mutex pages_mutex_;
310  // Mutex that protects other data members that callers want to access without
311  // waiting for a load operation.
312  mutable std::mutex general_mutex_;
313 };
314 
315 // A collection of DocumentData that knows roughly how much memory it is using.
316 // Note that while it supports background read-ahead, it assumes that a single
317 // thread is accessing documents, ie it is not safe for multiple threads to
318 // access different documents in parallel, as one may de-cache the other's
319 // content.
321  public:
322  explicit DocumentCache(int64_t max_memory);
323  ~DocumentCache();
324 
325  // Deletes all existing documents from the cache.
326  void Clear() {
327  documents_.clear();
328  num_pages_per_doc_ = 0;
329  }
330  // Adds all the documents in the list of filenames, counting memory.
331  // The reader is used to read the files.
332  bool LoadDocuments(const GenericVector<STRING>& filenames,
333  CachingStrategy cache_strategy, FileReader reader);
334 
335  // Adds document to the cache.
336  bool AddToCache(DocumentData* data);
337 
338  // Finds and returns a document by name.
339  DocumentData* FindDocument(const STRING& document_name) const;
340 
341  // Returns a page by serial number using the current cache_strategy_ to
342  // determine the mapping from serial number to page.
343  const ImageData* GetPageBySerial(int serial) {
344  if (cache_strategy_ == CS_SEQUENTIAL)
345  return GetPageSequential(serial);
346  else
347  return GetPageRoundRobin(serial);
348  }
349 
351  return documents_;
352  }
353  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
354  // strategy, could take a long time.
355  int TotalPages();
356 
357  private:
358  // Returns a page by serial number, selecting them in a round-robin fashion
359  // from all the documents. Highly disk-intensive, but doesn't need samples
360  // to be shuffled between files to begin with.
361  const ImageData* GetPageRoundRobin(int serial);
362  // Returns a page by serial number, selecting them in sequence from each file.
363  // Requires the samples to be shuffled between the files to give a random or
364  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
365  const ImageData* GetPageSequential(int serial);
366 
367  // Helper counts the number of adjacent cached neighbour documents_ of index
368  // looking in direction dir, ie index+dir, index+2*dir etc.
369  int CountNeighbourDocs(int index, int dir);
370 
371  // A group of pages that corresponds in some loose way to a document.
372  PointerVector<DocumentData> documents_;
373  // Strategy to use for caching and serializing data samples.
374  CachingStrategy cache_strategy_;
375  // Number of pages in the first document, used as a divisor in
376  // GetPageSequential to determine the document index.
377  int num_pages_per_doc_;
378  // Max memory allowed in this cache.
379  int64_t max_memory_;
380 };
381 
382 } // namespace tesseract
383 
384 
385 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
tesseract::DocumentData::document_name
const STRING & document_name() const
Definition: imagedata.h:228
tesseract::DocumentData::Shuffle
void Shuffle()
Definition: imagedata.cpp:495
tesseract::WordFeature::Serialize
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:82
ScrollView
Definition: scrollview.h:97
strngs.h
tesseract::FileWriter
bool(*)(const GenericVector< char > &data, const char *filename) FileWriter
Definition: serialis.h:51
tesseract::DocumentCache::FindDocument
DocumentData * FindDocument(const STRING &document_name) const
Definition: imagedata.cpp:597
tesseract::ImageData::Serialize
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:162
tesseract::WordFeature::x
int x() const
Definition: imagedata.h:69
tesseract::ImageData::PreScale
Pix * PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, GenericVector< TBOX > *boxes) const
Definition: imagedata.cpp:224
tesseract::DocumentData::IsPageAvailable
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:463
tesseract::ImageData::set_imagefilename
void set_imagefilename(const STRING &name)
Definition: imagedata.h:128
tesseract::CS_ROUND_ROBIN
Definition: imagedata.h:53
tesseract::DocumentData::NumPages
int NumPages() const
Definition: imagedata.h:232
tesseract::DocumentData::PagesSize
size_t PagesSize() const
Definition: imagedata.h:236
tesseract::WordFeature
Definition: imagedata.h:56
tesseract::WordFeature::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:89
tesseract::FloatWordFeature::x
float x
Definition: imagedata.h:93
tesseract::DocumentCache::LoadDocuments
bool LoadDocuments(const GenericVector< STRING > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:566
tesseract::DocumentData::ReCachePagesFunc
friend void * ReCachePagesFunc(void *data)
tesseract::ImageData::AddBoxes
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:311
tesseract::PointerVector
Definition: genericvector.h:417
tesseract::WordFeature::dir
int dir() const
Definition: imagedata.h:71
STRING
Definition: strngs.h:45
tesseract::DocumentCache
Definition: imagedata.h:320
tesseract::DocumentData::UnCache
int64_t UnCache()
Definition: imagedata.cpp:482
tesseract::DocumentData::~DocumentData
~DocumentData()
Definition: imagedata.cpp:380
tesseract::FileReader
bool(*)(const char *filename, GenericVector< char > *data) FileReader
Definition: serialis.h:47
tesseract::ImageData::box_texts
const GenericVector< STRING > & box_texts() const
Definition: imagedata.h:152
tesseract::FloatWordFeature::FromWordFeatures
static void FromWordFeatures(const GenericVector< WordFeature > &word_features, GenericVector< FloatWordFeature > *float_features)
Definition: imagedata.cpp:96
tesseract::ImageData::imagefilename
const STRING & imagefilename() const
Definition: imagedata.h:125
tesseract::kFeaturePadding
const int kFeaturePadding
Definition: imagedata.h:36
tesseract::CS_SEQUENTIAL
Definition: imagedata.h:48
FCOORD
Definition: points.h:187
tesseract::ImageData
Definition: imagedata.h:104
tesseract::DocumentData::GetPage
const ImageData * GetPage(int index)
Definition: imagedata.cpp:445
tesseract::DocumentData::SetDocument
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:395
tesseract::DocumentCache::~DocumentCache
~DocumentCache()
Definition: imagedata.cpp:562
tesseract::DocumentData::SaveDocument
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:406
genericvector.h
tesseract::ImageData::boxes
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:149
tesseract::DocumentCache::GetPageBySerial
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:343
tesseract::ImageData::SkipDeSerialize
static bool SkipDeSerialize(TFile *fp)
Definition: imagedata.cpp:193
tesseract::ImageData::~ImageData
~ImageData()
Definition: imagedata.cpp:126
tesseract::kImagePadding
const int kImagePadding
Definition: imagedata.h:38
tesseract::ImageData::transcription
const STRING & transcription() const
Definition: imagedata.h:146
tesseract::DocumentCache::TotalPages
int TotalPages()
Definition: imagedata.cpp:607
tesseract::DocumentData::memory_used
int64_t memory_used() const
Definition: imagedata.h:239
tesseract::ImageData::box_text
const STRING & box_text(int index) const
Definition: imagedata.h:155
tesseract::DocumentCache::AddToCache
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:591
tesseract::TFile
Definition: serialis.h:75
tesseract::ImageData::image_data
const GenericVector< char > & image_data() const
Definition: imagedata.h:137
tesseract::WordFeature::ComputeSize
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:54
tesseract::FloatWordFeature::x_bucket
int x_bucket
Definition: imagedata.h:96
tesseract
Definition: baseapi.h:65
tesseract::ImageData::page_number
int page_number() const
Definition: imagedata.h:131
tesseract::FloatWordFeature::y
float y
Definition: imagedata.h:94
tesseract::WordFeature::WordFeature
WordFeature()
Definition: imagedata.cpp:44
tesseract::ImageData::ImageData
ImageData()
Definition: imagedata.cpp:119
tesseract::DocumentData::AddPageToDocument
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:424
tesseract::WordFeature::Draw
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:65
tesseract::DocumentData::LoadDocument
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:387
GenericVector
Definition: baseapi.h:40
tesseract::ImageData::DeSerialize
bool DeSerialize(TFile *fp)
Definition: imagedata.cpp:177
tesseract::DocumentData::LoadPageInBackground
void LoadPageInBackground(int index)
Definition: imagedata.cpp:432
tesseract::WordFeature::y
int y() const
Definition: imagedata.h:70
tesseract::FloatWordFeature::dir
float dir
Definition: imagedata.h:95
tesseract::ImageData::MemoryUsed
int MemoryUsed() const
Definition: imagedata.cpp:268
tesseract::CachingStrategy
CachingStrategy
Definition: imagedata.h:41
tesseract::ImageData::Build
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:131
tesseract::ImageData::language
const STRING & language() const
Definition: imagedata.h:140
tesseract::DocumentData::DocumentData
DocumentData(const STRING &name)
Definition: imagedata.cpp:372
tesseract::ImageData::set_page_number
void set_page_number(int num)
Definition: imagedata.h:134
tesseract::ImageData::set_language
void set_language(const STRING &lang)
Definition: imagedata.h:143
tesseract::DocumentData
Definition: imagedata.h:208
tesseract::DocumentCache::Clear
void Clear()
Definition: imagedata.h:326
tesseract::ImageData::Display
void Display() const
Definition: imagedata.cpp:273
tesseract::DocumentCache::DocumentCache
DocumentCache(int64_t max_memory)
Definition: imagedata.cpp:560
tesseract::DocumentData::SaveToBuffer
bool SaveToBuffer(GenericVector< char > *buffer)
Definition: imagedata.cpp:416
tesseract::FloatWordFeature
Definition: imagedata.h:87
tesseract::FloatWordFeature::SortByXBucket
static int SortByXBucket(const void *, const void *)
Definition: imagedata.cpp:111
tesseract::DocumentCache::documents
const PointerVector< DocumentData > & documents() const
Definition: imagedata.h:350
tesseract::ImageData::GetPix
Pix * GetPix() const
Definition: imagedata.cpp:214
tesseract::DocumentData::TakePage
ImageData * TakePage(int index)
Definition: imagedata.h:263
tesseract::ImageData::SetPix
void SetPix(Pix *pix)
Definition: imagedata.cpp:209
tesseract::DocumentData::IsCached
bool IsCached() const
Definition: imagedata.h:271
points.h
TBOX
Definition: rect.h:33