tesseract  5.0.0-alpha-619-ge9db
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config_auto.h"
21 #endif
22 
23 #include "tessdatamanager.h"
24 
25 #include <cstdio>
26 #include <string>
27 
28 #if defined(HAVE_LIBARCHIVE)
29 #include <archive.h>
30 #include <archive_entry.h>
31 #endif
32 
33 #include "errcode.h"
34 #include <tesseract/helpers.h>
35 #include <tesseract/serialis.h>
36 #include <tesseract/strngs.h>
37 #include "tprintf.h"
38 #include "params.h"
39 
40 namespace tesseract {
41 
42 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
43  SetVersionString(PACKAGE_VERSION);
44 }
45 
47  : reader_(reader),
48  is_loaded_(false),
49  swap_(false) {
50  SetVersionString(PACKAGE_VERSION);
51 }
52 
53 // Lazily loads from the the given filename. Won't actually read the file
54 // until it needs it.
55 void TessdataManager::LoadFileLater(const char *data_file_name) {
56  Clear();
57  data_file_name_ = data_file_name;
58 }
59 
60 #if defined(HAVE_LIBARCHIVE)
61 bool TessdataManager::LoadArchiveFile(const char *filename) {
62  bool result = false;
63  archive *a = archive_read_new();
64  if (a != nullptr) {
65  archive_read_support_filter_all(a);
66  archive_read_support_format_all(a);
67  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
68  archive_entry *ae;
69  while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
70  const char *component = archive_entry_pathname(ae);
71  if (component != nullptr) {
73  if (TessdataTypeFromFileName(component, &type)) {
74  int64_t size = archive_entry_size(ae);
75  if (size > 0) {
76  entries_[type].resize_no_init(size);
77  if (archive_read_data(a, &entries_[type][0], size) == size) {
78  is_loaded_ = true;
79  }
80  }
81  }
82  }
83  }
84  result = is_loaded_;
85 #if defined(DEBUG)
86  } else {
87  tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
88  filename, strerror(archive_errno(a)));
89 #endif
90  }
91  archive_read_free(a);
92  }
93  return result;
94 }
95 #endif
96 
97 bool TessdataManager::Init(const char *data_file_name) {
99  if (reader_ == nullptr) {
100 #if defined(HAVE_LIBARCHIVE)
101  if (LoadArchiveFile(data_file_name)) return true;
102 #endif
103  if (!LoadDataFromFile(data_file_name, &data)) return false;
104  } else {
105  if (!(*reader_)(data_file_name, &data)) return false;
106  }
107  return LoadMemBuffer(data_file_name, &data[0], data.size());
108 }
109 
110 // Loads from the given memory buffer as if a file.
111 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
112  int size) {
113  // TODO: This method supports only the proprietary file format.
114  Clear();
115  data_file_name_ = name;
116  TFile fp;
117  fp.Open(data, size);
118  uint32_t num_entries;
119  if (!fp.DeSerialize(&num_entries)) return false;
120  swap_ = num_entries > kMaxNumTessdataEntries;
121  fp.set_swap(swap_);
122  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
123  if (num_entries > kMaxNumTessdataEntries) return false;
124  GenericVector<int64_t> offset_table;
125  offset_table.resize_no_init(num_entries);
126  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
127  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
128  if (offset_table[i] >= 0) {
129  int64_t entry_size = size - offset_table[i];
130  unsigned j = i + 1;
131  while (j < num_entries && offset_table[j] == -1) ++j;
132  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
133  entries_[i].resize_no_init(entry_size);
134  if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
135  }
136  }
137  if (entries_[TESSDATA_VERSION].empty()) {
138  SetVersionString("Pre-4.0.0");
139  }
140  is_loaded_ = true;
141  return true;
142 }
143 
144 // Overwrites a single entry of the given type.
146  int size) {
147  is_loaded_ = true;
148  entries_[type].resize_no_init(size);
149  memcpy(&entries_[type][0], data, size);
150 }
151 
152 // Saves to the given filename.
153 bool TessdataManager::SaveFile(const STRING &filename,
154  FileWriter writer) const {
155  // TODO: This method supports only the proprietary file format.
156  ASSERT_HOST(is_loaded_);
157  GenericVector<char> data;
158  Serialize(&data);
159  if (writer == nullptr)
160  return SaveDataToFile(data, filename.c_str());
161  else
162  return (*writer)(data, filename.c_str());
163 }
164 
165 // Serializes to the given vector.
167  // TODO: This method supports only the proprietary file format.
168  ASSERT_HOST(is_loaded_);
169  // Compute the offset_table and total size.
170  int64_t offset_table[TESSDATA_NUM_ENTRIES];
171  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
172  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
173  if (entries_[i].empty()) {
174  offset_table[i] = -1;
175  } else {
176  offset_table[i] = offset;
177  offset += entries_[i].size();
178  }
179  }
180  data->init_to_size(offset, 0);
181  int32_t num_entries = TESSDATA_NUM_ENTRIES;
182  TFile fp;
183  fp.OpenWrite(data);
184  fp.Serialize(&num_entries);
185  fp.Serialize(&offset_table[0], countof(offset_table));
186  for (const auto& entry : entries_) {
187  if (!entry.empty()) {
188  fp.Serialize(&entry[0], entry.size());
189  }
190  }
191 }
192 
193 // Resets to the initial state, keeping the reader.
195  for (auto& entry : entries_) {
196  entry.clear();
197  }
198  is_loaded_ = false;
199 }
200 
201 // Prints a directory of contents.
203  tprintf("Version string:%s\n", VersionString().c_str());
204  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
205  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206  if (!entries_[i].empty()) {
207  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
208  entries_[i].size(), offset);
209  offset += entries_[i].size();
210  }
211  }
212 }
213 
214 // Opens the given TFile pointer to the given component type.
215 // Returns false in case of failure.
217  if (!is_loaded_ && !Init(data_file_name_.c_str())) return false;
218  const TessdataManager *const_this = this;
219  return const_this->GetComponent(type, fp);
220 }
221 
222 // As non-const version except it can't load the component if not already
223 // loaded.
225  ASSERT_HOST(is_loaded_);
226  if (entries_[type].empty()) return false;
227  fp->Open(&entries_[type][0], entries_[type].size());
228  fp->set_swap(swap_);
229  return true;
230 }
231 
232 // Returns the current version string.
234  return std::string(&entries_[TESSDATA_VERSION][0],
235  entries_[TESSDATA_VERSION].size());
236 }
237 
238 // Sets the version string to the given v_str.
240  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
241  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
242 }
243 
245  const char *language_data_path_prefix,
246  const char *output_filename) {
247  // Load individual tessdata components from files.
248  for (auto filesuffix : kTessdataFileSuffixes) {
250  ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
251  STRING filename = language_data_path_prefix;
252  filename += filesuffix;
253  FILE *fp = fopen(filename.c_str(), "rb");
254  if (fp != nullptr) {
255  fclose(fp);
256  if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
257  tprintf("Load of file %s failed!\n", filename.c_str());
258  return false;
259  }
260  }
261  }
262  is_loaded_ = true;
263 
264  // Make sure that the required components are present.
265  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
266  tprintf(
267  "Error: traineddata file must contain at least (a unicharset file"
268  "and inttemp) OR an lstm file.\n");
269  return false;
270  }
271  // Write updated data to the output traineddata file.
272  return SaveFile(output_filename, nullptr);
273 }
274 
276  const char *new_traineddata_filename,
277  char **component_filenames,
278  int num_new_components) {
279  // Open the files with the new components.
280  // TODO: This method supports only the proprietary file format.
281  for (int i = 0; i < num_new_components; ++i) {
283  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
284  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
285  tprintf("Failed to read component file:%s\n", component_filenames[i]);
286  return false;
287  }
288  }
289  }
290 
291  // Write updated data to the output traineddata file.
292  return SaveFile(new_traineddata_filename, nullptr);
293 }
294 
295 bool TessdataManager::ExtractToFile(const char *filename) {
297  ASSERT_HOST(
298  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
299  if (entries_[type].empty()) return false;
300  return SaveDataToFile(entries_[type], filename);
301 }
302 
303 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
304  TessdataType *type) {
305  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
306  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
307  *type = static_cast<TessdataType>(i);
308  return true;
309  }
310  }
311 #if defined(DEBUG)
312  tprintf("TessdataManager can't determine which tessdata"
313  " component is represented by %s\n", suffix);
314 #endif
315  return false;
316 }
317 
318 bool TessdataManager::TessdataTypeFromFileName(const char *filename,
319  TessdataType *type) {
320  // Get the file suffix (extension)
321  const char *suffix = strrchr(filename, '.');
322  if (suffix == nullptr || *(++suffix) == '\0') return false;
323  return TessdataTypeFromFileSuffix(suffix, type);
324 }
325 
326 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
strngs.h
tesseract::FileWriter
bool(*)(const GenericVector< char > &data, const char *filename) FileWriter
Definition: serialis.h:51
tesseract::TessdataManager::OverwriteComponents
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
Definition: tessdatamanager.cpp:275
tesseract::TessdataManager::SetVersionString
void SetVersionString(const std::string &v_str)
Definition: tessdatamanager.cpp:239
tesseract::TessdataManager
Definition: tessdatamanager.h:126
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::LoadDataFromFile
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
Definition: genericvector.h:341
params.h
tesseract::TessdataManager::VersionString
std::string VersionString() const
Definition: tessdatamanager.cpp:233
tesseract::countof
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:41
STRING
Definition: strngs.h:45
tesseract::FileReader
bool(*)(const char *filename, GenericVector< char > *data) FileReader
Definition: serialis.h:47
tesseract::SaveDataToFile
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
Definition: genericvector.h:362
tesseract::TessdataManager::Directory
void Directory() const
Definition: tessdatamanager.cpp:202
tesseract::TessdataManager::IsLSTMAvailable
bool IsLSTMAvailable() const
Definition: tessdatamanager.h:183
tesseract::TFile::Open
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:210
tesseract::TessdataManager::CombineDataFiles
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
Definition: tessdatamanager.cpp:244
tesseract::TessdataManager::OverwriteEntry
void OverwriteEntry(TessdataType type, const char *data, int size)
Definition: tessdatamanager.cpp:145
tesseract::TESSDATA_VERSION
Definition: tessdatamanager.h:80
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TFile::DeSerialize
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:117
tesseract::TessdataManager::TessdataManager
TessdataManager()
Definition: tessdatamanager.cpp:42
tesseract::TFile::Serialize
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:161
tesseract::TessdataManager::GetComponent
bool GetComponent(TessdataType type, TFile *fp)
Definition: tessdatamanager.cpp:216
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65
tesseract::TFile
Definition: serialis.h:75
tesseract::TessdataManager::LoadMemBuffer
bool LoadMemBuffer(const char *name, const char *data, int size)
Definition: tessdatamanager.cpp:111
helpers.h
tesseract
Definition: baseapi.h:65
tesseract::TessdataType
TessdataType
Definition: tessdatamanager.h:56
tprintf.h
tesseract::TFile::set_swap
void set_swap(bool value)
Definition: serialis.h:89
GenericVector< char >
tesseract::TESSDATA_NUM_ENTRIES
Definition: tessdatamanager.h:82
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
tesseract::TessdataManager::Clear
void Clear()
Definition: tessdatamanager.cpp:194
tesseract::TessdataManager::LoadFileLater
void LoadFileLater(const char *data_file_name)
Definition: tessdatamanager.cpp:55
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
serialis.h
ReverseN
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:183
tesseract::TessdataManager::Serialize
void Serialize(GenericVector< char > *data) const
Definition: tessdatamanager.cpp:166
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
tessdatamanager.h
tesseract::TessdataManager::IsBaseAvailable
bool IsBaseAvailable() const
Definition: tessdatamanager.h:177
tesseract::TessdataManager::SaveFile
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: tessdatamanager.cpp:153
tesseract::TessdataManager::ExtractToFile
bool ExtractToFile(const char *filename)
Definition: tessdatamanager.cpp:295