tesseract  4.0.0-1-g2a2b
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <cstdio>
27 
28 #include "errcode.h"
29 #include "helpers.h"
30 #include "serialis.h"
31 #include "strngs.h"
32 #include "tprintf.h"
33 #include "params.h"
34 
35 namespace tesseract {
36 
37 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
38  SetVersionString(PACKAGE_VERSION);
39 }
40 
42  : reader_(reader),
43  is_loaded_(false),
44  swap_(false) {
45  SetVersionString(PACKAGE_VERSION);
46 }
47 
48 // Lazily loads from the the given filename. Won't actually read the file
49 // until it needs it.
50 void TessdataManager::LoadFileLater(const char *data_file_name) {
51  Clear();
52  data_file_name_ = data_file_name;
53 }
54 
55 bool TessdataManager::Init(const char *data_file_name) {
57  if (reader_ == nullptr) {
58  if (!LoadDataFromFile(data_file_name, &data)) return false;
59  } else {
60  if (!(*reader_)(data_file_name, &data)) return false;
61  }
62  return LoadMemBuffer(data_file_name, &data[0], data.size());
63 }
64 
65 // Loads from the given memory buffer as if a file.
66 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
67  int size) {
68  Clear();
69  data_file_name_ = name;
70  TFile fp;
71  fp.Open(data, size);
72  uint32_t num_entries;
73  if (!fp.DeSerialize(&num_entries)) return false;
74  swap_ = num_entries > kMaxNumTessdataEntries;
75  fp.set_swap(swap_);
76  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
77  if (num_entries > kMaxNumTessdataEntries) return false;
78  GenericVector<int64_t> offset_table;
79  offset_table.resize_no_init(num_entries);
80  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
81  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
82  if (offset_table[i] >= 0) {
83  int64_t entry_size = size - offset_table[i];
84  int j = i + 1;
85  while (j < num_entries && offset_table[j] == -1) ++j;
86  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
87  entries_[i].resize_no_init(entry_size);
88  if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
89  }
90  }
91  if (entries_[TESSDATA_VERSION].empty()) {
92  SetVersionString("Pre-4.0.0");
93  }
94  is_loaded_ = true;
95  return true;
96 }
97 
98 // Overwrites a single entry of the given type.
99 void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
100  int size) {
101  is_loaded_ = true;
102  entries_[type].resize_no_init(size);
103  memcpy(&entries_[type][0], data, size);
104 }
105 
106 // Saves to the given filename.
107 bool TessdataManager::SaveFile(const STRING &filename,
108  FileWriter writer) const {
109  ASSERT_HOST(is_loaded_);
110  GenericVector<char> data;
111  Serialize(&data);
112  if (writer == nullptr)
113  return SaveDataToFile(data, filename);
114  else
115  return (*writer)(data, filename);
116 }
117 
118 // Serializes to the given vector.
120  ASSERT_HOST(is_loaded_);
121  // Compute the offset_table and total size.
122  int64_t offset_table[TESSDATA_NUM_ENTRIES];
123  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
124  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
125  if (entries_[i].empty()) {
126  offset_table[i] = -1;
127  } else {
128  offset_table[i] = offset;
129  offset += entries_[i].size();
130  }
131  }
132  data->init_to_size(offset, 0);
133  int32_t num_entries = TESSDATA_NUM_ENTRIES;
134  TFile fp;
135  fp.OpenWrite(data);
136  fp.Serialize(&num_entries);
137  fp.Serialize(&offset_table[0], countof(offset_table));
138  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139  if (!entries_[i].empty()) {
140  fp.Serialize(&entries_[i][0], entries_[i].size());
141  }
142  }
143 }
144 
145 // Resets to the initial state, keeping the reader.
147  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
148  entries_[i].clear();
149  }
150  is_loaded_ = false;
151 }
152 
153 // Prints a directory of contents.
// Output (via tprintf): the version string, then one line per non-empty
// component giving its index, file suffix, size in bytes and a running offset.
// NOTE(review): the running offset starts at the size of the offset table
// only, whereas Serialize also writes a 4-byte entry count before the table --
// confirm whether the printed offsets are meant to be absolute file positions.
155  tprintf("Version string:%s\n", VersionString().c_str());
156  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
157  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
158  if (!entries_[i].empty()) {
159  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
160  entries_[i].size(), offset);
161  offset += entries_[i].size();
162  }
163  }
164 }
165 
166 // Opens the given TFile pointer to the given component type.
167 // Returns false in case of failure.
169  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
170  const TessdataManager *const_this = this;
171  return const_this->GetComponent(type, fp);
172 }
173 
174 // As non-const version except it can't load the component if not already
175 // loaded.
177  ASSERT_HOST(is_loaded_);
178  if (entries_[type].empty()) return false;
179  fp->Open(&entries_[type][0], entries_[type].size());
180  fp->set_swap(swap_);
181  return true;
182 }
183 
184 // Returns the current version string.
185 std::string TessdataManager::VersionString() const {
186  return std::string(&entries_[TESSDATA_VERSION][0],
187  entries_[TESSDATA_VERSION].size());
188 }
189 
190 // Sets the version string to the given v_str.
191 void TessdataManager::SetVersionString(const std::string &v_str) {
192  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
193  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
194 }
195 
197  const char *language_data_path_prefix,
198  const char *output_filename) {
199  // Load individual tessdata components from files.
200  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
201  TessdataType type;
202  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
203  STRING filename = language_data_path_prefix;
204  filename += kTessdataFileSuffixes[i];
205  FILE *fp = fopen(filename.string(), "rb");
206  if (fp != nullptr) {
207  fclose(fp);
208  if (!LoadDataFromFile(filename, &entries_[type])) {
209  tprintf("Load of file %s failed!\n", filename.string());
210  return false;
211  }
212  }
213  }
214  is_loaded_ = true;
215 
216  // Make sure that the required components are present.
217  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
218  tprintf(
219  "Error: traineddata file must contain at least (a unicharset file"
220  "and inttemp) OR an lstm file.\n");
221  return false;
222  }
223  // Write updated data to the output traineddata file.
224  return SaveFile(output_filename, nullptr);
225 }
226 
228  const char *new_traineddata_filename,
229  char **component_filenames,
230  int num_new_components) {
231  // Open the files with the new components.
232  for (int i = 0; i < num_new_components; ++i) {
233  TessdataType type;
234  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
235  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
236  tprintf("Failed to read component file:%s\n", component_filenames[i]);
237  return false;
238  }
239  }
240  }
241 
242  // Write updated data to the output traineddata file.
243  return SaveFile(new_traineddata_filename, nullptr);
244 }
245 
246 bool TessdataManager::ExtractToFile(const char *filename) {
248  ASSERT_HOST(
250  if (entries_[type].empty()) return false;
251  return SaveDataToFile(entries_[type], filename);
252 }
253 
255  TessdataType *type) {
256  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
257  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
258  *type = static_cast<TessdataType>(i);
259  return true;
260  }
261  }
262  tprintf("TessdataManager can't determine which tessdata"
263  " component is represented by %s\n", suffix);
264  return false;
265 }
266 
268  TessdataType *type) {
269  // Get the file suffix (extension)
270  const char *suffix = strrchr(filename, '.');
271  if (suffix == nullptr || *(++suffix) == '\0') return false;
272  return TessdataTypeFromFileSuffix(suffix, type);
273 }
274 
275 } // namespace tesseract
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
void SetVersionString(const std::string &v_str)
void resize_no_init(int size)
Definition: genericvector.h:65
int size() const
Definition: genericvector.h:71
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:295
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
void OverwriteEntry(TessdataType type, const char *data, int size)
const char * string() const
Definition: strngs.cpp:196
bool LoadMemBuffer(const char *name, const char *data, int size)
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:103
void LoadFileLater(const char *data_file_name)
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:43
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:178
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void init_to_size(int size, const T &t)
bool Init(const char *data_file_name)
void set_swap(bool value)
Definition: serialis.h:91
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:196
Definition: strngs.h:45
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
void Serialize(GenericVector< char > *data) const
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
std::string VersionString() const
bool ExtractToFile(const char *filename)
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84