tesseract  5.0.0-alpha-619-ge9db
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()=default
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Detailed Description

Definition at line 126 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 42 of file tessdatamanager.cpp.

42  : reader_(nullptr), is_loaded_(false), swap_(false) {
43  SetVersionString(PACKAGE_VERSION);
44 }

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 46 of file tessdatamanager.cpp.

47  : reader_(reader),
48  is_loaded_(false),
49  swap_(false) {
50  SetVersionString(PACKAGE_VERSION);
51 }

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
default

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 194 of file tessdatamanager.cpp.

194  {
195  for (auto& entry : entries_) {
196  entry.clear();
197  }
198  is_loaded_ = false;
199 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 244 of file tessdatamanager.cpp.

246  {
247  // Load individual tessdata components from files.
248  for (auto filesuffix : kTessdataFileSuffixes) {
250  ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
251  STRING filename = language_data_path_prefix;
252  filename += filesuffix;
253  FILE *fp = fopen(filename.c_str(), "rb");
254  if (fp != nullptr) {
255  fclose(fp);
256  if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
257  tprintf("Load of file %s failed!\n", filename.c_str());
258  return false;
259  }
260  }
261  }
262  is_loaded_ = true;
263 
264  // Make sure that the required components are present.
265  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
266  tprintf(
267  "Error: traineddata file must contain at least (a unicharset file"
268  "and inttemp) OR an lstm file.\n");
269  return false;
270  }
271  // Write updated data to the output traineddata file.
272  return SaveFile(output_filename, nullptr);
273 }

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 202 of file tessdatamanager.cpp.

202  {
203  tprintf("Version string:%s\n", VersionString().c_str());
204  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
205  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206  if (!entries_[i].empty()) {
207  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
208  entries_[i].size(), offset);
209  offset += entries_[i].size();
210  }
211  }
212 }

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 295 of file tessdatamanager.cpp.

295  {
297  ASSERT_HOST(
298  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
299  if (entries_[type].empty()) return false;
300  return SaveDataToFile(entries_[type], filename);
301 }

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 216 of file tessdatamanager.cpp.

216  {
217  if (!is_loaded_ && !Init(data_file_name_.c_str())) return false;
218  const TessdataManager *const_this = this;
219  return const_this->GetComponent(type, fp);
220 }

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 224 of file tessdatamanager.cpp.

224  {
225  ASSERT_HOST(is_loaded_);
226  if (entries_[type].empty()) return false;
227  fp->Open(&entries_[type][0], entries_[type].size());
228  fp->set_swap(swap_);
229  return true;
230 }

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 186 of file tessdatamanager.h.

186 { return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 97 of file tessdatamanager.cpp.

97  {
99  if (reader_ == nullptr) {
100 #if defined(HAVE_LIBARCHIVE)
101  if (LoadArchiveFile(data_file_name)) return true;
102 #endif
103  if (!LoadDataFromFile(data_file_name, &data)) return false;
104  } else {
105  if (!(*reader_)(data_file_name, &data)) return false;
106  }
107  return LoadMemBuffer(data_file_name, &data[0], data.size());
108 }

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134 { return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 177 of file tessdatamanager.h.

177  {
178  return !entries_[TESSDATA_UNICHARSET].empty() &&
179  !entries_[TESSDATA_INTTEMP].empty();
180  }

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 161 of file tessdatamanager.h.

161  {
162  return !entries_[type].empty();
163  }

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 183 of file tessdatamanager.h.

183 { return !entries_[TESSDATA_LSTM].empty(); }

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 55 of file tessdatamanager.cpp.

55  {
56  Clear();
57  data_file_name_ = data_file_name;
58 }

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 111 of file tessdatamanager.cpp.

112  {
113  // TODO: This method supports only the proprietary file format.
114  Clear();
115  data_file_name_ = name;
116  TFile fp;
117  fp.Open(data, size);
118  uint32_t num_entries;
119  if (!fp.DeSerialize(&num_entries)) return false;
120  swap_ = num_entries > kMaxNumTessdataEntries;
121  fp.set_swap(swap_);
122  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
123  if (num_entries > kMaxNumTessdataEntries) return false;
124  GenericVector<int64_t> offset_table;
125  offset_table.resize_no_init(num_entries);
126  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
127  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
128  if (offset_table[i] >= 0) {
129  int64_t entry_size = size - offset_table[i];
130  unsigned j = i + 1;
131  while (j < num_entries && offset_table[j] == -1) ++j;
132  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
133  entries_[i].resize_no_init(entry_size);
134  if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
135  }
136  }
137  if (entries_[TESSDATA_VERSION].empty()) {
138  SetVersionString("Pre-4.0.0");
139  }
140  is_loaded_ = true;
141  return true;
142 }

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 275 of file tessdatamanager.cpp.

278  {
279  // Open the files with the new components.
280  // TODO: This method supports only the proprietary file format.
281  for (int i = 0; i < num_new_components; ++i) {
283  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
284  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
285  tprintf("Failed to read component file:%s\n", component_filenames[i]);
286  return false;
287  }
288  }
289  }
290 
291  // Write updated data to the output traineddata file.
292  return SaveFile(new_traineddata_filename, nullptr);
293 }

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 145 of file tessdatamanager.cpp.

146  {
147  is_loaded_ = true;
148  entries_[type].resize_no_init(size);
149  memcpy(&entries_[type][0], data, size);
150 }

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING &  filename,
FileWriter  writer 
) const

Definition at line 153 of file tessdatamanager.cpp.

154  {
155  // TODO: This method supports only the proprietary file format.
156  ASSERT_HOST(is_loaded_);
157  GenericVector<char> data;
158  Serialize(&data);
159  if (writer == nullptr)
160  return SaveDataToFile(data, filename.c_str());
161  else
162  return (*writer)(data, filename.c_str());
163 }

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 166 of file tessdatamanager.cpp.

166  {
167  // TODO: This method supports only the proprietary file format.
168  ASSERT_HOST(is_loaded_);
169  // Compute the offset_table and total size.
170  int64_t offset_table[TESSDATA_NUM_ENTRIES];
171  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
172  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
173  if (entries_[i].empty()) {
174  offset_table[i] = -1;
175  } else {
176  offset_table[i] = offset;
177  offset += entries_[i].size();
178  }
179  }
180  data->init_to_size(offset, 0);
181  int32_t num_entries = TESSDATA_NUM_ENTRIES;
182  TFile fp;
183  fp.OpenWrite(data);
184  fp.Serialize(&num_entries);
185  fp.Serialize(&offset_table[0], countof(offset_table));
186  for (const auto& entry : entries_) {
187  if (!entry.empty()) {
188  fp.Serialize(&entry[0], entry.size());
189  }
190  }
191 }

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 239 of file tessdatamanager.cpp.

239  {
240  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
241  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
242 }

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 133 of file tessdatamanager.h.

133 { return swap_; }

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 233 of file tessdatamanager.cpp.

233  {
234  return std::string(&entries_[TESSDATA_VERSION][0],
235  entries_[TESSDATA_VERSION].size());
236 }

The documentation for this class was generated from the following files:
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::TessdataManager::SetVersionString
void SetVersionString(const std::string &v_str)
Definition: tessdatamanager.cpp:239
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::LoadDataFromFile
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
Definition: genericvector.h:341
tesseract::TessdataManager::VersionString
std::string VersionString() const
Definition: tessdatamanager.cpp:233
tesseract::countof
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:41
STRING
Definition: strngs.h:45
tesseract::SaveDataToFile
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
Definition: genericvector.h:362
tesseract::TessdataManager::IsLSTMAvailable
bool IsLSTMAvailable() const
Definition: tessdatamanager.h:183
tesseract::TESSDATA_UNICHARSET
Definition: tessdatamanager.h:58
tesseract::TESSDATA_INTTEMP
Definition: tessdatamanager.h:60
tesseract::TESSDATA_VERSION
Definition: tessdatamanager.h:80
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TessdataManager::TessdataManager
TessdataManager()
Definition: tessdatamanager.cpp:42
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:65
tesseract::TessdataManager::LoadMemBuffer
bool LoadMemBuffer(const char *name, const char *data, int size)
Definition: tessdatamanager.cpp:111
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::TessdataType
TessdataType
Definition: tessdatamanager.h:56
GenericVector< char >
tesseract::TESSDATA_NUM_ENTRIES
Definition: tessdatamanager.h:82
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
tesseract::TessdataManager::Clear
void Clear()
Definition: tessdatamanager.cpp:194
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
ReverseN
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:183
tesseract::TessdataManager::Serialize
void Serialize(GenericVector< char > *data) const
Definition: tessdatamanager.cpp:166
tesseract::TESSDATA_LSTM
Definition: tessdatamanager.h:74
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TessdataManager::IsBaseAvailable
bool IsBaseAvailable() const
Definition: tessdatamanager.h:177
tesseract::TessdataManager::SaveFile
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: tessdatamanager.cpp:153