tesseract  4.0.0-1-g2a2b
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()=default
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type)
 

Detailed Description

Definition at line 126 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 37 of file tessdatamanager.cpp.

37  : reader_(nullptr), is_loaded_(false), swap_(false) {
38  SetVersionString(PACKAGE_VERSION);
39 }
void SetVersionString(const std::string &v_str)

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 41 of file tessdatamanager.cpp.

42  : reader_(reader),
43  is_loaded_(false),
44  swap_(false) {
45  SetVersionString(PACKAGE_VERSION);
46 }
void SetVersionString(const std::string &v_str)

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
default

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 146 of file tessdatamanager.cpp.

146  {
147  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
148  entries_[i].clear();
149  }
150  is_loaded_ = false;
151 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 196 of file tessdatamanager.cpp.

198  {
199  // Load individual tessdata components from files.
200  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
201  TessdataType type;
202  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
203  STRING filename = language_data_path_prefix;
204  filename += kTessdataFileSuffixes[i];
205  FILE *fp = fopen(filename.string(), "rb");
206  if (fp != nullptr) {
207  fclose(fp);
208  if (!LoadDataFromFile(filename, &entries_[type])) {
209  tprintf("Load of file %s failed!\n", filename.string());
210  return false;
211  }
212  }
213  }
214  is_loaded_ = true;
215 
216  // Make sure that the required components are present.
217  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
218  tprintf(
219  "Error: traineddata file must contain at least (a unicharset file"
220  "and inttemp) OR an lstm file.\n");
221  return false;
222  }
223  // Write updated data to the output traineddata file.
224  return SaveFile(output_filename, nullptr);
225 }
const char * string() const
Definition: strngs.cpp:196
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 154 of file tessdatamanager.cpp.

154  {
155  tprintf("Version string:%s\n", VersionString().c_str());
156  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
157  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
158  if (!entries_[i].empty()) {
159  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
160  entries_[i].size(), offset);
161  offset += entries_[i].size();
162  }
163  }
164 }
int size() const
Definition: genericvector.h:71
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
std::string VersionString() const

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 246 of file tessdatamanager.cpp.

246  {
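247  TessdataType type = TESSDATA_NUM_ENTRIES;
248  ASSERT_HOST(
249  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));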
250  if (entries_[type].empty()) return false;
251  return SaveDataToFile(entries_[type], filename);
252 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 168 of file tessdatamanager.cpp.

168  {
169  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
170  const TessdataManager *const_this = this;
171  return const_this->GetComponent(type, fp);
172 }
const char * string() const
Definition: strngs.cpp:196
bool Init(const char *data_file_name)

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 176 of file tessdatamanager.cpp.

176  {
177  ASSERT_HOST(is_loaded_);
178  if (entries_[type].empty()) return false;
179  fp->Open(&entries_[type][0], entries_[type].size());
180  fp->set_swap(swap_);
181  return true;
182 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 186 of file tessdatamanager.h.

186 { return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 55 of file tessdatamanager.cpp.

55  {
56  GenericVector<char> data;
57  if (reader_ == nullptr) {
58  if (!LoadDataFromFile(data_file_name, &data)) return false;
59  } else {
60  if (!(*reader_)(data_file_name, &data)) return false;
61  }
62  return LoadMemBuffer(data_file_name, &data[0], data.size());
63 }
int size() const
Definition: genericvector.h:71
bool LoadMemBuffer(const char *name, const char *data, int size)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134 { return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 177 of file tessdatamanager.h.

177  {
178  return !entries_[TESSDATA_UNICHARSET].empty() &&
179  !entries_[TESSDATA_INTTEMP].empty();
180  }
bool empty() const
Definition: genericvector.h:90

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 161 of file tessdatamanager.h.

161  {
162  return !entries_[type].empty();
163  }
bool empty() const
Definition: genericvector.h:90

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 183 of file tessdatamanager.h.

183 { return !entries_[TESSDATA_LSTM].empty(); }
bool empty() const
Definition: genericvector.h:90

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 50 of file tessdatamanager.cpp.

50  {
51  Clear();
52  data_file_name_ = data_file_name;
53 }

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 66 of file tessdatamanager.cpp.

67  {
68  Clear();
69  data_file_name_ = name;
70  TFile fp;
71  fp.Open(data, size);
72  uint32_t num_entries;
73  if (!fp.DeSerialize(&num_entries)) return false;
74  swap_ = num_entries > kMaxNumTessdataEntries;
75  fp.set_swap(swap_);
76  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
77  if (num_entries > kMaxNumTessdataEntries) return false;
78  GenericVector<int64_t> offset_table;
79  offset_table.resize_no_init(num_entries);
80  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
81  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
82  if (offset_table[i] >= 0) {
83  int64_t entry_size = size - offset_table[i];
84  int j = i + 1;
85  while (j < num_entries && offset_table[j] == -1) ++j;
86  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
87  entries_[i].resize_no_init(entry_size);
88  if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
89  }
90  }
91  if (entries_[TESSDATA_VERSION].empty()) {
92  SetVersionString("Pre-4.0.0");
93  }
94  is_loaded_ = true;
95  return true;
96 }
void SetVersionString(const std::string &v_str)
void resize_no_init(int size)
Definition: genericvector.h:65
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:178

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

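Starts from the traineddata components already loaded into this TessdataManager, overwrites the components specified by component_filenames (the component type is determined from each file's suffix; files whose type cannot be determined are skipped), and writes the updated traineddata file to new_traineddata_filename. Returns false if a component file cannot be read.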

Definition at line 227 of file tessdatamanager.cpp.

230  {
231  // Open the files with the new components.
232  for (int i = 0; i < num_new_components; ++i) {
233  TessdataType type;
234  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
235  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
236  tprintf("Failed to read component file:%s\n", component_filenames[i]);
237  return false;
238  }
239  }
240  }
241 
242  // Write updated data to the output traineddata file.
243  return SaveFile(new_traineddata_filename, nullptr);
244 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 99 of file tessdatamanager.cpp.

100  {
101  is_loaded_ = true;
102  entries_[type].resize_no_init(size);
103  memcpy(&entries_[type][0], data, size);
104 }
void resize_no_init(int size)
Definition: genericvector.h:65

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING &  filename,
FileWriter  writer 
) const

Definition at line 107 of file tessdatamanager.cpp.

108  {
109  ASSERT_HOST(is_loaded_);
110  GenericVector<char> data;
111  Serialize(&data);
112  if (writer == nullptr)
113  return SaveDataToFile(data, filename);
114  else
115  return (*writer)(data, filename);
116 }
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
void Serialize(GenericVector< char > *data) const
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 119 of file tessdatamanager.cpp.

119  {
120  ASSERT_HOST(is_loaded_);
121  // Compute the offset_table and total size.
122  int64_t offset_table[TESSDATA_NUM_ENTRIES];
123  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
124  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
125  if (entries_[i].empty()) {
126  offset_table[i] = -1;
127  } else {
128  offset_table[i] = offset;
129  offset += entries_[i].size();
130  }
131  }
132  data->init_to_size(offset, 0);
133  int32_t num_entries = TESSDATA_NUM_ENTRIES;
134  TFile fp;
135  fp.OpenWrite(data);
136  fp.Serialize(&num_entries);
137  fp.Serialize(&offset_table[0], countof(offset_table));
138  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139  if (!entries_[i].empty()) {
140  fp.Serialize(&entries_[i][0], entries_[i].size());
141  }
142  }
143 }
int size() const
Definition: genericvector.h:71
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:43
void init_to_size(int size, const T &t)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 191 of file tessdatamanager.cpp.

191  {
192  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
193  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
194 }
void resize_no_init(int size)
Definition: genericvector.h:65

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 133 of file tessdatamanager.h.

133 { return swap_; }

◆ TessdataTypeFromFileName()

bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type 
)
static

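Extracts the file suffix (the extension after the last '.') from filename and tries to determine the corresponding tessdata component type from it. Returns true on success; returns false if filename has no extension or the extension is empty.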

Definition at line 267 of file tessdatamanager.cpp.

268  {
269  // Get the file suffix (extension)
270  const char *suffix = strrchr(filename, '.');
271  if (suffix == nullptr || *(++suffix) == '\0') return false;
272  return TessdataTypeFromFileSuffix(suffix, type);
273 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)

◆ TessdataTypeFromFileSuffix()

bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type 
)
static

Fills type with the TessdataType of the tessdata component represented by the given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET.

Returns
true if the tessdata component type could be determined from the given suffix; otherwise prints a diagnostic and returns false.

Definition at line 254 of file tessdatamanager.cpp.

255  {
256  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
257  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
258  *type = static_cast<TessdataType>(i);
259  return true;
260  }
261  }
262  tprintf("TessdataManager can't determine which tessdata"
263  " component is represented by %s\n", suffix);
264  return false;
265 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 185 of file tessdatamanager.cpp.

185  {
186  return std::string(&entries_[TESSDATA_VERSION][0],
187  entries_[TESSDATA_VERSION].size());
188 }

The documentation for this class was generated from the following files: