20 #include "config_auto.h"
28 #if defined(HAVE_LIBARCHIVE)
30 #include <archive_entry.h>
57 data_file_name_ = data_file_name;
60 #if defined(HAVE_LIBARCHIVE)
// Fragment of TessdataManager::LoadArchiveFile; several of its lines are not
// present in this excerpt. The visible code opens `filename` through
// libarchive, walks the archive headers, maps each entry's path name to a
// component type with TessdataTypeFromFileName, and reads the entry data
// into entries_[type].
61 bool TessdataManager::LoadArchiveFile(
const char *filename) {
63 archive *a = archive_read_new();
// Enable every decompression filter and archive format libarchive offers.
65 archive_read_support_filter_all(a);
66 archive_read_support_format_all(a);
// The third argument (8192) is the block size handed to libarchive.
67 if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
69 while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
70 const char *component = archive_entry_pathname(ae);
71 if (component !=
nullptr) {
// Presumably only entries whose name resolves to a known component type
// reach the read below; the else branch is not visible here.
73 if (TessdataTypeFromFileName(component, &
type)) {
74 int64_t size = archive_entry_size(ae);
// The read is treated as successful only when exactly `size` bytes come back.
// NOTE(review): the sizing of entries_[type] is not visible in this excerpt;
// confirm it is resized to `size` first and that `size` is positive before
// &entries_[type][0] is taken.
77 if (archive_read_data(a, &entries_[
type][0], size) == size) {
// Failure report for the open call above.
// NOTE(review): archive_errno() is fed to strerror(); libarchive also
// provides archive_error_string() -- confirm which message is wanted.
87 tprintf(
"archive_read_open_filename(...,%s,...) failed, %s\n",
88 filename, strerror(archive_errno(a)));
// Fragment (enclosing function header not visible). When no reader_ callback
// is set and HAVE_LIBARCHIVE is defined, LoadArchiveFile is tried first and
// its success is returned immediately.
99 if (reader_ ==
nullptr) {
100 #if defined(HAVE_LIBARCHIVE)
101 if (LoadArchiveFile(data_file_name))
return true;
// The reader_ callback is invoked with the file name and &data; a false
// result makes the enclosing function return false.
105 if (!(*reader_)(data_file_name, &data))
return false;
// Fragment (enclosing function header not visible): records the data name,
// then parses an entry count and an offset table and deserializes each
// present entry into entries_.
115 data_file_name_ = name;
118 uint32_t num_entries;
// A count above kMaxNumTessdataEntries sets swap_; ReverseN is then applied
// to the count (presumably a byte-order reversal -- confirm against ReverseN).
120 swap_ = num_entries > kMaxNumTessdataEntries;
122 if (swap_)
ReverseN(&num_entries,
sizeof(num_entries));
// Still out of range after the optional reversal: reject the input.
123 if (num_entries > kMaxNumTessdataEntries)
return false;
126 if (!fp.
DeSerialize(&offset_table[0], num_entries))
return false;
// Only slots with a non-negative offset are loaded; -1 marks a skipped slot
// in the scan below.
128 if (offset_table[i] >= 0) {
// Default extent: from this offset to `size` (presumably the total data
// size -- its definition is not visible here).
129 int64_t entry_size = size - offset_table[i];
// Advance j past slots marked -1; if another slot is found, its offset
// bounds this entry instead.
131 while (j < num_entries && offset_table[j] == -1) ++j;
132 if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
// NOTE(review): no visible check that entry_size is positive, and the sizing
// of entries_[i] is outside this excerpt -- confirm before &entries_[i][0].
134 if (!fp.
DeSerialize(&entries_[i][0], entry_size))
return false;
// Fragment: copies `size` bytes from `data` into the storage of
// entries_[type].
// NOTE(review): the resize of entries_[type] is not visible in this excerpt;
// confirm it holds at least `size` bytes.
149 memcpy(&entries_[
type][0], data, size);
// Fragment: a null `writer` is handled by a branch whose body is not visible
// here; otherwise the writer callback receives `data` and the file name and
// its result is returned.
159 if (writer ==
nullptr)
162 return (*writer)(data, filename.
c_str());
// Fragment: builds the offset table. The first offset starts after
// sizeof(int32_t) bytes (presumably the entry-count field -- confirm) plus
// the size of the offset table itself.
171 int64_t offset =
sizeof(int32_t) +
sizeof(offset_table);
// Empty entries are recorded as -1; present entries get the running offset,
// which then advances by that entry's size.
173 if (entries_[i].empty()) {
174 offset_table[i] = -1;
176 offset_table[i] = offset;
177 offset += entries_[i].
size();
// Fragment: read-only pass over every slot of entries_; only non-empty slots
// are acted on (the action itself is not visible in this excerpt).
186 for (
const auto& entry : entries_) {
187 if (!entry.empty()) {
// Fragment: second pass over entries_ with mutable access (body not visible).
195 for (
auto& entry : entries_) {
// Fragment: each non-empty entry is printed with its index, suffix name,
// size and running offset; the offset then advances by the entry's size.
206 if (!entries_[i].empty()) {
// NOTE(review): entries_[i].size() is printed with %d; if size() returns a
// size_t (container type not visible here) the format and argument disagree.
207 tprintf(
"%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
208 entries_[i].size(), offset);
209 offset += entries_[i].
size();
// Fragment: if nothing has been loaded yet, Init is called with the stored
// data_file_name_; a failed Init makes the enclosing function return false.
217 if (!is_loaded_ && !
Init(data_file_name_.
c_str()))
return false;
// An empty slot for `type` is reported as failure.
226 if (entries_[
type].empty())
return false;
245 const char *language_data_path_prefix,
246 const char *output_filename) {
// For every known suffix, the candidate input file name is the path prefix
// followed by that suffix; the file is opened for binary reading.
248 for (
auto filesuffix : kTessdataFileSuffixes) {
251 STRING filename = language_data_path_prefix;
252 filename += filesuffix;
253 FILE *fp = fopen(filename.
c_str(),
"rb");
// NOTE(review): these adjacent literals concatenate with no space between
// "file" and "and", so the emitted message reads "...unicharset fileand
// inttemp...". Confirm and add the missing space.
267 "Error: traineddata file must contain at least (a unicharset file"
268 "and inttemp) OR an lstm file.\n");
// The result is written via SaveFile, with nullptr as the second argument.
272 return SaveFile(output_filename,
nullptr);
276 const char *new_traineddata_filename,
277 char **component_filenames,
278 int num_new_components) {
// Each of the num_new_components file names is classified with
// TessdataTypeFromFileName; the handling of a recognized file is only
// partly visible in this excerpt.
281 for (
int i = 0; i < num_new_components; ++i) {
283 if (TessdataTypeFromFileName(component_filenames[i], &
type)) {
// Diagnostic naming the component file that could not be read (the read
// call itself is outside this excerpt).
285 tprintf(
"Failed to read component file:%s\n", component_filenames[i]);
// The result is written via SaveFile, with nullptr as the second argument.
292 return SaveFile(new_traineddata_filename,
nullptr);
298 tesseract::TessdataManager::TessdataTypeFromFileName(filename, &
type));
// Nothing is stored for the resolved type: report failure.
299 if (entries_[
type].empty())
return false;
// Fragment of TessdataManager::TessdataTypeFromFileSuffix; some lines are
// missing from this excerpt. The visible code compares `suffix` against
// kTessdataFileSuffixes[i] with strcmp, stores index i as a TessdataType in
// *type on an exact match, and prints a diagnostic naming the suffix when the
// component cannot be determined.
303 bool TessdataManager::TessdataTypeFromFileSuffix(
const char *suffix,
306 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
307 *
type = static_cast<TessdataType>(i);
312 tprintf(
"TessdataManager can't determine which tessdata"
313 " component is represented by %s\n", suffix);
// Fragment of TessdataManager::TessdataTypeFromFileName; some lines are
// missing from this excerpt. The visible code takes the text after the last
// '.' in `filename` as the suffix and delegates the classification to
// TessdataTypeFromFileSuffix, returning its result.
318 bool TessdataManager::TessdataTypeFromFileName(
const char *filename,
321 const char *suffix = strrchr(filename,
'.');
// Returns false when there is no '.' or nothing follows it. The pre-increment
// also moves `suffix` past the dot for the call below.
322 if (suffix ==
nullptr || *(++suffix) ==
'\0')
return false;
323 return TessdataTypeFromFileSuffix(suffix,
type);