21 #pragma warning(disable:4244) // Conversion warnings
38 debug_level_ = debug_level;
39 data_file_name_ = data_file_name;
40 data_file_ = fopen(data_file_name,
"rb");
41 if (data_file_ ==
NULL) {
42 tprintf(
"Error opening data file %s\n", data_file_name);
43 tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set "
44 "to the parent directory of your \"tessdata\" directory.\n");
47 fread(&actual_tessdata_num_entries_,
sizeof(
inT32), 1, data_file_);
48 swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
50 ReverseN(&actual_tessdata_num_entries_,
51 sizeof(actual_tessdata_num_entries_));
57 fread(offset_table_,
sizeof(
inT64),
58 actual_tessdata_num_entries_, data_file_);
60 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61 ReverseN(&offset_table_[i],
sizeof(offset_table_[i]));
65 tprintf(
"TessdataManager loaded %d types of tesseract data files.\n",
66 actual_tessdata_num_entries_);
67 for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68 tprintf(
"Offset for type %d is %lld\n", i, offset_table_[i]);
75 bool newline_end,
inT64 num_bytes_to_copy) {
76 if (num_bytes_to_copy == 0)
return;
77 int buffer_size = 1024;
78 if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79 buffer_size = num_bytes_to_copy;
81 inT64 num_bytes_copied = 0;
82 char *chunk =
new char[buffer_size];
85 while ((bytes_read = fread(chunk,
sizeof(
char),
86 buffer_size, input_file))) {
87 fwrite(chunk,
sizeof(
char), bytes_read, output_file);
88 last_char = chunk[bytes_read-1];
89 if (num_bytes_to_copy > 0) {
90 num_bytes_copied += bytes_read;
91 if (num_bytes_copied == num_bytes_to_copy)
break;
92 if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93 buffer_size = num_bytes_to_copy - num_bytes_copied;
102 const char * language_data_path_prefix,
106 if (fseek(output_file, 0, SEEK_SET) != 0 ||
107 fwrite(&num_entries,
sizeof(
inT32), 1, output_file) != 1 ||
112 tprintf(
"WriteMetadata failed in TessdataManager!\n");
113 }
else if (fclose(output_file)) {
115 tprintf(
"WriteMetadata failed to close file!\n");
117 tprintf(
"TessdataManager combined tesseract data files.\n");
119 tprintf(
"Offset for type %2d (%s%-22s) is %lld\n", i,
120 language_data_path_prefix, kTessdataFileSuffixes[i],
128 const char *language_data_path_prefix,
129 const char *output_filename) {
133 FILE *output_file = fopen(output_filename,
"wb");
134 if (output_file ==
NULL) {
135 tprintf(
"Error opening %s for writing\n", output_filename);
139 if (fseek(output_file,
140 sizeof(
inT32) +
sizeof(
inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
141 tprintf(
"Error seeking %s\n", output_filename);
146 bool text_file =
false;
152 kTessdataFileSuffixes[i], &type, &text_file));
154 filename += kTessdataFileSuffixes[i];
155 file_ptr[i] = fopen(filename.
string(),
"rb");
156 if (file_ptr[i] !=
NULL) {
157 offset_table[type] = ftell(output_file);
158 CopyFile(file_ptr[i], output_file, text_file, -1);
165 tprintf(
"Error opening %sunicharset file\n", language_data_path_prefix);
172 tprintf(
"Error opening %spffmtable and/or %snormproto files"
173 " while %sinttemp file was present\n", language_data_path_prefix,
174 language_data_path_prefix, language_data_path_prefix);
179 return WriteMetadata(offset_table, language_data_path_prefix, output_file);
183 const char *new_traineddata_filename,
184 char **component_filenames,
185 int num_new_components) {
189 bool text_file =
false;
192 offset_table[i] = -1;
195 FILE *output_file = fopen(new_traineddata_filename,
"wb");
196 if (output_file ==
NULL) {
197 tprintf(
"Error opening %s for writing\n", new_traineddata_filename);
202 if (fseek(output_file,
203 sizeof(
inT32) +
sizeof(
inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
205 tprintf(
"Error seeking %s\n", new_traineddata_filename);
210 for (i = 0; i < num_new_components; ++i) {
212 file_ptr[type] = fopen(component_filenames[i],
"rb");
217 if (file_ptr[i] !=
NULL) {
219 offset_table[i] = ftell(output_file);
220 CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
225 offset_table[i] = ftell(output_file);
226 CopyFile(data_file_, output_file, kTessdataFileIsText[i],
228 ftell(data_file_) + 1);
232 const char *language_data_path_prefix = strchr(new_traineddata_filename,
'.');
233 return WriteMetadata(offset_table, language_data_path_prefix, output_file);
237 const char *suffix,
TessdataType *type,
bool *text_file) {
239 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
241 *text_file = kTessdataFileIsText[i];
245 tprintf(
"TessdataManager can't determine which tessdata"
246 " component is represented by %s\n", suffix);
253 const char *suffix = strrchr(filename,
'.');
254 if (suffix ==
NULL || *(++suffix) ==
'\0')
return false;
260 bool text_file =
false;
262 filename, &type, &text_file));
265 FILE *output_file = fopen(filename,
"wb");
266 if (output_file ==
NULL) {
267 tprintf(
"Error opening %s\n", filename);
274 end_offset - begin_offset + 1);
FILE * GetDataFilePtr() const
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
inT64 GetEndOffset(TessdataType tessdata_type) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
void ReverseN(void *ptr, int num_bytes)
bool SeekToStart(TessdataType tessdata_type)
bool Init(const char *data_file_name, int debug_level)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
const char * string() const