tesseract  5.0.0-alpha-619-ge9db
combine_tessdata.cpp File Reference
#include <cerrno>
#include "commontraining.h"
#include "lstmrecognizer.h"
#include "tessdatamanager.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int  argc,
char **  argv 
)

Definition at line 68 of file combine_tessdata.cpp.

68  {
69  tesseract::CheckSharedLibraryVersion();
70 
71  int i;
73  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
74  printf("%s\n", tesseract::TessBaseAPI::Version());
75  return EXIT_SUCCESS;
76  } else if (argc == 2) {
77  printf("Combining tessdata files\n");
78  STRING lang = argv[1];
79  char* last = &argv[1][strlen(argv[1])-1];
80  if (*last != '.')
81  lang += '.';
82  STRING output_file = lang;
83  output_file += kTrainedDataSuffix;
84  if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
85  printf("Error combining tessdata files into %s\n",
86  output_file.c_str());
87  } else {
88  printf("Output %s created successfully.\n", output_file.c_str());
89  }
90  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
91  strcmp(argv[1], "-u") == 0)) {
92  // Initialize TessdataManager with the data in the given traineddata file.
93  if (!tm.Init(argv[2])) {
94  tprintf("Failed to read %s\n", argv[2]);
95  return EXIT_FAILURE;
96  }
97  printf("Extracting tessdata components from %s\n", argv[2]);
98  if (strcmp(argv[1], "-e") == 0) {
99  for (i = 3; i < argc; ++i) {
100  errno = 0;
101  if (tm.ExtractToFile(argv[i])) {
102  printf("Wrote %s\n", argv[i]);
103  } else if (errno == 0) {
104  printf("Not extracting %s, since this component"
105  " is not present\n", argv[i]);
106  return EXIT_FAILURE;
107  } else {
108  printf("Error, could not extract %s: %s\n",
109  argv[i], strerror(errno));
110  return EXIT_FAILURE;
111  }
112  }
113  } else { // extract all the components
114  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
115  STRING filename = argv[3];
116  char* last = &argv[3][strlen(argv[3])-1];
117  if (*last != '.')
118  filename += '.';
119  filename += tesseract::kTessdataFileSuffixes[i];
120  errno = 0;
121  if (tm.ExtractToFile(filename.c_str())) {
122  printf("Wrote %s\n", filename.c_str());
123  } else if (errno != 0) {
124  printf("Error, could not extract %s: %s\n",
125  filename.c_str(), strerror(errno));
126  return EXIT_FAILURE;
127  }
128  }
129  }
130  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
131  // Rename the current traineddata file to a temporary name.
132  const char *new_traineddata_filename = argv[2];
133  STRING traineddata_filename = new_traineddata_filename;
134  traineddata_filename += ".__tmp__";
135  if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
136  tprintf("Failed to create a temporary file %s\n",
137  traineddata_filename.c_str());
138  return EXIT_FAILURE;
139  }
140 
141  // Initialize TessdataManager with the data in the given traineddata file.
142  tm.Init(traineddata_filename.c_str());
143 
144  // Write the updated traineddata file.
145  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
146  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
147  if (!tm.Init(argv[2])) {
148  tprintf("Failed to read %s\n", argv[2]);
149  return EXIT_FAILURE;
150  }
151  tesseract::TFile fp;
152  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
153  tprintf("No LSTM Component found in %s!\n", argv[2]);
154  return EXIT_FAILURE;
155  }
156  tesseract::LSTMRecognizer recognizer;
157  if (!recognizer.DeSerialize(&tm, &fp)) {
158  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
159  return EXIT_FAILURE;
160  }
161  recognizer.ConvertToInt();
162  GenericVector<char> lstm_data;
163  fp.OpenWrite(&lstm_data);
164  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
165  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
166  lstm_data.size());
167  if (!tm.SaveFile(argv[2], nullptr)) {
168  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
169  return EXIT_FAILURE;
170  }
171  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
172  // Initialize TessdataManager with the data in the given traineddata file.
173  tm.Init(argv[2]);
174  } else {
175  printf("Usage for combining tessdata components:\n"
176  " %s language_data_path_prefix\n"
177  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
178  printf("Usage for extracting tessdata components:\n"
179  " %s -e traineddata_file [output_component_file...]\n"
180  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
181  argv[0], argv[0]);
182  printf("Usage for overwriting tessdata components:\n"
183  " %s -o traineddata_file [input_component_file...]\n"
184  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
185  argv[0], argv[0]);
186  printf("Usage for unpacking all tessdata components:\n"
187  " %s -u traineddata_file output_path_prefix\n"
188  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
189  printf(
190  "Usage for listing directory of components:\n"
191  " %s -d traineddata_file\n",
192  argv[0]);
193  printf(
194  "Usage for compacting LSTM component to int:\n"
195  " %s -c traineddata_file\n",
196  argv[0]);
197  return 1;
198  }
199  tm.Directory();
200  return EXIT_SUCCESS;
201 }
tesseract::TessdataManager::OverwriteComponents
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
Definition: tessdatamanager.cpp:275
tesseract::TessdataManager
Definition: tessdatamanager.h:126
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
STRING
Definition: strngs.h:45
tesseract::TessdataManager::Directory
void Directory() const
Definition: tessdatamanager.cpp:202
tesseract::TessdataManager::CombineDataFiles
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
Definition: tessdatamanager.cpp:244
tesseract::TessdataManager::OverwriteEntry
void OverwriteEntry(TessdataType type, const char *data, int size)
Definition: tessdatamanager.cpp:145
tesseract::LSTMRecognizer::DeSerialize
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
Definition: lstmrecognizer.cpp:108
last
LIST last(LIST var_list)
Definition: oldlist.cpp:151
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::LSTMRecognizer::ConvertToInt
void ConvertToInt()
Definition: lstmrecognizer.h:124
tesseract::TessdataManager::GetComponent
bool GetComponent(TessdataType type, TFile *fp)
Definition: tessdatamanager.cpp:216
tesseract::TFile
Definition: serialis.h:75
tesseract::LSTMRecognizer::Serialize
bool Serialize(const TessdataManager *mgr, TFile *fp) const
Definition: lstmrecognizer.cpp:89
tesseract::TessBaseAPI::Version
static const char * Version()
Definition: baseapi.cpp:233
GenericVector< char >
tesseract::TESSDATA_NUM_ENTRIES
Definition: tessdatamanager.h:82
tesseract::LSTMRecognizer
Definition: lstmrecognizer.h:53
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::TESSDATA_LSTM
Definition: tessdatamanager.h:74
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
tesseract::TessdataManager::SaveFile
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: tessdatamanager.cpp:153
tesseract::TessdataManager::ExtractToFile
bool ExtractToFile(const char *filename)
Definition: tessdatamanager.cpp:295