tesseract  4.0.0-1-g2a2b
combine_tessdata.cpp File Reference
#include <cerrno>
#include "commontraining.h"
#include "lstmrecognizer.h"
#include "tessdatamanager.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int  argc,
char **  argv 
)

Definition at line 69 of file combine_tessdata.cpp.

69  {
70  tesseract::CheckSharedLibraryVersion();
71 
72  int i;
74  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
75  printf("%s\n", tesseract::TessBaseAPI::Version());
76  return EXIT_SUCCESS;
77  } else if (argc == 2) {
78  printf("Combining tessdata files\n");
79  STRING lang = argv[1];
80  char* last = &argv[1][strlen(argv[1])-1];
81  if (*last != '.')
82  lang += '.';
83  STRING output_file = lang;
84  output_file += kTrainedDataSuffix;
85  if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
86  printf("Error combining tessdata files into %s\n",
87  output_file.string());
88  } else {
89  printf("Output %s created successfully.\n", output_file.string());
90  }
91  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
92  strcmp(argv[1], "-u") == 0)) {
93  // Initialize TessdataManager with the data in the given traineddata file.
94  if (!tm.Init(argv[2])) {
95  tprintf("Failed to read %s\n", argv[2]);
96  return EXIT_FAILURE;
97  }
98  printf("Extracting tessdata components from %s\n", argv[2]);
99  if (strcmp(argv[1], "-e") == 0) {
100  for (i = 3; i < argc; ++i) {
101  errno = 0;
102  if (tm.ExtractToFile(argv[i])) {
103  printf("Wrote %s\n", argv[i]);
104  } else if (errno == 0) {
105  printf("Not extracting %s, since this component"
106  " is not present\n", argv[i]);
107  return EXIT_FAILURE;
108  } else {
109  printf("Error, could not extract %s: %s\n",
110  argv[i], strerror(errno));
111  return EXIT_FAILURE;
112  }
113  }
114  } else { // extract all the components
115  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
116  STRING filename = argv[3];
117  char* last = &argv[3][strlen(argv[3])-1];
118  if (*last != '.')
119  filename += '.';
120  filename += tesseract::kTessdataFileSuffixes[i];
121  errno = 0;
122  if (tm.ExtractToFile(filename.string())) {
123  printf("Wrote %s\n", filename.string());
124  } else if (errno != 0) {
125  printf("Error, could not extract %s: %s\n",
126  filename.string(), strerror(errno));
127  return EXIT_FAILURE;
128  }
129  }
130  }
131  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
132  // Rename the current traineddata file to a temporary name.
133  const char *new_traineddata_filename = argv[2];
134  STRING traineddata_filename = new_traineddata_filename;
135  traineddata_filename += ".__tmp__";
136  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
137  tprintf("Failed to create a temporary file %s\n",
138  traineddata_filename.string());
139  return EXIT_FAILURE;
140  }
141 
142  // Initialize TessdataManager with the data in the given traineddata file.
143  tm.Init(traineddata_filename.string());
144 
145  // Write the updated traineddata file.
146  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
147  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
148  if (!tm.Init(argv[2])) {
149  tprintf("Failed to read %s\n", argv[2]);
150  return EXIT_FAILURE;
151  }
152  tesseract::TFile fp;
153  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
154  tprintf("No LSTM Component found in %s!\n", argv[2]);
155  return EXIT_FAILURE;
156  }
157  tesseract::LSTMRecognizer recognizer;
158  if (!recognizer.DeSerialize(&tm, &fp)) {
159  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
160  return EXIT_FAILURE;
161  }
162  recognizer.ConvertToInt();
163  GenericVector<char> lstm_data;
164  fp.OpenWrite(&lstm_data);
165  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
166  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
167  lstm_data.size());
168  if (!tm.SaveFile(argv[2], nullptr)) {
169  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
170  return EXIT_FAILURE;
171  }
172  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
173  // Initialize TessdataManager with the data in the given traineddata file.
174  tm.Init(argv[2]);
175  } else {
176  printf("Usage for combining tessdata components:\n"
177  " %s language_data_path_prefix\n"
178  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
179  printf("Usage for extracting tessdata components:\n"
180  " %s -e traineddata_file [output_component_file...]\n"
181  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
182  argv[0], argv[0]);
183  printf("Usage for overwriting tessdata components:\n"
184  " %s -o traineddata_file [input_component_file...]\n"
185  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
186  argv[0], argv[0]);
187  printf("Usage for unpacking all tessdata components:\n"
188  " %s -u traineddata_file output_path_prefix\n"
189  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
190  printf(
191  "Usage for listing directory of components:\n"
192  " %s -d traineddata_file\n",
193  argv[0]);
194  printf(
195  "Usage for compacting LSTM component to int:\n"
196  " %s -c traineddata_file\n",
197  argv[0]);
198  return 1;
199  }
200  tm.Directory();
201  return EXIT_SUCCESS;
202 }
int size() const
Definition: genericvector.h:71
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:295
void OverwriteEntry(TessdataType type, const char *data, int size)
const char * string() const
Definition: strngs.cpp:196
LIST last(LIST var_list)
Definition: oldlist.cpp:242
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
bool Init(const char *data_file_name)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
STRING
Definition: strngs.h:45
static const char * Version()
Definition: baseapi.cpp:223
bool SaveFile(const STRING &filename, FileWriter writer) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
bool ExtractToFile(const char *filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84