tesseract  4.0.0-1-g2a2b
combine_tessdata.cpp
Go to the documentation of this file.
1 // File: combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 // Created: Wed Jun 03 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include <cerrno>
22 #include "commontraining.h" // CheckSharedLibraryVersion
23 #include "lstmrecognizer.h"
24 #include "tessdatamanager.h"
25 
26 // Main program to combine/extract/overwrite tessdata components
27 // in [lang].traineddata files.
28 //
29 // To combine all the individual tessdata components (unicharset, DAWGs,
30 // classifier templates, ambiguities, language configs) located at, say,
31 // /home/$USER/temp/eng.* run:
32 //
33 // combine_tessdata /home/$USER/temp/eng.
34 //
35 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
36 //
37 // Specify option -e if you would like to extract individual components
38 // from a combined traineddata file. For example, to extract language config
39 // file and the unicharset from tessdata/eng.traineddata run:
40 //
41 // combine_tessdata -e tessdata/eng.traineddata
42 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
43 //
44 // The desired config file and unicharset will be written to
45 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
46 //
47 // Specify option -o to overwrite individual components of the given
48 // [lang].traineddata file. For example, to overwrite language config
49 // and unichar ambiguities files in tessdata/eng.traineddata use:
50 //
51 // combine_tessdata -o tessdata/eng.traineddata
52 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
53 //
54 // As a result, tessdata/eng.traineddata will contain the new language config
55 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
56 //
57 // Note: the file names of the files to extract to and to overwrite from should
58 // have the appropriate file suffixes (extensions) indicating their tessdata
59 // component type (.unicharset for the unicharset, .unicharambigs for unichar
60 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
61 //
62 // Specify option -u to unpack all the components to the specified path:
63 //
64 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
65 //
66 // This will create /home/$USER/temp/eng.* files with individual tessdata
67 // components from tessdata/eng.traineddata.
68 //
69 int main(int argc, char **argv) {
70  tesseract::CheckSharedLibraryVersion();
71 
72  int i;
74  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
75  printf("%s\n", tesseract::TessBaseAPI::Version());
76  return EXIT_SUCCESS;
77  } else if (argc == 2) {
78  printf("Combining tessdata files\n");
79  STRING lang = argv[1];
80  char* last = &argv[1][strlen(argv[1])-1];
81  if (*last != '.')
82  lang += '.';
83  STRING output_file = lang;
84  output_file += kTrainedDataSuffix;
85  if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
86  printf("Error combining tessdata files into %s\n",
87  output_file.string());
88  } else {
89  printf("Output %s created successfully.\n", output_file.string());
90  }
91  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
92  strcmp(argv[1], "-u") == 0)) {
93  // Initialize TessdataManager with the data in the given traineddata file.
94  if (!tm.Init(argv[2])) {
95  tprintf("Failed to read %s\n", argv[2]);
96  return EXIT_FAILURE;
97  }
98  printf("Extracting tessdata components from %s\n", argv[2]);
99  if (strcmp(argv[1], "-e") == 0) {
100  for (i = 3; i < argc; ++i) {
101  errno = 0;
102  if (tm.ExtractToFile(argv[i])) {
103  printf("Wrote %s\n", argv[i]);
104  } else if (errno == 0) {
105  printf("Not extracting %s, since this component"
106  " is not present\n", argv[i]);
107  return EXIT_FAILURE;
108  } else {
109  printf("Error, could not extract %s: %s\n",
110  argv[i], strerror(errno));
111  return EXIT_FAILURE;
112  }
113  }
114  } else { // extract all the components
115  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
116  STRING filename = argv[3];
117  char* last = &argv[3][strlen(argv[3])-1];
118  if (*last != '.')
119  filename += '.';
120  filename += tesseract::kTessdataFileSuffixes[i];
121  errno = 0;
122  if (tm.ExtractToFile(filename.string())) {
123  printf("Wrote %s\n", filename.string());
124  } else if (errno != 0) {
125  printf("Error, could not extract %s: %s\n",
126  filename.string(), strerror(errno));
127  return EXIT_FAILURE;
128  }
129  }
130  }
131  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
132  // Rename the current traineddata file to a temporary name.
133  const char *new_traineddata_filename = argv[2];
134  STRING traineddata_filename = new_traineddata_filename;
135  traineddata_filename += ".__tmp__";
136  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
137  tprintf("Failed to create a temporary file %s\n",
138  traineddata_filename.string());
139  return EXIT_FAILURE;
140  }
141 
142  // Initialize TessdataManager with the data in the given traineddata file.
143  tm.Init(traineddata_filename.string());
144 
145  // Write the updated traineddata file.
146  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
147  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
148  if (!tm.Init(argv[2])) {
149  tprintf("Failed to read %s\n", argv[2]);
150  return EXIT_FAILURE;
151  }
152  tesseract::TFile fp;
153  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
154  tprintf("No LSTM Component found in %s!\n", argv[2]);
155  return EXIT_FAILURE;
156  }
157  tesseract::LSTMRecognizer recognizer;
158  if (!recognizer.DeSerialize(&tm, &fp)) {
159  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
160  return EXIT_FAILURE;
161  }
162  recognizer.ConvertToInt();
163  GenericVector<char> lstm_data;
164  fp.OpenWrite(&lstm_data);
165  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
166  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
167  lstm_data.size());
168  if (!tm.SaveFile(argv[2], nullptr)) {
169  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
170  return EXIT_FAILURE;
171  }
172  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
173  // Initialize TessdataManager with the data in the given traineddata file.
174  tm.Init(argv[2]);
175  } else {
176  printf("Usage for combining tessdata components:\n"
177  " %s language_data_path_prefix\n"
178  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
179  printf("Usage for extracting tessdata components:\n"
180  " %s -e traineddata_file [output_component_file...]\n"
181  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
182  argv[0], argv[0]);
183  printf("Usage for overwriting tessdata components:\n"
184  " %s -o traineddata_file [input_component_file...]\n"
185  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
186  argv[0], argv[0]);
187  printf("Usage for unpacking all tessdata components:\n"
188  " %s -u traineddata_file output_path_prefix\n"
189  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
190  printf(
191  "Usage for listing directory of components:\n"
192  " %s -d traineddata_file\n",
193  argv[0]);
194  printf(
195  "Usage for compacting LSTM component to int:\n"
196  " %s -c traineddata_file\n",
197  argv[0]);
198  return 1;
199  }
200  tm.Directory();
201  return EXIT_SUCCESS;
202 }
int size() const
Definition: genericvector.h:71
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:295
void OverwriteEntry(TessdataType type, const char *data, int size)
const char * string() const
Definition: strngs.cpp:196
LIST last(LIST var_list)
Definition: oldlist.cpp:242
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
bool Init(const char *data_file_name)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
int main(int argc, char **argv)
static const char * Version()
Definition: baseapi.cpp:223
bool SaveFile(const STRING &filename, FileWriter writer) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
bool ExtractToFile(const char *filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84