tesseract  5.0.0-alpha-619-ge9db
combine_tessdata.cpp
Go to the documentation of this file.
1 // File: combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include <cerrno>
21 #include "commontraining.h" // CheckSharedLibraryVersion
22 #include "lstmrecognizer.h"
23 #include "tessdatamanager.h"
24 
25 // Main program to combine/extract/overwrite tessdata components
26 // in [lang].traineddata files.
27 //
28 // To combine all the individual tessdata components (unicharset, DAWGs,
29 // classifier templates, ambiguities, language configs) located at, say,
30 // /home/$USER/temp/eng.* run:
31 //
32 // combine_tessdata /home/$USER/temp/eng.
33 //
34 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
35 //
36 // Specify option -e if you would like to extract individual components
37 // from a combined traineddata file. For example, to extract language config
38 // file and the unicharset from tessdata/eng.traineddata run:
39 //
40 // combine_tessdata -e tessdata/eng.traineddata
41 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
42 //
43 // The desired config file and unicharset will be written to
44 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
45 //
46 // Specify option -o to overwrite individual components of the given
47 // [lang].traineddata file. For example, to overwrite language config
48 // and unichar ambiguities files in tessdata/eng.traineddata use:
49 //
50 // combine_tessdata -o tessdata/eng.traineddata
51 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
52 //
53 // As a result, tessdata/eng.traineddata will contain the new language config
54 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
55 //
56 // Note: the file names of the files to extract to and to overwrite from should
57 // have the appropriate file suffixes (extensions) indicating their tessdata
58 // component type (.unicharset for the unicharset, .unicharambigs for unichar
59 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
60 //
61 // Specify option -u to unpack all the components to the specified path:
62 //
63 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
64 //
65 // This will create /home/$USER/temp/eng.* files with individual tessdata
66 // components from tessdata/eng.traineddata.
67 //
68 int main(int argc, char **argv) {
69  tesseract::CheckSharedLibraryVersion();
70 
71  int i;
73  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
74  printf("%s\n", tesseract::TessBaseAPI::Version());
75  return EXIT_SUCCESS;
76  } else if (argc == 2) {
77  printf("Combining tessdata files\n");
78  STRING lang = argv[1];
79  char* last = &argv[1][strlen(argv[1])-1];
80  if (*last != '.')
81  lang += '.';
82  STRING output_file = lang;
83  output_file += kTrainedDataSuffix;
84  if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
85  printf("Error combining tessdata files into %s\n",
86  output_file.c_str());
87  } else {
88  printf("Output %s created successfully.\n", output_file.c_str());
89  }
90  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
91  strcmp(argv[1], "-u") == 0)) {
92  // Initialize TessdataManager with the data in the given traineddata file.
93  if (!tm.Init(argv[2])) {
94  tprintf("Failed to read %s\n", argv[2]);
95  return EXIT_FAILURE;
96  }
97  printf("Extracting tessdata components from %s\n", argv[2]);
98  if (strcmp(argv[1], "-e") == 0) {
99  for (i = 3; i < argc; ++i) {
100  errno = 0;
101  if (tm.ExtractToFile(argv[i])) {
102  printf("Wrote %s\n", argv[i]);
103  } else if (errno == 0) {
104  printf("Not extracting %s, since this component"
105  " is not present\n", argv[i]);
106  return EXIT_FAILURE;
107  } else {
108  printf("Error, could not extract %s: %s\n",
109  argv[i], strerror(errno));
110  return EXIT_FAILURE;
111  }
112  }
113  } else { // extract all the components
114  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
115  STRING filename = argv[3];
116  char* last = &argv[3][strlen(argv[3])-1];
117  if (*last != '.')
118  filename += '.';
119  filename += tesseract::kTessdataFileSuffixes[i];
120  errno = 0;
121  if (tm.ExtractToFile(filename.c_str())) {
122  printf("Wrote %s\n", filename.c_str());
123  } else if (errno != 0) {
124  printf("Error, could not extract %s: %s\n",
125  filename.c_str(), strerror(errno));
126  return EXIT_FAILURE;
127  }
128  }
129  }
130  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
131  // Rename the current traineddata file to a temporary name.
132  const char *new_traineddata_filename = argv[2];
133  STRING traineddata_filename = new_traineddata_filename;
134  traineddata_filename += ".__tmp__";
135  if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
136  tprintf("Failed to create a temporary file %s\n",
137  traineddata_filename.c_str());
138  return EXIT_FAILURE;
139  }
140 
141  // Initialize TessdataManager with the data in the given traineddata file.
142  tm.Init(traineddata_filename.c_str());
143 
144  // Write the updated traineddata file.
145  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
146  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
147  if (!tm.Init(argv[2])) {
148  tprintf("Failed to read %s\n", argv[2]);
149  return EXIT_FAILURE;
150  }
151  tesseract::TFile fp;
152  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
153  tprintf("No LSTM Component found in %s!\n", argv[2]);
154  return EXIT_FAILURE;
155  }
156  tesseract::LSTMRecognizer recognizer;
157  if (!recognizer.DeSerialize(&tm, &fp)) {
158  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
159  return EXIT_FAILURE;
160  }
161  recognizer.ConvertToInt();
162  GenericVector<char> lstm_data;
163  fp.OpenWrite(&lstm_data);
164  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
165  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
166  lstm_data.size());
167  if (!tm.SaveFile(argv[2], nullptr)) {
168  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
169  return EXIT_FAILURE;
170  }
171  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
172  // Initialize TessdataManager with the data in the given traineddata file.
173  tm.Init(argv[2]);
174  } else {
175  printf("Usage for combining tessdata components:\n"
176  " %s language_data_path_prefix\n"
177  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
178  printf("Usage for extracting tessdata components:\n"
179  " %s -e traineddata_file [output_component_file...]\n"
180  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
181  argv[0], argv[0]);
182  printf("Usage for overwriting tessdata components:\n"
183  " %s -o traineddata_file [input_component_file...]\n"
184  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
185  argv[0], argv[0]);
186  printf("Usage for unpacking all tessdata components:\n"
187  " %s -u traineddata_file output_path_prefix\n"
188  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
189  printf(
190  "Usage for listing directory of components:\n"
191  " %s -d traineddata_file\n",
192  argv[0]);
193  printf(
194  "Usage for compacting LSTM component to int:\n"
195  " %s -c traineddata_file\n",
196  argv[0]);
197  return 1;
198  }
199  tm.Directory();
200  return EXIT_SUCCESS;
201 }
tesseract::TessdataManager::OverwriteComponents
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
Definition: tessdatamanager.cpp:275
commontraining.h
tesseract::TessdataManager
Definition: tessdatamanager.h:126
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
STRING
Definition: strngs.h:45
tesseract::TessdataManager::Directory
void Directory() const
Definition: tessdatamanager.cpp:202
tesseract::TessdataManager::CombineDataFiles
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
Definition: tessdatamanager.cpp:244
tesseract::TessdataManager::OverwriteEntry
void OverwriteEntry(TessdataType type, const char *data, int size)
Definition: tessdatamanager.cpp:145
tesseract::LSTMRecognizer::DeSerialize
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
Definition: lstmrecognizer.cpp:108
last
LIST last(LIST var_list)
Definition: oldlist.cpp:151
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::LSTMRecognizer::ConvertToInt
void ConvertToInt()
Definition: lstmrecognizer.h:124
tesseract::TessdataManager::GetComponent
bool GetComponent(TessdataType type, TFile *fp)
Definition: tessdatamanager.cpp:216
tesseract::TFile
Definition: serialis.h:75
tesseract::LSTMRecognizer::Serialize
bool Serialize(const TessdataManager *mgr, TFile *fp) const
Definition: lstmrecognizer.cpp:89
lstmrecognizer.h
tesseract::TessBaseAPI::Version
static const char * Version()
Definition: baseapi.cpp:233
GenericVector< char >
tesseract::TESSDATA_NUM_ENTRIES
Definition: tessdatamanager.h:82
tesseract::LSTMRecognizer
Definition: lstmrecognizer.h:53
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::TESSDATA_LSTM
Definition: tessdatamanager.h:74
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
main
int main(int argc, char **argv)
Definition: combine_tessdata.cpp:68
tessdatamanager.h
tesseract::TessdataManager::SaveFile
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: tessdatamanager.cpp:153
tesseract::TessdataManager::ExtractToFile
bool ExtractToFile(const char *filename)
Definition: tessdatamanager.cpp:295