tesseract  5.0.0-alpha-619-ge9db
tessdatamanager.h
Go to the documentation of this file.
1 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 
23 #include <tesseract/strngs.h> // for STRING
24 
// Suffix literal "traineddata". Being `static` at namespace scope in a header,
// each translation unit that includes this file gets its own copy.
// NOTE(review): presumably the extension of the combined data file — confirm.
25 static const char kTrainedDataSuffix[] = "traineddata";
26 
27 // One suffix constant per tessdata component, gathered in kTessdataFileSuffixes below.
28 // When adding new tessdata types and file suffixes, please make sure to update the TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
29 static const char kLangConfigFileSuffix[] = "config";
30 static const char kUnicharsetFileSuffix[] = "unicharset";
31 static const char kAmbigsFileSuffix[] = "unicharambigs";
32 static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
33 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
34 static const char kNormProtoFileSuffix[] = "normproto";
35 static const char kPuncDawgFileSuffix[] = "punc-dawg";
36 static const char kSystemDawgFileSuffix[] = "word-dawg";
37 static const char kNumberDawgFileSuffix[] = "number-dawg";
38 static const char kFreqDawgFileSuffix[] = "freq-dawg";
39 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
40 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
41 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
42 static const char kShapeTableFileSuffix[] = "shapetable";
43 static const char kBigramDawgFileSuffix[] = "bigram-dawg";
44 static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
45 static const char kParamsModelFileSuffix[] = "params-model";
46 static const char kLSTMModelFileSuffix[] = "lstm";
47 static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
48 static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
49 static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
50 static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
51 static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
52 static const char kVersionFileSuffix[] = "version";
53 
54 namespace tesseract {
55 
67  TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
68  TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
69  TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
81 
83 };
84 
// Table of the component file suffixes declared above, 24 entries (0-23).
// The trailing number on each line is the entry's position; entries 10-12
// carry the same number and "deprecated" marker as
// TESSDATA_FIXED_LENGTH_DAWGS, TESSDATA_CUBE_UNICHARSET and
// TESSDATA_CUBE_SYSTEM_DAWG in the TessdataType enum, so the order here is
// expected to mirror that enum. Update both together when adding a type.
89 static const char *const kTessdataFileSuffixes[] = {
90  kLangConfigFileSuffix, // 0
91  kUnicharsetFileSuffix, // 1
92  kAmbigsFileSuffix, // 2
93  kBuiltInTemplatesFileSuffix, // 3
94  kBuiltInCutoffsFileSuffix, // 4
95  kNormProtoFileSuffix, // 5
96  kPuncDawgFileSuffix, // 6
97  kSystemDawgFileSuffix, // 7
98  kNumberDawgFileSuffix, // 8
99  kFreqDawgFileSuffix, // 9
100  kFixedLengthDawgsFileSuffix, // 10 // deprecated
101  kCubeUnicharsetFileSuffix, // 11 // deprecated
102  kCubeSystemDawgFileSuffix, // 12 // deprecated
103  kShapeTableFileSuffix, // 13
104  kBigramDawgFileSuffix, // 14
105  kUnambigDawgFileSuffix, // 15
106  kParamsModelFileSuffix, // 16
107  kLSTMModelFileSuffix, // 17
108  kLSTMPuncDawgFileSuffix, // 18
109  kLSTMSystemDawgFileSuffix, // 19
110  kLSTMNumberDawgFileSuffix, // 20
111  kLSTMUnicharsetFileSuffix, // 21
112  kLSTMRecoderFileSuffix, // 22
113  kVersionFileSuffix, // 23
114 };
115 
// NOTE(review): nothing in this header uses this limit; presumably an upper
// bound on the number of entries accepted from a data file — confirm against
// tessdatamanager.cpp.
123 static const int kMaxNumTessdataEntries = 1000;
124 
125 
127  public:
128  TessdataManager();
129  explicit TessdataManager(FileReader reader);
130 
131  ~TessdataManager() = default;
132 
133  bool swap() const { return swap_; }  // Returns swap_: true if the bytes need swapping.
134  bool is_loaded() const { return is_loaded_; }  // Returns is_loaded_: true if the file has been loaded.
135 
136  // Lazily loads from the given filename. Won't actually read the file
137  // until it needs it.
138  void LoadFileLater(const char *data_file_name);
143  bool Init(const char *data_file_name);
144  // Loads from the given memory buffer as if a file, remembering name as some
145  // arbitrary source id for caching.
146  bool LoadMemBuffer(const char *name, const char *data, int size);
147  // Overwrites a single entry of the given type.
148  void OverwriteEntry(TessdataType type, const char *data, int size);
149 
150  // Saves to the given filename.
151  bool SaveFile(const STRING &filename, FileWriter writer) const;
152  // Serializes to the given vector.
153  void Serialize(GenericVector<char> *data) const;
154  // Resets to the initial state, keeping the reader.
155  void Clear();
156 
157  // Prints a directory of contents.
158  void Directory() const;
159 
160  // Returns true if the component requested is present.
162  return !entries_[type].empty();
163  }
164  // Opens the given TFile pointer to the given component type.
165  // Returns false in case of failure.
166  bool GetComponent(TessdataType type, TFile *fp);
167  // As non-const version except it can't load the component if not already
168  // loaded.
169  bool GetComponent(TessdataType type, TFile *fp) const;
170 
171  // Returns the current version string.
172  std::string VersionString() const;
173  // Sets the version string to the given v_str.
174  void SetVersionString(const std::string &v_str);
175 
176  // Returns true if the base Tesseract components are present, i.e. the TESSDATA_UNICHARSET and TESSDATA_INTTEMP entries are both non-empty.
177  bool IsBaseAvailable() const {
178  return !entries_[TESSDATA_UNICHARSET].empty() &&
179  !entries_[TESSDATA_INTTEMP].empty();
180  }
181 
182  // Returns true if the LSTM components are present, i.e. the TESSDATA_LSTM entry is non-empty.
183  bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); }
184 
185  // Returns the name of the underlying data file, as a const reference to the member data_file_name_.
186  const STRING &GetDataFileName() const { return data_file_name_; }
187 
193  bool CombineDataFiles(const char *language_data_path_prefix,
194  const char *output_filename);
195 
201  bool OverwriteComponents(const char *new_traineddata_filename,
202  char **component_filenames,
203  int num_new_components);
204 
215  bool ExtractToFile(const char *filename);
216 
217  private:
218 
219  // Use libarchive.
220  bool LoadArchiveFile(const char *filename);
221 
228  static bool TessdataTypeFromFileSuffix(const char *suffix,
229  TessdataType *type);
230 
235  static bool TessdataTypeFromFileName(const char *filename,
236  TessdataType *type);
237 
238  // Name of file it came from.
239  STRING data_file_name_;
240  // Function to load the file when we need it.
241  FileReader reader_;
242  // True if the file has been loaded.
243  bool is_loaded_;
244  // True if the bytes need swapping.
245  bool swap_;
246  // Contents of each element of the traineddata file.
248 };
249 
250 } // namespace tesseract
251 
252 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
string
std::string string
Definition: equationdetect_test.cc:21
strngs.h
tesseract::FileWriter
bool(*)(const GenericVector< char > &data, const char *filename) FileWriter
Definition: serialis.h:51
tesseract::TessdataManager::OverwriteComponents
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
Definition: tessdatamanager.cpp:275
tesseract::TESSDATA_CUBE_SYSTEM_DAWG
Definition: tessdatamanager.h:69
tesseract::TessdataManager::~TessdataManager
~TessdataManager()=default
tesseract::TessdataManager::SetVersionString
void SetVersionString(const std::string &v_str)
Definition: tessdatamanager.cpp:239
tesseract::TESSDATA_SYSTEM_DAWG
Definition: tessdatamanager.h:64
tesseract::TessdataManager
Definition: tessdatamanager.h:126
tesseract::TESSDATA_BIGRAM_DAWG
Definition: tessdatamanager.h:71
tesseract::TESSDATA_PARAMS_MODEL
Definition: tessdatamanager.h:73
tesseract::TessdataManager::VersionString
std::string VersionString() const
Definition: tessdatamanager.cpp:233
STRING
Definition: strngs.h:45
tesseract::FileReader
bool(*)(const char *filename, GenericVector< char > *data) FileReader
Definition: serialis.h:47
tesseract::TESSDATA_CUBE_UNICHARSET
Definition: tessdatamanager.h:68
tesseract::TessdataManager::Directory
void Directory() const
Definition: tessdatamanager.cpp:202
tesseract::TessdataManager::IsLSTMAvailable
bool IsLSTMAvailable() const
Definition: tessdatamanager.h:183
tesseract::TESSDATA_LSTM_SYSTEM_DAWG
Definition: tessdatamanager.h:76
tesseract::TESSDATA_PUNC_DAWG
Definition: tessdatamanager.h:63
tesseract::TESSDATA_SHAPE_TABLE
Definition: tessdatamanager.h:70
tesseract::TESSDATA_UNICHARSET
Definition: tessdatamanager.h:58
tesseract::TessdataManager::CombineDataFiles
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
Definition: tessdatamanager.cpp:244
tesseract::TESSDATA_INTTEMP
Definition: tessdatamanager.h:60
tesseract::TessdataManager::OverwriteEntry
void OverwriteEntry(TessdataType type, const char *data, int size)
Definition: tessdatamanager.cpp:145
genericvector.h
tesseract::TESSDATA_VERSION
Definition: tessdatamanager.h:80
tesseract::TessdataManager::TessdataManager
TessdataManager()
Definition: tessdatamanager.cpp:42
tesseract::TessdataManager::GetComponent
bool GetComponent(TessdataType type, TFile *fp)
Definition: tessdatamanager.cpp:216
tesseract::TESSDATA_LSTM_NUMBER_DAWG
Definition: tessdatamanager.h:77
tesseract::TESSDATA_LSTM_RECODER
Definition: tessdatamanager.h:79
tesseract::TessdataManager::is_loaded
bool is_loaded() const
Definition: tessdatamanager.h:134
tesseract::TFile
Definition: serialis.h:75
tesseract::TessdataManager::LoadMemBuffer
bool LoadMemBuffer(const char *name, const char *data, int size)
Definition: tessdatamanager.cpp:111
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::TESSDATA_PFFMTABLE
Definition: tessdatamanager.h:61
tesseract::TessdataManager::IsComponentAvailable
bool IsComponentAvailable(TessdataType type) const
Definition: tessdatamanager.h:161
tesseract::TESSDATA_AMBIGS
Definition: tessdatamanager.h:59
tesseract
Definition: baseapi.h:65
tesseract::TessdataType
TessdataType
Definition: tessdatamanager.h:56
GenericVector< char >
tesseract::TESSDATA_LANG_CONFIG
Definition: tessdatamanager.h:57
tesseract::TESSDATA_UNAMBIG_DAWG
Definition: tessdatamanager.h:72
tesseract::TESSDATA_LSTM_PUNC_DAWG
Definition: tessdatamanager.h:75
tesseract::TESSDATA_NUM_ENTRIES
Definition: tessdatamanager.h:82
tesseract::TessdataManager::swap
bool swap() const
Definition: tessdatamanager.h:133
tesseract::TessdataManager::Init
bool Init(const char *data_file_name)
Definition: tessdatamanager.cpp:97
tesseract::TessdataManager::Clear
void Clear()
Definition: tessdatamanager.cpp:194
tesseract::TessdataManager::LoadFileLater
void LoadFileLater(const char *data_file_name)
Definition: tessdatamanager.cpp:55
tesseract::TessdataManager::GetDataFileName
const STRING & GetDataFileName() const
Definition: tessdatamanager.h:186
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::TessdataManager::Serialize
void Serialize(GenericVector< char > *data) const
Definition: tessdatamanager.cpp:166
tesseract::TESSDATA_LSTM
Definition: tessdatamanager.h:74
tesseract::TESSDATA_LSTM_UNICHARSET
Definition: tessdatamanager.h:78
tesseract::TESSDATA_NUMBER_DAWG
Definition: tessdatamanager.h:65
tesseract::TessdataManager::IsBaseAvailable
bool IsBaseAvailable() const
Definition: tessdatamanager.h:177
tesseract::TESSDATA_FIXED_LENGTH_DAWGS
Definition: tessdatamanager.h:67
tesseract::TessdataManager::SaveFile
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: tessdatamanager.cpp:153
tesseract::TessdataManager::ExtractToFile
bool ExtractToFile(const char *filename)
Definition: tessdatamanager.cpp:295
tesseract::TESSDATA_NORMPROTO
Definition: tessdatamanager.h:62
tesseract::TESSDATA_FREQ_DAWG
Definition: tessdatamanager.h:66