tesseract  4.0.0-1-g2a2b
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  * Created: Tue Jan 07 15:21:46 GMT 1992
8  *
9  * (C) Copyright 1992, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "basedir.h"
28 #include "tessvars.h"
29 #include "control.h"
30 #include "reject.h"
31 #include "pageres.h"
32 #include "pgedit.h"
33 #include "tprintf.h"
34 #include "tessedit.h"
35 #include "stopper.h"
36 #ifndef DISABLED_LEGACY_ENGINE
37 #include "intmatcher.h"
38 #include "chop.h"
39 #endif
40 #include "globals.h"
41 #ifndef ANDROID_BUILD
42 #include "lstmrecognizer.h"
43 #endif
44 #include "tesseractclass.h"
45 #include "params.h"
46 #ifdef DISABLED_LEGACY_ENGINE
47 #include "matchdefs.h"
48 #endif
49 
50  // config under api
51 #define API_CONFIG "configs/api_config"
52 
53 ETEXT_DESC *global_monitor = nullptr; // progress monitor
54 
55 namespace tesseract {
56 
57 // Read a "config" file containing a set of variable, value pairs.
58 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
59 // and also accepts a relative or absolute path name.
60 void Tesseract::read_config_file(const char *filename,
61  SetParamConstraint constraint) {
62  STRING path = datadir;
63  path += "configs/";
64  path += filename;
65  FILE* fp;
66  if ((fp = fopen(path.string(), "rb")) != nullptr) {
67  fclose(fp);
68  } else {
69  path = datadir;
70  path += "tessconfigs/";
71  path += filename;
72  if ((fp = fopen(path.string(), "rb")) != nullptr) {
73  fclose(fp);
74  } else {
75  path = filename;
76  }
77  }
78  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
79 }
80 
81 // Returns false if a unicharset file for the specified language was not found
82 // or was invalid.
83 // This function initializes TessdataManager. After TessdataManager is
84 // no longer needed, TessdataManager::End() should be called.
85 //
86 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
87 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
88 // from the language-specific config file (stored in [lang].traineddata), from
89 // the config files specified on the command line or left as the default
90 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
92  const char *arg0, const char *textbase, const char *language,
93  OcrEngineMode oem, char **configs, int configs_size,
94  const GenericVector<STRING> *vars_vec,
95  const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
96  TessdataManager *mgr) {
97  // Set the basename, compute the data directory.
98  main_setup(arg0, textbase);
99 
100  // Set the language data path prefix
101  lang = language != nullptr ? language : "eng";
105 
106  // Initialize TessdataManager.
107  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
108  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
109  tprintf("Error opening data file %s\n", tessdata_path.string());
110  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set"
111  " to your \"tessdata\" directory.\n");
112  return false;
113  }
114 #ifndef DISABLED_LEGACY_ENGINE
115  if (oem == OEM_DEFAULT) {
116  // Set the engine mode from availability, which can then be overridden by
117  // the config file when we read it below.
118  if (!mgr->IsLSTMAvailable()) {
120  } else if (!mgr->IsBaseAvailable()) {
122  } else {
124  }
125  }
126 #endif // ndef DISABLED_LEGACY_ENGINE
127 
128  // If a language specific config file (lang.config) exists, load it in.
129  TFile fp;
130  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
132  this->params());
133  }
134 
135  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
137  // Load tesseract variables from config files. This is done after loading
138  // language-specific variables from [lang].traineddata file, so that custom
139  // config files can override values in [lang].traineddata file.
140  for (int i = 0; i < configs_size; ++i) {
141  read_config_file(configs[i], set_params_constraint);
142  }
143 
144  // Set params specified in vars_vec (done after setting params from config
145  // files, so that params in vars_vec can override those from files).
146  if (vars_vec != nullptr && vars_values != nullptr) {
147  for (int i = 0; i < vars_vec->size(); ++i) {
148  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
149  (*vars_values)[i].string(),
150  set_params_constraint, this->params())) {
151  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
152  exit(1);
153  }
154  }
155  }
156 
157  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
158  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
159  if (params_file != nullptr) {
160  ParamUtils::PrintParams(params_file, this->params());
161  fclose(params_file);
162  } else {
163  tprintf("Failed to open %s for writing params.\n",
165  }
166  }
167 
168  // Determine which ocr engine(s) should be loaded and used for recognition.
169  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
170 
171  // If we are only loading the config file (and so not planning on doing any
172  // recognition) then there's nothing else to do here.
174  return true;
175  }
176 
177 // The various OcrEngineMode settings (see publictypes.h) determine which
178 // engine-specific data files need to be loaded.
179 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
180 #ifndef ANDROID_BUILD
181 #ifdef DISABLED_LEGACY_ENGINE
183 #else
186 #endif // ndef DISABLED_LEGACY_ENGINE
188  lstm_recognizer_ = new LSTMRecognizer;
189  ASSERT_HOST(
190  lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr));
191  } else {
192  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
194  }
195  }
196 #endif // ndef ANDROID_BUILD
197 
198  // Load the unicharset
200  // Avoid requiring a unicharset when we aren't running base tesseract.
201 #ifndef ANDROID_BUILD
202  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
203 #endif // ndef ANDROID_BUILD
204  }
205 #ifndef DISABLED_LEGACY_ENGINE
206  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
207  !unicharset.load_from_file(&fp, false)) {
208  return false;
209  }
210 #endif // ndef DISABLED_LEGACY_ENGINE
211  if (unicharset.size() > MAX_NUM_CLASSES) {
212  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
213  return false;
214  }
215  right_to_left_ = unicharset.major_right_to_left();
216 
217  // Setup initial unichar ambigs table and read universal ambigs.
218  UNICHARSET encoder_unicharset;
219  encoder_unicharset.CopyFrom(unicharset);
221  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
222 
224  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
227  }
228 #ifndef DISABLED_LEGACY_ENGINE
229  // Init ParamsModel.
230  // Load pass1 and pass2 weights (for now these two sets are the same, but in
231  // the future separate sets of weights can be generated).
232  for (int p = ParamsModel::PTRAIN_PASS1;
234  language_model_->getParamsModel().SetPass(
235  static_cast<ParamsModel::PassEnum>(p));
236  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
237  if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
238  return false;
239  }
240  }
241  }
242 #endif // ndef DISABLED_LEGACY_ENGINE
243 
244  return true;
245 }
246 
247 // Helper returns true if the given string is in the vector of strings.
248 static bool IsStrInList(const STRING& str,
249  const GenericVector<STRING>& str_list) {
250  for (int i = 0; i < str_list.size(); ++i) {
251  if (str_list[i] == str)
252  return true;
253  }
254  return false;
255 }
256 
257 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
258 // Langs with no prefix get appended to to_load, provided they
259 // are not in there already.
260 // Langs with ~ prefix get appended to not_to_load, provided they are not in
261 // there already.
262 void Tesseract::ParseLanguageString(const char* lang_str,
263  GenericVector<STRING>* to_load,
264  GenericVector<STRING>* not_to_load) {
265  STRING remains(lang_str);
266  while (remains.length() > 0) {
267  // Find the start of the lang code and which vector to add to.
268  const char* start = remains.string();
269  while (*start == '+')
270  ++start;
271  GenericVector<STRING>* target = to_load;
272  if (*start == '~') {
273  target = not_to_load;
274  ++start;
275  }
276  // Find the index of the end of the lang code in string start.
277  int end = strlen(start);
278  const char* plus = strchr(start, '+');
279  if (plus != nullptr && plus - start < end)
280  end = plus - start;
281  STRING lang_code(start);
282  lang_code.truncate_at(end);
283  STRING next(start + end);
284  remains = next;
285  // Check whether lang_code is already in the target vector and add.
286  if (!IsStrInList(lang_code, *target)) {
287  target->push_back(lang_code);
288  }
289  }
290 }
291 
292 // Initialize for potentially a set of languages defined by the language
293 // string and recursively any additional languages required by any language
294 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
295 // See init_tesseract_internal for args.
296 int Tesseract::init_tesseract(const char *arg0, const char *textbase,
297  const char *language, OcrEngineMode oem,
298  char **configs, int configs_size,
299  const GenericVector<STRING> *vars_vec,
300  const GenericVector<STRING> *vars_values,
301  bool set_only_non_debug_params,
302  TessdataManager *mgr) {
303  GenericVector<STRING> langs_to_load;
304  GenericVector<STRING> langs_not_to_load;
305  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
306 
307  sub_langs_.delete_data_pointers();
308  sub_langs_.clear();
309  // Find the first loadable lang and load into this.
310  // Add any languages that this language requires
311  bool loaded_primary = false;
312  // Load the rest into sub_langs_.
313  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
314  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
315  const char *lang_str = langs_to_load[lang_index].string();
316  Tesseract *tess_to_init;
317  if (!loaded_primary) {
318  tess_to_init = this;
319  } else {
320  tess_to_init = new Tesseract;
321  }
322 
323  int result = tess_to_init->init_tesseract_internal(
324  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
325  vars_values, set_only_non_debug_params, mgr);
326  // Forget that language, but keep any reader we were given.
327  mgr->Clear();
328 
329  if (!loaded_primary) {
330  if (result < 0) {
331  tprintf("Failed loading language '%s'\n", lang_str);
332  } else {
333  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
334  &langs_to_load, &langs_not_to_load);
335  loaded_primary = true;
336  }
337  } else {
338  if (result < 0) {
339  tprintf("Failed loading language '%s'\n", lang_str);
340  delete tess_to_init;
341  } else {
342  sub_langs_.push_back(tess_to_init);
343  // Add any languages that this language requires
344  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
345  &langs_to_load, &langs_not_to_load);
346  }
347  }
348  }
349  }
350  if (!loaded_primary) {
351  tprintf("Tesseract couldn't load any languages!\n");
352  return -1; // Couldn't load any language!
353  }
354 #ifndef DISABLED_LEGACY_ENGINE
355  if (!sub_langs_.empty()) {
356  // In multilingual mode word ratings have to be directly comparable,
357  // so use the same language model weights for all languages:
358  // use the primary language's params model if
359  // tessedit_use_primary_params_model is set,
360  // otherwise use default language model weights.
362  for (int s = 0; s < sub_langs_.size(); ++s) {
363  sub_langs_[s]->language_model_->getParamsModel().Copy(
364  this->language_model_->getParamsModel());
365  }
366  tprintf("Using params model of the primary language\n");
367  } else {
368  this->language_model_->getParamsModel().Clear();
369  for (int s = 0; s < sub_langs_.size(); ++s) {
370  sub_langs_[s]->language_model_->getParamsModel().Clear();
371  }
372  }
373  }
374 
376 #endif // ndef DISABLED_LEGACY_ENGINE
377  return 0;
378 }
379 
380 // Common initialization for a single language.
381 // arg0 is the datapath for the tessdata directory, which could be the
382 // path of the tessdata directory with no trailing /, or (if tessdata
383 // lives in the same directory as the executable) the path of the executable,
384 // hence the name arg0.
385 // textbase is an optional output file basename (used only for training)
386 // language is the language code to load.
387 // oem controls which engine(s) will operate on the image
388 // configs (argv) is an array of config filenames to load variables from.
389 // May be nullptr.
390 // configs_size (argc) is the number of elements in configs.
391 // vars_vec is an optional vector of variables to set.
392 // vars_values is an optional corresponding vector of values for the variables
393 // in vars_vec.
394 // If set_only_init_params is true, then only the initialization variables
395 // will be set.
396 int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
397  const char *language, OcrEngineMode oem,
398  char **configs, int configs_size,
399  const GenericVector<STRING> *vars_vec,
400  const GenericVector<STRING> *vars_values,
401  bool set_only_non_debug_params,
402  TessdataManager *mgr) {
403  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
404  configs_size, vars_vec, vars_values,
405  set_only_non_debug_params, mgr)) {
406  return -1;
407  }
409  return 0;
410  }
411  // If only LSTM will be used, skip loading Tesseract classifier's
412  // pre-trained templates and dictionary.
414  program_editup(textbase, init_tesseract ? mgr : nullptr,
415  init_tesseract ? mgr : nullptr);
416  return 0; //Normal exit
417 }
418 
419 #ifndef DISABLED_LEGACY_ENGINE
420 
421 // Helper builds the all_fonts table by adding new fonts from new_fonts.
422 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
423  UnicityTable<FontInfo>* all_fonts) {
424  for (int i = 0; i < new_fonts.size(); ++i) {
425  // UnicityTable uniques as we go.
426  all_fonts->push_back(new_fonts.get(i));
427  }
428 }
429 
430 // Helper assigns an id to lang_fonts using the index in all_fonts table.
431 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
432  UnicityTable<FontInfo>* lang_fonts) {
433  for (int i = 0; i < lang_fonts->size(); ++i) {
434  int index = all_fonts.get_id(lang_fonts->get(i));
435  lang_fonts->get_mutable(i)->universal_id = index;
436  }
437 }
438 
439 // Set the universal_id member of each font to be unique among all
440 // instances of the same font loaded.
442  // Note that we can get away with bitwise copying FontInfo in
443  // all_fonts, as it is a temporary structure and we avoid setting the
444  // delete callback.
445  UnicityTable<FontInfo> all_fonts;
447 
448  // Create the universal ID table.
449  CollectFonts(get_fontinfo_table(), &all_fonts);
450  for (int i = 0; i < sub_langs_.size(); ++i) {
451  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
452  }
453  // Assign ids from the table to each font table.
454  AssignIds(all_fonts, &get_fontinfo_table());
455  for (int i = 0; i < sub_langs_.size(); ++i) {
456  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
457  }
458  font_table_size_ = all_fonts.size();
459 }
460 
461 // init the LM component
462 int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
463  const char *language, TessdataManager *mgr) {
464  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
465  nullptr, 0, nullptr, nullptr, false, mgr))
466  return -1;
468  getDict().Load(lang, mgr);
469  getDict().FinishLoad();
470  return 0;
471 }
472 
473 #endif // ndef DISABLED_LEGACY_ENGINE
474 
476  end_recog();
477 }
478 
479 /* Define command type identifiers */
480 
482 {
487 };
488 } // namespace tesseract
const UNICHARSET & GetUnicharset() const
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
int size() const
Definition: genericvector.h:71
char * tessedit_write_params_to_file
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:171
bool GetComponent(TessdataType type, TFile *fp)
Dict & getDict() override
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:447
T * get_mutable(int id)
const char * string() const
Definition: strngs.cpp:196
bool use_ambigs_for_adaption
Definition: ccutil.h:88
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:89
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:462
STRING language_data_path_prefix
Definition: ccutil.h:67
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
ETEXT_DESC * global_monitor
Definition: tessedit.cpp:53
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
int size() const
Definition: unicharset.h:336
bool major_right_to_left() const
Definition: unicharset.cpp:962
int ambigs_debug_level
Definition: ccutil.h:84
SetParamConstraint
Definition: params.h:36
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:296
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
void SetupUniversalFontIds()
Definition: tessedit.cpp:441
UNICHARSET unicharset
Definition: ccutil.h:68
STRING lang
Definition: ccutil.h:66
int size() const
Return the size used.
bool Load(const char *lang, TessdataManager *mgr)
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:262
bool Init(const char *data_file_name)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
int get_id(T object) const
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
bool FinishLoad()
Definition: dict.cpp:323
ParamsVectors * params()
Definition: ccutil.h:62
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:396
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:60
void truncate_at(int32_t index)
Definition: strngs.cpp:267
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40
int push_back(T object)
Add an element in the table.
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
int push_back(T object)
Definition: strngs.h:45
const T & get(int id) const
Return the object from an id.
STRING datadir
Definition: ccutil.h:64
int32_t universal_id
Definition: fontinfo.h:123
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:49
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:48
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
int32_t length() const
Definition: strngs.cpp:191
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool IsComponentAvailable(TessdataType type) const