tesseract  5.0.0-alpha-619-ge9db
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data as formatted for training,
16 // but doesn't have to be the same as the training data.
17 // Author: Ray Smith
18 
19 #include <algorithm>
20 #include <cstdio>
21 #ifdef GOOGLE_TESSERACT
22 #include "base/commandlineflags.h"
23 #endif // GOOGLE_TESSERACT
24 #include <tesseract/baseapi.h>
25 #include "commontraining.h"
26 #include "mastertrainer.h"
27 #include "params.h"
28 #include <tesseract/strngs.h>
29 #include "tessclassifier.h"
30 #include "tesseractclass.h"
31 
// Command-line string flags for this tool, declared with
// STRING_PARAM_FLAG(name, val, comment). They are read below as
// FLAGS_classifier, FLAGS_lang and FLAGS_tessdata_dir.
32 static STRING_PARAM_FLAG(classifier, "", "Classifier to test");
33 static STRING_PARAM_FLAG(lang, "eng", "Language to test");
34 static STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
35 
40 };
41 
// Command-line spellings of the classifiers. InitializeClassifier indexes this
// array with every value below CN_COUNT and casts the matching index to
// ClassifierName, so the entries must stay in enum order with one per value.
42 static const char* names[] = {"pruner", "full"};
43 
// Builds the shape classifier selected by classifer_name.
//
// classifer_name is matched against names[]; if nothing matches, an error is
// printed and nullptr is returned. Otherwise a new TessBaseAPI is stored in
// *api and initialized from FLAGS_tessdata_dir / FLAGS_lang, and a new
// TessClassifier wrapping its Classify object is returned (first constructor
// argument true for CN_PRUNER, false for CN_FULL).
//
// Returns nullptr if the name is unknown, if Init fails, or if the classifier
// has no shape table. In the latter two cases *api has already been allocated
// and is not freed here, so the caller is responsible for it.
//
// unicharset, argc and argv are not referenced in the lines visible here.
// NOTE(review): listing lines 63-64 are not shown in this view; the
// declarations of `tesseract` and `engine_mode` used below are presumably there.
44 static tesseract::ShapeClassifier* InitializeClassifier(
45  const char* classifer_name, const UNICHARSET& unicharset,
46  int argc, char **argv,
47  tesseract::TessBaseAPI** api) {
48  // Decode the classifier string.
  // CN_COUNT doubles as the "not found" sentinel.
49  ClassifierName classifier = CN_COUNT;
50  for (int c = 0; c < CN_COUNT; ++c) {
51  if (strcmp(classifer_name, names[c]) == 0) {
52  classifier = static_cast<ClassifierName>(c);
53  break;
54  }
55  }
56  if (classifier == CN_COUNT) {
  // Note: the message prints FLAGS_classifier, not the classifer_name argument.
57  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
58  return nullptr;
59  }
60 
61  // We need to initialize tesseract to test.
62  *api = new tesseract::TessBaseAPI;
65  tesseract::Classify* classify = nullptr;
66  if (
67  classifier == CN_PRUNER || classifier == CN_FULL) {
  // Init reports failure with a negative return value.
68  if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
69  engine_mode) < 0) {
70  fprintf(stderr, "Tesseract initialization failed!\n");
71  return nullptr;
72  }
  // The API's tesseract() result is cast to non-const and then to Classify.
73  tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
74  classify = static_cast<tesseract::Classify*>(tesseract);
75  if (classify->shape_table() == nullptr) {
76  fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
77  return nullptr;
78  }
79  }
80  tesseract::ShapeClassifier* shape_classifier = nullptr;
81 
  // Both variants use TessClassifier; only the boolean first argument differs.
82  if (classifier == CN_PRUNER) {
83  shape_classifier = new tesseract::TessClassifier(true, classify);
84  } else if (classifier == CN_FULL) {
85  shape_classifier = new tesseract::TessClassifier(false, classify);
86  }
87  tprintf("Testing classifier %s:\n", classifer_name);
88  return shape_classifier;
89 }
90 
91 // This program has complex setup requirements, so here is some help:
92 // Two different modes, tr files and serialized mastertrainer.
93 // From tr files:
94 // classifier_tester -U unicharset -F font_properties -X xheights
95 // -classifier x -lang lang [-output_trainer trainer] *.tr
96 // From a serialized trainer:
97 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
98 //
99 // In the first case, the unicharset must be the unicharset from within
100 // the classifier under test, and the font_properties and xheights files must
101 // match the files used during training.
102 // In the second case, the trainer file must have been prepared from
103 // some previous run of shapeclustering, mftraining, or classifier_tester
104 // using the same conditions as above, ie matching unicharset/font_properties.
105 //
106 // Available values of classifier (x above) are:
107 // pruner : Tesseract class pruner only.
108 // full : Tesseract full classifier.
109 //
110 int main(int argc, char **argv) {
  // Flow: parse flags, load the training data, build the classifier named by
  // FLAGS_classifier, then delete the classifier, API and trainer and return 0.
  // Returns 1 if the classifier could not be created.
111  tesseract::CheckSharedLibraryVersion();
112  ParseArguments(&argc, &argv);
113  STRING file_prefix;
  // Training data is loaded with replication off (false) and no shape table
  // requested (nullptr).
114  tesseract::MasterTrainer* trainer =
115  tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix);
  // NOTE(review): listing line 116 is not shown in this view; `api`, used
  // below, is presumably declared there.
117  // Decode the classifier string.
  // trainer is dereferenced here without a null check.
  // NOTE(review): confirm LoadTrainingData cannot return nullptr.
118  tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
119  FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
120  if (shape_classifier == nullptr) {
121  fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
  // Early exit: trainer is not deleted on this path, nor is any API object
  // that InitializeClassifier may have allocated.
122  return 1;
123  }
124 
125  // We want to test junk as well if it is available.
126  // trainer->IncludeJunk();
127  // We want to test with replicated samples too.
129 
  // NOTE(review): listing lines 128 and 130 are not shown in this view. The
  // next two lines are the trailing arguments of a call that starts on a
  // missing line (presumably trainer->TestClassifierOnSamples). The level
  // argument is max(3, FLAGS_debug_level), so it is never below 3.
131  std::max(3, static_cast<int>(FLAGS_debug_level)), false,
132  shape_classifier, nullptr);
133  delete shape_classifier;
134  delete api;
135  delete trainer;
136 
137  return 0;
138 } /* main */
strngs.h
commontraining.h
STRING_PARAM_FLAG
#define STRING_PARAM_FLAG(name, val, comment)
Definition: commandlineflags.h:37
tesseractclass.h
params.h
tesseract::Tesseract
Definition: tesseractclass.h:172
tesseract::TessClassifier
Definition: tessclassifier.h:36
STRING
Definition: strngs.h:45
tesseract::Classify
Definition: classify.h:103
mastertrainer.h
CN_PRUNER
Definition: classifier_tester.cpp:37
tesseract::OcrEngineMode
OcrEngineMode
Definition: publictypes.h:265
tesseract::ShapeClassifier
Definition: shapeclassifier.h:43
baseapi.h
CN_FULL
Definition: classifier_tester.cpp:38
tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:186
tesseract::TessBaseAPI
Definition: baseapi.h:98
UNICHARSET
Definition: unicharset.h:145
tesseract
Definition: baseapi.h:65
tesseract::MasterTrainer
Definition: mastertrainer.h:69
tessclassifier.h
tesseract::LoadTrainingData
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:211
ParseArguments
void ParseArguments(int *argc, char ***argv)
Definition: commontraining.cpp:122
tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired
void ReplicateAndRandomizeSamplesIfRequired()
Definition: mastertrainer.cpp:320
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
TessBaseAPI
struct TessBaseAPI TessBaseAPI
Definition: capi.h:72
CN_COUNT
Definition: classifier_tester.cpp:39
tesseract::OEM_TESSERACT_ONLY
Definition: publictypes.h:266
tesseract::CT_UNICHAR_TOP1_ERR
Definition: errorcounter.h:74
ClassifierName
ClassifierName
Definition: classifier_tester.cpp:36
tesseract::MasterTrainer::TestClassifierOnSamples
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
Definition: mastertrainer.cpp:760
main
int main(int argc, char **argv)
Definition: classifier_tester.cpp:110