tesseract  4.0.0-1-g2a2b
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data as formatted for training,
16 // but doesn't have to be the same as the training data.
17 // Author: Ray Smith
18 
19 #include <algorithm>
20 #include <cstdio>
21 #ifdef GOOGLE_TESSERACT
22 #include "base/commandlineflags.h"
23 #endif // GOOGLE_TESSERACT
24 #include "baseapi.h"
25 #include "commontraining.h"
26 #include "mastertrainer.h"
27 #include "params.h"
28 #include "strngs.h"
29 #include "tessclassifier.h"
30 #include "tesseractclass.h"
31 
// Command-line flags used by this tool.
// classifier: name of the classifier to build; main passes it to
// InitializeClassifier, which matches it against the names[] table.
32 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
// lang: language handed to TessBaseAPI::Init (defaults to "eng").
33 STRING_PARAM_FLAG(lang, "eng", "Language to test");
// tessdata_dir: data path handed to TessBaseAPI::Init.
34 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
// debug_level is only declared here (presumably defined in another
// translation unit -- TODO confirm); main uses it to pick the report level.
35 DECLARE_INT_PARAM_FLAG(debug_level);
36 
41 };
42 
// Command-line spellings of the available classifiers.
// InitializeClassifier indexes this array with c in [0, CN_COUNT) and casts
// the matching index to ClassifierName, so the entries have to stay in the
// same order as that enum's values (the enum body is not visible in this
// listing -- verify against the original file).
43 static const char* names[] = {"pruner", "full"};
44 
// Builds the ShapeClassifier selected by classifer_name.
//
// Parameters:
//   classifer_name  Name to look up in names[] ("pruner" or "full").
//   unicharset      Not referenced in the lines shown here.
//   argc, argv      Not referenced in the lines shown here.
//   api             Out-parameter: receives a newly allocated TessBaseAPI.
//
// Returns a newly allocated tesseract::TessClassifier, or nullptr when the
// name is not in names[], when TessBaseAPI::Init reports failure, or when the
// initialized engine has no shape table.
//
// Ownership: this function never deletes *api or the returned classifier; the
// caller (main) deletes both. *api is assigned before Init is called, so it
// stays allocated on the two later failure paths. On the invalid-name path
// *api is not written at all.
//
// NOTE(review): `engine_mode` and `tesseract` are used below but their
// declarations are not visible in this listing (listing lines 64-65 are
// absent) -- confirm against the original file.
45 static tesseract::ShapeClassifier* InitializeClassifier(
46  const char* classifer_name, const UNICHARSET& unicharset,
47  int argc, char **argv,
48  tesseract::TessBaseAPI** api) {
49  // Decode the classifier string.
// CN_COUNT doubles as the "no match" sentinel for the lookup below.
50  ClassifierName classifier = CN_COUNT;
51  for (int c = 0; c < CN_COUNT; ++c) {
52  if (strcmp(classifer_name, names[c]) == 0) {
53  classifier = static_cast<ClassifierName>(c);
54  break;
55  }
56  }
57  if (classifier == CN_COUNT) {
// NOTE(review): the message prints FLAGS_classifier rather than the
// classifer_name argument that was actually compared; the visible caller
// passes FLAGS_classifier.c_str(), so the two agree for that call.
58  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
59  return nullptr;
60  }
61 
62  // We need to initialize tesseract to test.
63  *api = new tesseract::TessBaseAPI;
66  tesseract::Classify* classify = nullptr;
// Both known classifier kinds take this branch, so after the sentinel check
// above `classify` is always set from the initialized engine before use.
67  if (
68  classifier == CN_PRUNER || classifier == CN_FULL) {
// A negative return from Init is treated as failure.
69  if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
70  engine_mode) < 0) {
71  fprintf(stderr, "Tesseract initialization failed!\n");
72  return nullptr;
73  }
// The const_cast drops constness from the pointer returned by
// (*api)->tesseract() so it can be held as a non-const Classify*.
74  tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
75  classify = static_cast<tesseract::Classify*>(tesseract);
76  if (classify->shape_table() == nullptr) {
77  fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
78  return nullptr;
79  }
80  }
81  tesseract::ShapeClassifier* shape_classifier = nullptr;
82 
// The first TessClassifier argument is true for "pruner" and false for
// "full" (presumably a pruner-only switch -- TODO confirm in tessclassifier.h).
83  if (classifier == CN_PRUNER) {
84  shape_classifier = new tesseract::TessClassifier(true, classify);
85  } else if (classifier == CN_FULL) {
86  shape_classifier = new tesseract::TessClassifier(false, classify);
87  }
88  tprintf("Testing classifier %s:\n", classifer_name);
89  return shape_classifier;
90 }
91 
92 // This program has complex setup requirements, so here is some help:
93 // Two different modes, tr files and serialized mastertrainer.
94 // From tr files:
95 // classifier_tester -U unicharset -F font_properties -X xheights
96 // -classifier x -lang lang [-output_trainer trainer] *.tr
97 // From a serialized trainer:
98 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
99 //
100 // In the first case, the unicharset must be the unicharset from within
101 // the classifier under test, and the font_properties and xheights files must
102 // match the files used during training.
103 // In the second case, the trainer file must have been prepared from
104 // some previous run of shapeclustering, mftraining, or classifier_tester
105 // using the same conditions as above, ie matching unicharset/font_properties.
106 //
107 // Available values of classifier (x above) are:
108 // pruner : Tesseract class pruner only.
109 // full : Tesseract full classifier.
// Program entry point: parses the command line, loads the training data into
// a MasterTrainer, builds the classifier named by FLAGS_classifier, runs the
// trainer's classifier test, then deletes the classifier, the TessBaseAPI and
// the trainer.
//
// Returns 0 after the test has run, or 1 when InitializeClassifier returns
// nullptr (that early return deletes nothing).
//
// NOTE(review): several statements are absent from this listing (listing
// lines 117, 129 and 131): the declaration of `api` and the start of the test
// call are not visible -- confirm against the original file.
111 int main(int argc, char **argv) {
112  tesseract::CheckSharedLibraryVersion();
113  ParseArguments(&argc, &argv);
114  STRING file_prefix;
// Arguments 3 and 4 are LoadTrainingData's `replication` (false) and
// `shape_table` (nullptr) parameters.
// NOTE(review): `trainer` is dereferenced below without a null check --
// confirm LoadTrainingData cannot return nullptr.
115  tesseract::MasterTrainer* trainer =
116  tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix);
118  // Build the classifier selected by FLAGS_classifier.
119  tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
120  FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
121  if (shape_classifier == nullptr) {
122  fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
123  return 1;
124  }
125 
126  // We want to test junk as well if it is available.
127  // trainer->IncludeJunk();
128  // We want to test with replicated samples too.
130 
// The two lines below are the tail of a call whose first line is missing from
// this listing (presumably trainer->TestClassifierOnSamples -- TODO confirm).
// The first visible argument is at least 3, raised to FLAGS_debug_level when
// that is larger.
132  std::max(3, static_cast<int>(FLAGS_debug_level)), false,
133  shape_classifier, nullptr);
134  delete shape_classifier;
135  delete api;
136  delete trainer;
137 
138  return 0;
139 } /* main */
ClassifierName
const UNICHARSET & unicharset() const
struct TessBaseAPI TessBaseAPI
Definition: capi.h:89
STRING_PARAM_FLAG(classifier, "", "Classifier to test")
void ParseArguments(int *argc, char ***argv)
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
int main(int argc, char **argv)
void ReplicateAndRandomizeSamplesIfRequired()
DECLARE_INT_PARAM_FLAG(debug_level)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
const ShapeTable * shape_table() const
Definition: classify.h:111
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)