All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data formatted as for training;
16 // the test data need not be the same as the training data.
17 // Author: Ray Smith
18 
19 #include <stdio.h>
20 #ifndef USE_STD_NAMESPACE
21 #include "base/commandlineflags.h"
22 #endif
23 #include "baseapi.h"
24 #include "commontraining.h"
25 #include "cubeclassifier.h"
26 #include "mastertrainer.h"
27 #include "params.h"
28 #include "strngs.h"
29 #include "tessclassifier.h"
30 
// Command-line flags for this tool. Each STRING_PARAM_FLAG(name, default, help)
// is read below as FLAGS_<name> (e.g. FLAGS_classifier.c_str()).
// NOTE(review): this listing skips source lines around here (numbering jumps
// 34 -> 36); FLAGS_T is used by InitializeClassifier but its declaration is
// not among the visible lines.
31 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
32 STRING_PARAM_FLAG(lang, "eng", "Language to test");
33 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
// debug_level is only declared here; presumably it is defined in another
// translation unit -- TODO confirm. It is read in main() as FLAGS_debug_level.
34 DECLARE_INT_PARAM_FLAG(debug_level);
36 
43 };
44 
// Command-line spellings of the classifiers. InitializeClassifier compares the
// requested name against names[c] for c in [0, CN_COUNT) and casts the matching
// index to ClassifierName, so the order here must match the enum's order
// (the enum body is not visible in this listing -- verify). The trailing NULL
// terminates the array; the visible code never reads that entry by sentinel.
45 const char* names[] = {"pruner", "full", "cube", "cubetess", NULL };
46 
// Creates the ShapeClassifier selected by classifer_name, together with the
// TessBaseAPI instance that backs it.
//   classifer_name: classifier to build; must equal one of the entries of
//                   names[] ("pruner", "full", "cube", "cubetess").
//   unicharset:     not referenced by any line of this function that is
//                   visible in this listing.
//   argc, argv:     forwarded to GetNextFilename() to enumerate config files
//                   when FLAGS_T is non-empty.
//   api:            output; *api is set to a newly allocated TessBaseAPI.
//                   (The parameter's declaration is on a line missing from
//                   this listing; main() passes &api.)
// Returns the new classifier, or NULL if the name is not recognised, Tesseract
// initialization fails, or the loaded engine has no ShapeTable.
// NOTE(review): on the NULL returns that follow `*api = new ...`, this function
// does not delete *api.
47 static tesseract::ShapeClassifier* InitializeClassifier(
48  const char* classifer_name, const UNICHARSET& unicharset,
49  int argc, char **argv,
51  // Decode the classifier string.
    // CN_COUNT doubles as the "no match" value for the lookup below.
52  ClassifierName classifier = CN_COUNT;
53  for (int c = 0; c < CN_COUNT; ++c) {
54  if (strcmp(classifer_name, names[c]) == 0) {
55  classifier = static_cast<ClassifierName>(c);
56  break;
57  }
58  }
59  if (classifier == CN_COUNT) {
    // NOTE(review): the message prints FLAGS_classifier rather than the
    // classifer_name argument that was actually looked up.
60  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
61  return NULL;
62  }
63 
64  // We need to initialize tesseract to test.
65  *api = new tesseract::TessBaseAPI;
    // NOTE(review): the listing skips source lines here (numbering jumps
    // 65 -> 67 -> 70). The declarations of engine_mode and tesseract, and the
    // body of the `if` below, are not visible.
67  if (classifier == CN_CUBE || classifier == CN_CUBETESS)
70  tesseract::Classify* classify = NULL;
    // This condition names all four classifier kinds handled further down.
71  if (classifier == CN_CUBE || classifier == CN_CUBETESS ||
72  classifier == CN_PRUNER || classifier == CN_FULL) {
    // cube_debug_level is set for every kind that enters this block, not only
    // for the cube-based ones.
73  (*api)->SetVariable("cube_debug_level", "2");
74  if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
75  engine_mode) < 0) {
76  fprintf(stderr, "Tesseract initialization failed!\n");
77  return NULL;
78  }
    // The accessor's result is const-qualified; the const is cast away so the
    // pointer can be handed to the classifier constructors below.
79  tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
    // NOTE(review): reinterpret_cast between the two class types; if Tesseract
    // derives from Classify a static_cast would express this -- TODO confirm.
80  classify = reinterpret_cast<tesseract::Classify*>(tesseract);
81  if (classify->shape_table() == NULL) {
82  fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
83  return NULL;
84  }
85  }
86  tesseract::ShapeClassifier* shape_classifier = NULL;
87 
    // When FLAGS_T is non-empty, every name returned by GetNextFilename() is
    // read as a config file into the API object.
88  if (!FLAGS_T.empty()) {
89  const char* config_name;
90  while ((config_name = GetNextFilename(argc, argv)) != NULL) {
91  tprintf("Reading config file %s ...\n", config_name);
92  (*api)->ReadConfigFile(config_name);
93  }
94  }
    // "pruner" and "full" both build a TessClassifier and differ only in the
    // boolean first argument; the cube variants take the Tesseract pointer.
95  if (classifier == CN_PRUNER) {
96  shape_classifier = new tesseract::TessClassifier(true, classify);
97  } else if (classifier == CN_FULL) {
98  shape_classifier = new tesseract::TessClassifier(false, classify);
99  } else if (classifier == CN_CUBE) {
100  shape_classifier = new tesseract::CubeClassifier(tesseract);
101  } else if (classifier == CN_CUBETESS) {
102  shape_classifier = new tesseract::CubeTessClassifier(tesseract);
103  } else {
    // Fallback for any ClassifierName value without a branch above.
104  fprintf(stderr, "%s tester not yet implemented\n", classifer_name);
105  return NULL;
106  }
107  tprintf("Testing classifier %s:\n", classifer_name);
108  return shape_classifier;
109 }
110 
111 // This program has complex setup requirements, so here is some help:
112 // Two different modes, tr files and serialized mastertrainer.
113 // From tr files:
114 // classifier_tester -U unicharset -F font_properties -X xheights
115 // -classifier x -lang lang [-output_trainer trainer] *.tr
116 // From a serialized trainer:
117 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
118 //
119 // In the first case, the unicharset must be the unicharset from within
120 // the classifier under test, and the font_properties and xheights files must
121 // match the files used during training.
122 // In the second case, the trainer file must have been prepared from
123 // some previous run of shapeclustering, mftraining, or classifier_tester
124 // using the same conditions as above, i.e. matching unicharset/font_properties.
125 //
126 // Available values of classifier (x above) are:
127 // pruner : Tesseract class pruner only.
128 // full : Tesseract full classifier.
129 // cube : Cube classifier. (Not possible with an input trainer.)
130 // cubetess : Tesseract class pruner with rescoring by Cube. (Not possible
131 // with an input trainer.)
// Entry point. Parses the command line, loads the training data, builds the
// classifier named by FLAGS_classifier, runs it over the samples, and frees
// the classifier, the API object and the trainer.
// Returns 0 on success, or 1 if the classifier could not be initialized.
132 int main(int argc, char **argv) {
133  ParseArguments(&argc, &argv);
134  STRING file_prefix;
    // NOTE(review): the listing skips source lines here (numbering jumps
    // 134 -> 136 -> 138). The line below is the tail of a call whose start is
    // not visible -- presumably the LoadTrainingData() call that produces
    // `trainer`; the declarations of `trainer` and `api` are also not visible.
136  argc, argv, false, NULL, &file_prefix);
138  // Decode the classifier string.
139  tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
140  FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
141  if (shape_classifier == NULL) {
    // NOTE(review): this path returns without deleting trainer or api.
142  fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
143  return 1;
144  }
145 
146  // We want to test junk as well if it is available.
147  // trainer->IncludeJunk();
148  // We want to test with replicated samples too.
    // NOTE(review): source lines are missing after the comment above and
    // before the argument list below (numbering jumps 148 -> 150 -> 152).
    // The arguments are the tail of a call whose start is not visible --
    // presumably TestClassifierOnSamples(); MAX(3, FLAGS_debug_level) passes
    // a value of at least 3 as its second visible-signature argument.
150 
152  MAX(3, FLAGS_debug_level), false,
153  shape_classifier, NULL);
154  delete shape_classifier;
155  delete api;
156  delete trainer;
157 
158  return 0;
159 } /* main */
160 
161 
162 
163 
164 
165 
#define MAX(x, y)
Definition: ndminx.h:24
ClassifierName
const ShapeTable * shape_table() const
Definition: classify.h:69
#define tprintf(...)
Definition: tprintf.h:31
STRING_PARAM_FLAG(classifier,"","Classifier to test")
int main(int argc, char **argv)
struct TessBaseAPI TessBaseAPI
Definition: capi.h:67
const char * GetNextFilename(int argc, const char *const *argv)
void ReplicateAndRandomizeSamplesIfRequired()
void ParseArguments(int *argc, char ***argv)
const UNICHARSET & unicharset() const
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
const char * names[]
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
DECLARE_STRING_PARAM_FLAG(T)
Definition: strngs.h:44
#define NULL
Definition: host.h:144
DECLARE_INT_PARAM_FLAG(debug_level)