tesseract  5.0.0-alpha-619-ge9db
baseapi_thread_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 // Unit test to run Tesseract instances in parallel threads and verify
13 // the OCR result.
14 
15 // Note that success of running this test as-is does NOT verify
16 // thread-safety. For that, you need to run this binary under TSAN using the
17 // associated baseapi_thread_test_with_tsan.sh script.
18 //
19 // The tests are partitioned by instance to allow running Tesseract/Cube/both
20 // and by stage to run initialization/recognition/both. See flag descriptions
21 // for details.
22 
23 #include <functional>
24 #include <memory>
25 #include <string>
26 #include <tensorflow/core/lib/core/threadpool.h>
27 #include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace
28 #include "allheaders.h"
29 #include "include_gunit.h"
30 #include <tesseract/baseapi.h>
31 #include "commandlineflags.h"
32 #include "log.h"
33 
34 // Run with Tesseract instances.
35 BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances");
36 // Run with Cube instances.
37 // Note that with TSAN, Cube typically takes much longer to test. Ignoring
38 // std::string operations using the associated tess_tsan.ignore file when
39 // testing Cube significantly reduces testing time.
40 BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances");
41 
42 // When used with TSAN, having more repetitions can help in finding hidden
43 // thread-safety violations at the expense of increased testing time.
44 INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.");
45 
46 INT_PARAM_FLAG(max_concurrent_instances, 0,
47  "Maximum number of instances to run in parallel at any given "
48  "instant. The number of concurrent instances cannot exceed "
49  "reps * number_of_langs_tested, which is also the default value.");
50 
52 
53 namespace {
54 
// Parallel, nullptr-terminated tables. Entry i of the three tables of a group
// describes one test case: language code, image file name and the text that
// OCR is expected to produce for that image. Both the strings and the pointer
// tables are const so that the test data cannot be modified by accident.

// Test cases used when --test_tesseract is set.
static const char* const kTessLangs[] = {"eng", "vie", nullptr};
static const char* const kTessImages[] = {"HelloGoogle.tif", "viet.tif",
                                          nullptr};
// The second entry is UTF-8 for "ti", U+1EBF, "ng".
static const char* const kTessTruthText[] = {
    "Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", nullptr};

// Test cases used when --test_cube is set.
static const char* const kCubeLangs[] = {"hin", "ara", nullptr};
static const char* const kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr};
// UTF-8 for U+0930 U+093E U+091C (Devanagari) and
// U+0627 U+0644 U+0639 U+0631 U+0628 U+064A (Arabic).
static const char* const kCubeTruthText[] = {
    "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
    "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
65 
66 class BaseapiThreadTest : public ::testing::Test {
67  protected:
68  static void SetUpTestCase() {
69  CHECK(FLAGS_test_tesseract || FLAGS_test_cube)
70  << "Need to test at least one of Tesseract/Cube!";
71  // Form a list of langs/gt_text/image_files we will work with.
72  std::vector<std::string> image_files;
73  if (FLAGS_test_tesseract) {
74  int i = 0;
75  while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) {
76  langs_.push_back(kTessLangs[i]);
77  gt_text_.push_back(kTessTruthText[i]);
78  image_files.push_back(kTessImages[i]);
79  ++i;
80  }
81  LOG(INFO) << "Testing Tesseract on " << i << " languages.";
82  }
83  if (FLAGS_test_cube) {
84  int i = 0;
85  while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) {
86  langs_.push_back(kCubeLangs[i]);
87  gt_text_.push_back(kCubeTruthText[i]);
88  image_files.push_back(kCubeImages[i]);
89  ++i;
90  }
91  LOG(INFO) << "Testing Cube on " << i << " languages.";
92  }
93  num_langs_ = langs_.size();
94 
95  // Pre-load the images into an array. We will be making multiple copies of
96  // an image here if FLAGS_reps > 1 and that is intentional. In this test, we
97  // wish to not make any assumptions about the thread-safety of Pix objects,
98  // and so entirely disallow concurrent access of a Pix instance.
99  const int n = num_langs_ * FLAGS_reps;
100  for (int i = 0; i < n; ++i) {
101  std::string path = TESTING_DIR "/" + image_files[i % num_langs_];
102  Pix* new_pix = pixRead(path.c_str());
103  QCHECK(new_pix != nullptr) << "Could not read " << path;
104  pix_.push_back(new_pix);
105  }
106 
107  pool_size_ = (FLAGS_max_concurrent_instances < 1)
108  ? num_langs_ * FLAGS_reps
109  : FLAGS_max_concurrent_instances;
110  }
111 
112  static void TearDownTestCase() {
113  for (auto& pix : pix_) {
114  pixDestroy(&pix);
115  }
116  }
117 
118  void ResetPool() {
119  pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
120  }
121 
122  void WaitForPoolWorkers() { pool_.reset(nullptr); }
123 
124  std::unique_ptr<tensorflow::thread::ThreadPool> pool_;
125  static int pool_size_;
126  static std::vector<Pix*> pix_;
127  static std::vector<std::string> langs_;
128  static std::vector<std::string> gt_text_;
129  static int num_langs_;
130 };
131 
132 // static member variable declarations.
133 int BaseapiThreadTest::pool_size_;
134 std::vector<Pix*> BaseapiThreadTest::pix_;
135 std::vector<std::string> BaseapiThreadTest::langs_;
136 std::vector<std::string> BaseapiThreadTest::gt_text_;
137 int BaseapiThreadTest::num_langs_;
138 
139 static void InitTessInstance(TessBaseAPI* tess, const std::string& lang) {
140  CHECK(tess != nullptr);
141  EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str()));
142 }
143 
144 static void GetCleanedText(TessBaseAPI* tess, Pix* pix, std::string* ocr_text) {
145  tess->SetImage(pix);
146  char* result = tess->GetUTF8Text();
147  *ocr_text = result;
148  delete[] result;
149  absl::StripAsciiWhitespace(ocr_text);
150 }
151 
152 static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lang,
153  const std::string& expected_text) {
154  TessBaseAPI* tess_local = nullptr;
155  if (tess) {
156  tess_local = tess;
157  } else {
158  tess_local = new TessBaseAPI;
159  InitTessInstance(tess_local, lang);
160  }
161  std::string ocr_text;
162  GetCleanedText(tess_local, pix, &ocr_text);
163  EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str());
164  if (tess_local != tess) delete tess_local;
165 }
166 
167 // Check that Tesseract/Cube produce the correct results in single-threaded
168 // operation. If not, it is pointless to run the real multi-threaded tests.
169 TEST_F(BaseapiThreadTest, TestBasicSanity) {
170  for (int i = 0; i < num_langs_; ++i) {
171  TessBaseAPI tess;
172  InitTessInstance(&tess, langs_[i]);
173  std::string ocr_text;
174  GetCleanedText(&tess, pix_[i], &ocr_text);
175  CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0)
176  << "Failed with lang = " << langs_[i];
177  }
178 }
179 
180 // Test concurrent instance initialization.
181 TEST_F(BaseapiThreadTest, TestInit) {
182  const int n = num_langs_ * FLAGS_reps;
183  ResetPool();
184  std::vector<TessBaseAPI> tess(n);
185  for (int i = 0; i < n; ++i) {
186  pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_]));
187  }
188  WaitForPoolWorkers();
189 }
190 
191 // Test concurrent recognition.
192 TEST_F(BaseapiThreadTest, TestRecognition) {
193  const int n = num_langs_ * FLAGS_reps;
194  std::vector<TessBaseAPI> tess(n);
195  // Initialize api instances in a single thread.
196  for (int i = 0; i < n; ++i) {
197  InitTessInstance(&tess[i], langs_[i % num_langs_]);
198  }
199 
200  ResetPool();
201  for (int i = 0; i < n; ++i) {
202  pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i],
203  langs_[i % num_langs_], gt_text_[i % num_langs_]));
204  }
205  WaitForPoolWorkers();
206 }
207 
208 TEST_F(BaseapiThreadTest, TestAll) {
209  const int n = num_langs_ * FLAGS_reps;
210  ResetPool();
211  for (int i = 0; i < n; ++i) {
212  pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i],
213  langs_[i % num_langs_], gt_text_[i % num_langs_]));
214  }
215  WaitForPoolWorkers();
216 }
217 } // namespace
string
std::string string
Definition: equationdetect_test.cc:21
INFO
Definition: log.h:29
BOOL_PARAM_FLAG
BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances")
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
tesseract::TessBaseAPI::Init
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:337
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
baseapi.h
tesseract::TessBaseAPI
Definition: baseapi.h:98
INT_PARAM_FLAG
INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.")
commandlineflags.h
log.h
LOG
Definition: cleanapi_test.cc:19
TessBaseAPI
struct TessBaseAPI TessBaseAPI
Definition: capi.h:72
tesseract::TessBaseAPI::GetUTF8Text
char * GetUTF8Text()
Definition: baseapi.cpp:1348
tesseract::TessBaseAPI::SetImage
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:571