tesseract  5.0.0-alpha-619-ge9db
tatweel_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #if defined(_WIN32)
13 #include <io.h> // for _access
14 #else
15 #include <unistd.h> // for access
16 #endif
17 
18 #include "include_gunit.h"
19 #include "dawg.h"
20 #include "trie.h"
21 #include "unicharset.h"
22 #include "util/utf8/unicodetext.h" // for UnicodeText
23 
24 namespace {
25 
// Reports whether `filename` can be found on the file system.
// Stand-in for std::filesystem::exists, which needs C++17.
static bool file_exists(const char* filename) {
#if defined(_WIN32)
  // Mode 0 asks _access() for an existence check only.
  const int status = _access(filename, 0);
#else
  const int status = access(filename, F_OK);
#endif
  return status == 0;
}
34 
35 class TatweelTest : public ::testing::Test {
36  protected:
37  void SetUp() override {
38  static std::locale system_locale("");
39  std::locale::global(system_locale);
40  }
41 
42  TatweelTest() {
43  std::string filename = TestDataNameToPath("ara.wordlist");
44  if (file_exists(filename.c_str())) {
45  std::string wordlist(u8"\u0640");
46  CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
47  // Put all the unicodes in the unicharset_.
48  UnicodeText text;
49  text.PointToUTF8(wordlist.data(), wordlist.size());
50  int num_tatweel = 0;
51  for (auto it = text.begin(); it != text.end(); ++it) {
52  std::string utf8 = it.get_utf8_string();
53  if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel;
54  unicharset_.unichar_insert(utf8.c_str());
55  }
56  LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
57  EXPECT_GT(num_tatweel, 0);
58  }
59  }
60 
61  std::string TestDataNameToPath(const std::string& name) {
62  return file::JoinPath(TESTDATA_DIR, name);
63  }
65 };
66 
67 TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
68  // This test verifies that the unicharset ignores the Tatweel character.
69  for (int i = 0; i < unicharset_.size(); ++i) {
70  const char* utf8 = unicharset_.id_to_unichar(i);
71  EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr);
72  }
73 }
74 
75 TEST_F(TatweelTest, DictIgnoresTatweel) {
76  // This test verifies that the dictionary ignores the Tatweel character.
78  unicharset_.size(), 0);
79  std::string filename = TestDataNameToPath("ara.wordlist");
80  if (!file_exists(filename.c_str())) {
81  LOG(INFO) << "Skip test because of missing " << filename;
82  GTEST_SKIP();
83  } else {
84  EXPECT_TRUE(trie.read_and_add_word_list(
85  filename.c_str(), unicharset_,
87  EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
88  }
89 }
90 
91 TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
92  // This test verifies that a load of an existing unicharset keeps any
93  // existing tatweel for backwards compatibility.
94  std::string filename = TestDataNameToPath("ara.unicharset");
95  if (!file_exists(filename.c_str())) {
96  LOG(INFO) << "Skip test because of missing " << filename;
97  GTEST_SKIP();
98  } else {
99  EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));
100  int num_tatweel = 0;
101  for (int i = 0; i < unicharset_.size(); ++i) {
102  const char* utf8 = unicharset_.id_to_unichar(i);
103  if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel;
104  }
105  LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
106  EXPECT_EQ(num_tatweel, 4);
107  }
108 }
109 
110 } // namespace
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
INFO
Definition: log.h:29
CHECK_OK
#define CHECK_OK(test)
Definition: include_gunit.h:62
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Trie
Definition: trie.h:54
file::Defaults
static int Defaults()
Definition: include_gunit.h:39
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
file::GetContents
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:31
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
unicodetext.h
unicharset.h
dawg.h
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
UNICHARSET
Definition: unicharset.h:145
UnicodeText
Definition: unicodetext.h:116
unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:167
UnicodeText::PointToUTF8
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:256
UnicodeText::end
const_iterator end() const
Definition: unicodetext.cc:412
LOG
Definition: cleanapi_test.cc:19
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341
trie.h
UnicodeText::begin
const_iterator begin() const
Definition: unicodetext.cc:408