tesseract  5.0.0-alpha-619-ge9db
unicharset_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <string>
13 #include "log.h" // for LOG
14 #include "unicharset.h"
15 #include "gmock/gmock.h" // for testing::ElementsAreArray
16 #include "include_gunit.h"
17 
18 using testing::ElementsAreArray;
19 
20 namespace {
21 
22 class UnicharsetTest : public ::testing::Test {
23  protected:
24  void SetUp() override {
25  std::locale::global(std::locale(""));
26  }
27 };
28 
29 TEST(UnicharsetTest, Basics) {
30  // This test verifies basic insertion, unichar_to_id, and encode.
31  UNICHARSET u;
32  u.unichar_insert("a");
33  EXPECT_EQ(u.size(), 4);
34  u.unichar_insert("f");
35  EXPECT_EQ(u.size(), 5);
36  u.unichar_insert("i");
37  EXPECT_EQ(u.size(), 6);
38  // The fi ligature is NOT added because it can be encoded with a cleanup as f
39  // then i.
40  u.unichar_insert("\ufb01");
41  EXPECT_EQ(u.size(), 6);
42  u.unichar_insert("e");
43  EXPECT_EQ(u.size(), 7);
44  u.unichar_insert("n");
45  EXPECT_EQ(u.size(), 8);
46  EXPECT_EQ(u.unichar_to_id("f"), 4);
47  EXPECT_EQ(u.unichar_to_id("i"), 5);
48  // The fi ligature has no valid id.
49  EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID);
50  // The fi pair has no valid id.
51  EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID);
52  GenericVector<int> labels;
53  EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr));
54  std::vector<int> v(&labels[0], &labels[0] + labels.size());
55  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
56  // With the fi ligature encoding fails without a pre-cleanup.
57  std::string lig_str = "af\ufb01ne";
58  EXPECT_FALSE(
59  u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
60  lig_str = u.CleanupString(lig_str.c_str());
61  EXPECT_TRUE(
62  u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
63  v = std::vector<int>(&labels[0], &labels[0] + labels.size());
64  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
65 }
66 
67 TEST(UnicharsetTest, Multibyte) {
68  // This test verifies basic insertion, unichar_to_id, and encode.
69  // The difference from Basic above is that now we are testing multi-byte
70  // unicodes instead of single byte.
71  UNICHARSET u;
72  // Insert some Arabic letters.
73  u.unichar_insert("\u0627");
74  EXPECT_EQ(u.size(), 4);
75  u.unichar_insert("\u062c");
76  EXPECT_EQ(u.size(), 5);
77  u.unichar_insert("\u062f");
78  EXPECT_EQ(u.size(), 6);
79  u.unichar_insert("\ufb01"); // fi ligature is added as fi pair.
80  EXPECT_EQ(u.size(), 7);
81  u.unichar_insert("\u062b");
82  EXPECT_EQ(u.size(), 8);
83  u.unichar_insert("\u0635");
84  EXPECT_EQ(u.size(), 9);
85  EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
86  EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
87  // The first two bytes of this string is \u0627, which matches id 3;
88  EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
89  EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
90  // Individual f and i are not present, but they are there as a pair.
91  EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID);
92  EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID);
93  EXPECT_EQ(u.unichar_to_id("fi"), 6);
94  // The fi ligature is findable.
95  EXPECT_EQ(u.unichar_to_id("\ufb01"), 6);
96  GenericVector<int> labels;
97  EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true,
98  &labels, nullptr, nullptr));
99  std::vector<int> v(&labels[0], &labels[0] + labels.size());
100  EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7}));
101  // With the fi ligature the fi is picked out.
102  GenericVector<char> lengths;
103  int encoded_length;
104  std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
105  // src_str has to be pre-cleaned for lengths to be correct.
106  std::string cleaned = u.CleanupString(src_str.c_str());
107  EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths,
108  &encoded_length));
109  EXPECT_EQ(encoded_length, cleaned.size());
110  std::string len_str(&lengths[0], lengths.size());
111  EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
112  v = std::vector<int>(&labels[0], &labels[0] + labels.size());
113  EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));
114 }
115 
116 TEST(UnicharsetTest, MultibyteBigrams) {
117  // This test verifies basic insertion, unichar_to_id, and encode.
118  // The difference from Basic above is that now we are testing multi-byte
119  // unicodes instead of single byte.
120  UNICHARSET u;
121  // Insert some Arabic letters.
122  u.unichar_insert("\u0c9c");
123  EXPECT_EQ(u.size(), 4);
124  u.unichar_insert("\u0cad");
125  EXPECT_EQ(u.size(), 5);
126  u.unichar_insert("\u0ccd\u0c9c");
127  EXPECT_EQ(u.size(), 6);
128  u.unichar_insert("\u0ccd");
129  EXPECT_EQ(u.size(), 7);
130  // By default the encodable bigram is NOT added.
131  u.unichar_insert("\u0ccd\u0cad");
132  EXPECT_EQ(u.size(), 7);
133  // It is added if we force it to be.
134  u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue);
135  EXPECT_EQ(u.size(), 8);
136  GenericVector<char> data;
137  tesseract::TFile fp;
138  fp.OpenWrite(&data);
139  u.save_to_file(&fp);
140  fp.Open(&data[0], data.size());
141  UNICHARSET v;
142  v.load_from_file(&fp, false);
143  EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3);
144  EXPECT_EQ(v.unichar_to_id("\u0cad"), 4);
145  EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5);
146  EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6);
147  EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7);
148 }
149 
150 TEST(UnicharsetTest, OldStyle) {
151  // This test verifies an old unicharset that contains fi/fl ligatures loads
152  // and keeps all the entries.
153  std::string filename =
154  file::JoinPath(TESTDATA_DIR, "eng.unicharset");
155  UNICHARSET u;
156  LOG(INFO) << "Filename=" << filename;
157  EXPECT_TRUE(u.load_from_file(filename.c_str()));
158  EXPECT_EQ(u.size(), 111);
159 }
160 
161 } // namespace
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
INFO
Definition: log.h:29
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
OldUncleanUnichars::kTrue
include_gunit.h
tesseract::TFile::Open
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:210
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
unicharset.h
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::TFile
Definition: serialis.h:75
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
GenericVector< int >
log.h
LOG
Definition: cleanapi_test.cc:19
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341