tesseract  5.0.0-alpha-619-ge9db
dawg_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <cstdlib> // for system
13 #include <fstream> // for ifstream
14 #include <set>
15 #include <string>
16 #include <vector>
17 
18 #include "ratngs.h"
19 #include "unicharset.h"
20 #include "trie.h"
21 
22 #include "include_gunit.h"
23 
24 namespace {
25 
26 // Test some basic functionality dealing with Dawgs (compressed dictionaries,
27 // aka Directed Acyclic Word Graphs).
28 class DawgTest : public testing::Test {
29  protected:
30  void SetUp() {
31  std::locale::global(std::locale(""));
32  }
33 
34  void LoadWordlist(const std::string& filename, std::set<std::string>* words) const {
35  std::ifstream file(filename);
36  if (file.is_open()) {
37  std::string line;
38  while (getline(file, line)) {
39  // Remove trailing line terminators from line.
40  while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
41  line.resize(line.size() - 1);
42  }
43  // Add line to set.
44  words->insert(line.c_str());
45  }
46  file.close();
47  }
48  }
49  std::string TessBinaryPath(const std::string& name) const {
50  return file::JoinPath(TESSBIN_DIR, "src/training/" + name);
51  }
52  std::string OutputNameToPath(const std::string& name) const {
53  return file::JoinPath(FLAGS_test_tmpdir, name);
54  }
55  int RunCommand(const std::string& program, const std::string& arg1,
56  const std::string& arg2, const std::string& arg3) const {
57  std::string cmdline =
58  TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
59  return system(cmdline.c_str());
60  }
61  // Test that we are able to convert a wordlist file (one "word" per line) to
62  // a dawg (a compressed format) and then extract the original wordlist back
63  // out using the tools "wordlist2dawg" and "dawg2wordlist."
64  void TestDawgRoundTrip(const std::string& unicharset_filename,
65  const std::string& wordlist_filename) const {
66  std::set<std::string> orig_words, roundtrip_words;
67  std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
68  std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
69  std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
70  std::string output_wordlist = OutputNameToPath(wordlist_filename);
71  LoadWordlist(orig_wordlist, &orig_words);
72  EXPECT_EQ(
73  RunCommand("wordlist2dawg", orig_wordlist, output_dawg, unicharset), 0);
74  EXPECT_EQ(
75  RunCommand("dawg2wordlist", unicharset, output_dawg, output_wordlist),
76  0);
77  LoadWordlist(output_wordlist, &roundtrip_words);
78  EXPECT_EQ(orig_words, roundtrip_words);
79  }
80 };
81 
82 TEST_F(DawgTest, TestDawgConversion) {
83  TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq");
84 }
85 
86 TEST_F(DawgTest, TestMatching) {
87  UNICHARSET unicharset;
88  unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str());
90  unicharset.size(), 0);
91  WERD_CHOICE space_apos(" '", unicharset);
92  trie.add_word_to_dawg(space_apos);
93 
94  WERD_CHOICE space(" ", unicharset);
95 
96  // partial match ok - then good!
97  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
98  // require complete match - not present.
99  EXPECT_FALSE(trie.word_in_dawg(space));
100  EXPECT_FALSE(trie.prefix_in_dawg(space, true));
101 
102  // partial or complete match ok for full word:
103  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
104  EXPECT_TRUE(trie.word_in_dawg(space_apos));
105  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
106 }
107 
108 } // namespace
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:43
string
std::string string
Definition: equationdetect_test.cc:21
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Trie
Definition: trie.h:54
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
ratngs.h
FLAGS_test_tmpdir
const char * FLAGS_test_tmpdir
Definition: include_gunit.h:20
unicharset.h
file
Definition: include_gunit.h:22
UNICHARSET
Definition: unicharset.h:145
NGRAM_PERM
Definition: ratngs.h:236
UNICHARSET::size
int size() const
Definition: unicharset.h:341
trie.h