tesseract  5.0.0-alpha-619-ge9db
dawg.cpp
Go to the documentation of this file.
1 /********************************************************************************
2  *
3  * File: dawg.cpp (Formerly dawg.c)
4  * Description: Use a Directed Acyclic Word Graph
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *********************************************************************************/
19 /*----------------------------------------------------------------------
20  I n c l u d e s
21 ----------------------------------------------------------------------*/
22 
23 #include "dawg.h"
24 
25 #include "dict.h"
26 #include <tesseract/helpers.h>
27 #include <tesseract/strngs.h>
28 #include "tprintf.h"
29 
30 #include <memory>
31 
32 /*----------------------------------------------------------------------
33  F u n c t i o n s f o r D a w g
34 ----------------------------------------------------------------------*/
35 namespace tesseract {
36 
37 // Destructor.
38 // It is defined here, so the compiler can create a single vtable
39 // instead of weak vtables in every compilation unit.
40 Dawg::~Dawg() = default;
41 
42 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
43  bool requires_complete) const {
44  if (word.length() == 0) return !requires_complete;
45  NODE_REF node = 0;
46  int end_index = word.length() - 1;
47  for (int i = 0; i < end_index; i++) {
48  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
49  if (edge == NO_EDGE) {
50  return false;
51  }
52  if ((node = next_node(edge)) == 0) {
53  // This only happens if all words following this edge terminate --
54  // there are no larger words. See Trie::add_word_to_dawg()
55  return false;
56  }
57  }
58  // Now check the last character.
59  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
60  NO_EDGE;
61 }
62 
63 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
64  return prefix_in_dawg(word, true);
65 }
66 
67 int Dawg::check_for_words(const char *filename,
68  const UNICHARSET &unicharset,
69  bool enable_wildcard) const {
70  if (filename == nullptr) return 0;
71 
72  FILE *word_file;
73  char string [CHARS_PER_LINE];
74  int misses = 0;
75  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
76 
77  word_file = fopen(filename, "r");
78  if (word_file == nullptr) {
79  tprintf("Error: Could not open file %s\n", filename);
80  ASSERT_HOST(word_file);
81  }
82 
83  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
84  chomp_string(string); // remove newline
85  WERD_CHOICE word(string, unicharset);
86  if (word.length() > 0 &&
87  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
88  if (!match_words(&word, 0, 0,
89  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
90  tprintf("Missing word: %s\n", string);
91  ++misses;
92  }
93  } else {
94  tprintf("Failed to create a valid word from %s\n", string);
95  }
96  }
97  fclose (word_file);
98  // Make sure the user sees this with fprintf instead of tprintf.
99  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
100  return misses;
101 }
102 
103 void Dawg::iterate_words(const UNICHARSET& unicharset,
104  std::function<void(const WERD_CHOICE*)> cb) const {
105  WERD_CHOICE word(&unicharset);
106  iterate_words_rec(word, 0, cb);
107 }
108 
109 static void CallWithUTF8(std::function<void(const char*)> cb,
110  const WERD_CHOICE* wc) {
111  STRING s;
112  wc->string_and_lengths(&s, nullptr);
113  cb(s.c_str());
114 }
115 
116 void Dawg::iterate_words(const UNICHARSET& unicharset,
117  std::function<void(const char*)> cb) const {
118  using namespace std::placeholders; // for _1
119  std::function<void(const WERD_CHOICE*)> shim(
120  std::bind(CallWithUTF8, cb, _1));
121  WERD_CHOICE word(&unicharset);
122  iterate_words_rec(word, 0, shim);
123 }
124 
125 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
126  NODE_REF to_explore,
127  std::function<void(const WERD_CHOICE*)> cb) const {
128  NodeChildVector children;
129  this->unichar_ids_of(to_explore, &children, false);
130  for (int i = 0; i < children.size(); i++) {
131  WERD_CHOICE next_word(word_so_far);
132  next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
133  if (this->end_of_word(children[i].edge_ref)) {
134  cb(&next_word);
135  }
136  NODE_REF next = next_node(children[i].edge_ref);
137  if (next != 0) {
138  iterate_words_rec(next_word, next, cb);
139  }
140  }
141 }
142 
143 bool Dawg::match_words(WERD_CHOICE *word, int32_t index,
144  NODE_REF node, UNICHAR_ID wildcard) const {
145  EDGE_REF edge;
146  int32_t word_end;
147 
148  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
149  bool any_matched = false;
150  NodeChildVector vec;
151  this->unichar_ids_of(node, &vec, false);
152  for (int i = 0; i < vec.size(); ++i) {
153  word->set_unichar_id(vec[i].unichar_id, index);
154  if (match_words(word, index, node, wildcard))
155  any_matched = true;
156  }
157  word->set_unichar_id(wildcard, index);
158  return any_matched;
159  } else {
160  word_end = index == word->length() - 1;
161  edge = edge_char_of(node, word->unichar_id(index), word_end);
162  if (edge != NO_EDGE) { // normal edge in DAWG
163  node = next_node(edge);
164  if (word_end) {
165  if (debug_level_ > 1) word->print("match_words() found: ");
166  return true;
167  } else if (node != 0) {
168  return match_words(word, index+1, node, wildcard);
169  }
170  }
171  }
172  return false;
173 }
174 
175 void Dawg::init(int unicharset_size) {
176  ASSERT_HOST(unicharset_size > 0);
177  unicharset_size_ = unicharset_size;
178  // Set bit masks. We will use the value unicharset_size_ as a null char, so
179  // the actual number of unichars is unicharset_size_ + 1.
180  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
182  letter_mask_ = ~(~0ull << flag_start_bit_);
185 }
186 
187 
188 /*----------------------------------------------------------------------
189  F u n c t i o n s f o r S q u i s h e d D a w g
190 ----------------------------------------------------------------------*/
191 
192 SquishedDawg::~SquishedDawg() { delete[] edges_; }
193 
195  UNICHAR_ID unichar_id,
196  bool word_end) const {
197  EDGE_REF edge = node;
198  if (node == 0) { // binary search
199  EDGE_REF start = 0;
200  EDGE_REF end = num_forward_edges_in_node0 - 1;
201  int compare;
202  while (start <= end) {
203  edge = (start + end) >> 1; // (start + end) / 2
204  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
205  unichar_id, edges_[edge]);
206  if (compare == 0) { // given == vec[k]
207  return edge;
208  } else if (compare == 1) { // given > vec[k]
209  start = edge + 1;
210  } else { // given < vec[k]
211  end = edge - 1;
212  }
213  }
214  } else { // linear search
215  if (edge != NO_EDGE && edge_occupied(edge)) {
216  do {
217  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
218  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
219  return (edge);
220  } while (!last_edge(edge++));
221  }
222  }
223  return (NO_EDGE); // not found
224 }
225 
226 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
227  EDGE_REF edge = node;
228  int32_t num = 0;
229 
230  if (forward_edge (edge)) {
231  do {
232  num++;
233  } while (!last_edge(edge++));
234  }
235 
236  return (num);
237 }
238 
239 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
240  if (node == NO_EDGE) return; // nothing to print
241 
242  EDGE_REF edge = node;
243  const char *forward_string = "FORWARD";
244  const char *backward_string = " ";
245 
246  const char *last_string = "LAST";
247  const char *not_last_string = " ";
248 
249  const char *eow_string = "EOW";
250  const char *not_eow_string = " ";
251 
252  const char *direction;
253  const char *is_last;
254  const char *eow;
255 
256  UNICHAR_ID unichar_id;
257 
258  if (edge_occupied(edge)) {
259  do {
260  direction =
261  forward_edge(edge) ? forward_string : backward_string;
262  is_last = last_edge(edge) ? last_string : not_last_string;
263  eow = end_of_word(edge) ? eow_string : not_eow_string;
264 
265  unichar_id = edge_letter(edge);
266  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
267  edge, next_node(edge), unichar_id,
268  direction, is_last, eow);
269 
270  if (edge - node > max_num_edges) return;
271  } while (!last_edge(edge++));
272 
273  if (edge < num_edges_ &&
274  edge_occupied(edge) && backward_edge(edge)) {
275  do {
276  direction =
277  forward_edge(edge) ? forward_string : backward_string;
278  is_last = last_edge(edge) ? last_string : not_last_string;
279  eow = end_of_word(edge) ? eow_string : not_eow_string;
280 
281  unichar_id = edge_letter(edge);
282  tprintf(REFFORMAT " : next = " REFFORMAT
283  ", unichar_id = %d, %s %s %s\n",
284  edge, next_node(edge), unichar_id,
285  direction, is_last, eow);
286 
287  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
288  } while (!last_edge(edge++));
289  }
290  }
291  else {
292  tprintf(REFFORMAT " : no edges in this node\n", node);
293  }
294  tprintf("\n");
295 }
296 
297 void SquishedDawg::print_edge(EDGE_REF edge) const {
298  if (edge == NO_EDGE) {
299  tprintf("NO_EDGE\n");
300  } else {
301  tprintf(REFFORMAT " : next = " REFFORMAT
302  ", unichar_id = '%d', %s %s %s\n", edge,
303  next_node(edge), edge_letter(edge),
304  (forward_edge(edge) ? "FORWARD" : " "),
305  (last_edge(edge) ? "LAST" : " "),
306  (end_of_word(edge) ? "EOW" : ""));
307  }
308 }
309 
310 bool SquishedDawg::read_squished_dawg(TFile *file) {
311  if (debug_level_) tprintf("Reading squished dawg\n");
312 
313  // Read the magic number and check that it matches kDawgMagicNumber, as
314  // auto-endian fixing should make sure it is always correct.
315  int16_t magic;
316  if (!file->DeSerialize(&magic)) return false;
317  if (magic != kDawgMagicNumber) {
318  tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
319  return false;
320  }
321 
322  int32_t unicharset_size;
323  if (!file->DeSerialize(&unicharset_size)) return false;
324  if (!file->DeSerialize(&num_edges_)) return false;
325  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
326  Dawg::init(unicharset_size);
327 
328  edges_ = new EDGE_RECORD[num_edges_];
329  if (!file->DeSerialize(&edges_[0], num_edges_)) return false;
330  if (debug_level_ > 2) {
331  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
332  type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
333  for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
334  }
335  return true;
336 }
337 
338 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
339  int32_t *num_nodes) const {
340  EDGE_REF edge;
341  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
342  int32_t node_counter;
343  int32_t num_edges;
344 
345  for (edge = 0; edge < num_edges_; edge++) // init all slots
346  node_map[edge] = -1;
347 
348  node_counter = num_forward_edges(0);
349 
350  *num_nodes = 0;
351  for (edge = 0; edge < num_edges_; edge++) { // search all slots
352 
353  if (forward_edge(edge)) {
354  (*num_nodes)++; // count nodes links
355  node_map[edge] = (edge ? node_counter : 0);
356  num_edges = num_forward_edges(edge);
357  if (edge != 0) node_counter += num_edges;
358  edge += num_edges;
359  if (edge >= num_edges_) break;
360  if (backward_edge(edge)) while (!last_edge(edge++));
361  edge--;
362  }
363  }
364  return node_map;
365 }
366 
368  EDGE_REF edge;
369  int32_t num_edges;
370  int32_t node_count = 0;
371  EDGE_REF old_index;
372  EDGE_RECORD temp_record;
373 
374  if (debug_level_) tprintf("write_squished_dawg\n");
375 
376  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
377 
378  // Write the magic number to help detecting a change in endianness.
379  int16_t magic = kDawgMagicNumber;
380  if (!file->Serialize(&magic)) return false;
381  if (!file->Serialize(&unicharset_size_)) return false;
382 
383  // Count the number of edges in this Dawg.
384  num_edges = 0;
385  for (edge=0; edge < num_edges_; edge++)
386  if (forward_edge(edge))
387  num_edges++;
388 
389  // Write edge count to file.
390  if (!file->Serialize(&num_edges)) return false;
391 
392  if (debug_level_) {
393  tprintf("%d nodes in DAWG\n", node_count);
394  tprintf("%d edges in DAWG\n", num_edges);
395  }
396 
397  for (edge = 0; edge < num_edges_; edge++) {
398  if (forward_edge(edge)) { // write forward edges
399  do {
400  old_index = next_node_from_edge_rec(edges_[edge]);
401  set_next_node(edge, node_map[old_index]);
402  temp_record = edges_[edge];
403  if (!file->Serialize(&temp_record)) return false;
404  set_next_node(edge, old_index);
405  } while (!last_edge(edge++));
406 
407  if (edge >= num_edges_) break;
408  if (backward_edge(edge)) // skip back links
409  while (!last_edge(edge++));
410 
411  edge--;
412  }
413  }
414  return true;
415 }
416 
417 } // namespace tesseract
tesseract::Dawg::edge_letter
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
strngs.h
tesseract::Dawg::prefix_in_dawg
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:57
tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0
tesseract::Dawg::match_words
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:158
tesseract::SquishedDawg::edge_char_of
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:209
dict.h
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
tesseract::Dawg::flag_start_bit_
int flag_start_bit_
Definition: dawg.h:307
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Dawg::flags_mask_
uint64_t flags_mask_
Definition: dawg.h:304
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dawg::iterate_words_rec
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
Definition: dawg.cpp:140
language_specific.log
log
Definition: language_specific.py:25
tesseract::Dawg::next_node_from_edge_rec
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
WERD_CHOICE::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:328
tesseract::Dawg::end_of_word_from_edge_rec
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
tesseract::NodeChildVector
GenericVector< NodeChild > NodeChildVector
Definition: dawg.h:62
STRING
Definition: strngs.h:45
tesseract::Dawg::unichar_ids_of
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
tesseract::Dawg::given_greater_than_edge_rec
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:245
tesseract::Dawg::unichar_id_from_edge_rec
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
tesseract::Dawg::perm_
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:298
MAX_NODE_EDGES_DISPLAY
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:81
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
tesseract::SquishedDawg::~SquishedDawg
~SquishedDawg() override
Definition: dawg.cpp:207
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
dawg.h
file
Definition: include_gunit.h:22
chomp_string
void chomp_string(char *str)
Definition: helpers.h:75
tesseract::Dawg::init
void init(int unicharset_size)
Definition: dawg.cpp:190
tesseract::Dawg::next_node_start_bit_
int next_node_start_bit_
Definition: dawg.h:308
EDGE_RECORD
uint64_t EDGE_RECORD
Definition: dawg.h:47
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::Dawg::debug_level_
int debug_level_
Definition: dawg.h:310
UNICHARSET
Definition: unicharset.h:145
NUM_FLAG_BITS
#define NUM_FLAG_BITS
Definition: dawg.h:86
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
helpers.h
tesseract
Definition: baseapi.h:65
tesseract::Dawg::iterate_words
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
Definition: dawg.cpp:118
tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
tprintf.h
tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector
Definition: baseapi.h:40
tesseract::Dawg::kDawgMagicNumber
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:116
tesseract::Dawg::lang_
STRING lang_
Definition: dawg.h:295
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
tesseract::Dawg::check_for_words
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:82
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
tesseract::Dawg::letter_mask_
uint64_t letter_mask_
Definition: dawg.h:305
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::SquishedDawg::write_squished_dawg
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:382
tesseract::Dawg::unicharset_size_
int unicharset_size_
Definition: dawg.h:306
tesseract::Dawg::next_node_mask_
uint64_t next_node_mask_
Definition: dawg.h:303
CHARS_PER_LINE
#define CHARS_PER_LINE
Definition: dict.h:38
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::SquishedDawg::print_node
void print_node(NODE_REF node, int max_num_edges) const override
Definition: dawg.cpp:254
tesseract::Dawg::type_
DawgType type_
Definition: dawg.h:296
tesseract::Dawg::~Dawg
virtual ~Dawg()
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50