tesseract  4.0.0-1-g2a2b
dawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.cpp (Formerly dawg.c)
5  * Description: Use a Directed Acyclic Word Graph
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "dawg.h"
30 
31 #include "dict.h"
32 #include "emalloc.h"
33 #include "helpers.h"
34 #include "strngs.h"
35 #include "tesscallback.h"
36 #include "tprintf.h"
37 
38 #include <memory>
39 
40 /*----------------------------------------------------------------------
41  F u n c t i o n s f o r D a w g
42 ----------------------------------------------------------------------*/
43 namespace tesseract {
44 
45 // Destructor.
46 // It is defined here, so the compiler can create a single vtable
47 // instead of weak vtables in every compilation unit.
48 Dawg::~Dawg() = default;
49 
51  bool requires_complete) const {
52  if (word.length() == 0) return !requires_complete;
53  NODE_REF node = 0;
54  int end_index = word.length() - 1;
55  for (int i = 0; i < end_index; i++) {
56  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
57  if (edge == NO_EDGE) {
58  return false;
59  }
60  if ((node = next_node(edge)) == 0) {
61  // This only happens if all words following this edge terminate --
62  // there are no larger words. See Trie::add_word_to_dawg()
63  return false;
64  }
65  }
66  // Now check the last character.
67  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
68  NO_EDGE;
69 }
70 
71 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
72  return prefix_in_dawg(word, true);
73 }
74 
75 int Dawg::check_for_words(const char *filename,
76  const UNICHARSET &unicharset,
77  bool enable_wildcard) const {
78  if (filename == nullptr) return 0;
79 
80  FILE *word_file;
81  char string [CHARS_PER_LINE];
82  int misses = 0;
83  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
84 
85  word_file = fopen(filename, "r");
86  if (word_file == nullptr) {
87  tprintf("Error: Could not open file %s\n", filename);
88  ASSERT_HOST(word_file);
89  }
90 
91  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
92  chomp_string(string); // remove newline
93  WERD_CHOICE word(string, unicharset);
94  if (word.length() > 0 &&
95  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
96  if (!match_words(&word, 0, 0,
97  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
98  tprintf("Missing word: %s\n", string);
99  ++misses;
100  }
101  } else {
102  tprintf("Failed to create a valid word from %s\n", string);
103  }
104  }
105  fclose (word_file);
106  // Make sure the user sees this with fprintf instead of tprintf.
107  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
108  return misses;
109 }
110 
111 void Dawg::iterate_words(const UNICHARSET &unicharset,
113  WERD_CHOICE word(&unicharset);
114  iterate_words_rec(word, 0, cb);
115 }
116 
117 static void CallWithUTF8(TessCallback1<const char *> *cb,
118  const WERD_CHOICE *wc) {
119  STRING s;
120  wc->string_and_lengths(&s, nullptr);
121  cb->Run(s.string());
122 }
123 
124 void Dawg::iterate_words(const UNICHARSET &unicharset,
125  TessCallback1<const char *> *cb) const {
126  std::unique_ptr<TessCallback1<const WERD_CHOICE *>> shim(
127  NewPermanentTessCallback(CallWithUTF8, cb));
128  WERD_CHOICE word(&unicharset);
129  iterate_words_rec(word, 0, shim.get());
130 }
131 
132 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
133  NODE_REF to_explore,
135  NodeChildVector children;
136  this->unichar_ids_of(to_explore, &children, false);
137  for (int i = 0; i < children.size(); i++) {
138  WERD_CHOICE next_word(word_so_far);
139  next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
140  if (this->end_of_word(children[i].edge_ref)) {
141  cb->Run(&next_word);
142  }
143  NODE_REF next = next_node(children[i].edge_ref);
144  if (next != 0) {
145  iterate_words_rec(next_word, next, cb);
146  }
147  }
148 }
149 
150 bool Dawg::match_words(WERD_CHOICE *word, int32_t index,
151  NODE_REF node, UNICHAR_ID wildcard) const {
152  EDGE_REF edge;
153  int32_t word_end;
154 
155  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
156  bool any_matched = false;
157  NodeChildVector vec;
158  this->unichar_ids_of(node, &vec, false);
159  for (int i = 0; i < vec.size(); ++i) {
160  word->set_unichar_id(vec[i].unichar_id, index);
161  if (match_words(word, index, node, wildcard))
162  any_matched = true;
163  }
164  word->set_unichar_id(wildcard, index);
165  return any_matched;
166  } else {
167  word_end = index == word->length() - 1;
168  edge = edge_char_of(node, word->unichar_id(index), word_end);
169  if (edge != NO_EDGE) { // normal edge in DAWG
170  node = next_node(edge);
171  if (word_end) {
172  if (debug_level_ > 1) word->print("match_words() found: ");
173  return true;
174  } else if (node != 0) {
175  return match_words(word, index+1, node, wildcard);
176  }
177  }
178  }
179  return false;
180 }
181 
182 void Dawg::init(int unicharset_size) {
183  ASSERT_HOST(unicharset_size > 0);
184  unicharset_size_ = unicharset_size;
185  // Set bit masks. We will use the value unicharset_size_ as a null char, so
186  // the actual number of unichars is unicharset_size_ + 1.
187  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
189  letter_mask_ = ~(~0ull << flag_start_bit_);
192 }
193 
194 
195 /*----------------------------------------------------------------------
196  F u n c t i o n s f o r S q u i s h e d D a w g
197 ----------------------------------------------------------------------*/
198 
199 SquishedDawg::~SquishedDawg() { delete[] edges_; }
200 
202  UNICHAR_ID unichar_id,
203  bool word_end) const {
204  EDGE_REF edge = node;
205  if (node == 0) { // binary search
206  EDGE_REF start = 0;
207  EDGE_REF end = num_forward_edges_in_node0 - 1;
208  int compare;
209  while (start <= end) {
210  edge = (start + end) >> 1; // (start + end) / 2
211  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
212  unichar_id, edges_[edge]);
213  if (compare == 0) { // given == vec[k]
214  return edge;
215  } else if (compare == 1) { // given > vec[k]
216  start = edge + 1;
217  } else { // given < vec[k]
218  end = edge - 1;
219  }
220  }
221  } else { // linear search
222  if (edge != NO_EDGE && edge_occupied(edge)) {
223  do {
224  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
225  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
226  return (edge);
227  } while (!last_edge(edge++));
228  }
229  }
230  return (NO_EDGE); // not found
231 }
232 
233 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
234  EDGE_REF edge = node;
235  int32_t num = 0;
236 
237  if (forward_edge (edge)) {
238  do {
239  num++;
240  } while (!last_edge(edge++));
241  }
242 
243  return (num);
244 }
245 
246 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
247  if (node == NO_EDGE) return; // nothing to print
248 
249  EDGE_REF edge = node;
250  const char *forward_string = "FORWARD";
251  const char *backward_string = " ";
252 
253  const char *last_string = "LAST";
254  const char *not_last_string = " ";
255 
256  const char *eow_string = "EOW";
257  const char *not_eow_string = " ";
258 
259  const char *direction;
260  const char *is_last;
261  const char *eow;
262 
263  UNICHAR_ID unichar_id;
264 
265  if (edge_occupied(edge)) {
266  do {
267  direction =
268  forward_edge(edge) ? forward_string : backward_string;
269  is_last = last_edge(edge) ? last_string : not_last_string;
270  eow = end_of_word(edge) ? eow_string : not_eow_string;
271 
272  unichar_id = edge_letter(edge);
273  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
274  edge, next_node(edge), unichar_id,
275  direction, is_last, eow);
276 
277  if (edge - node > max_num_edges) return;
278  } while (!last_edge(edge++));
279 
280  if (edge < num_edges_ &&
281  edge_occupied(edge) && backward_edge(edge)) {
282  do {
283  direction =
284  forward_edge(edge) ? forward_string : backward_string;
285  is_last = last_edge(edge) ? last_string : not_last_string;
286  eow = end_of_word(edge) ? eow_string : not_eow_string;
287 
288  unichar_id = edge_letter(edge);
289  tprintf(REFFORMAT " : next = " REFFORMAT
290  ", unichar_id = %d, %s %s %s\n",
291  edge, next_node(edge), unichar_id,
292  direction, is_last, eow);
293 
294  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
295  } while (!last_edge(edge++));
296  }
297  }
298  else {
299  tprintf(REFFORMAT " : no edges in this node\n", node);
300  }
301  tprintf("\n");
302 }
303 
304 void SquishedDawg::print_edge(EDGE_REF edge) const {
305  if (edge == NO_EDGE) {
306  tprintf("NO_EDGE\n");
307  } else {
308  tprintf(REFFORMAT " : next = " REFFORMAT
309  ", unichar_id = '%d', %s %s %s\n", edge,
310  next_node(edge), edge_letter(edge),
311  (forward_edge(edge) ? "FORWARD" : " "),
312  (last_edge(edge) ? "LAST" : " "),
313  (end_of_word(edge) ? "EOW" : ""));
314  }
315 }
316 
317 bool SquishedDawg::read_squished_dawg(TFile *file) {
318  if (debug_level_) tprintf("Reading squished dawg\n");
319 
320  // Read the magic number and check that it matches kDawgMagicNumber, as
321  // auto-endian fixing should make sure it is always correct.
322  int16_t magic;
323  if (!file->DeSerialize(&magic)) return false;
324  if (magic != kDawgMagicNumber) {
325  tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
326  return false;
327  }
328 
329  int32_t unicharset_size;
330  if (!file->DeSerialize(&unicharset_size)) return false;
331  if (!file->DeSerialize(&num_edges_)) return false;
332  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
333  Dawg::init(unicharset_size);
334 
335  edges_ = new EDGE_RECORD[num_edges_];
336  if (!file->DeSerialize(&edges_[0], num_edges_)) return false;
337  if (debug_level_ > 2) {
338  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
339  type_, lang_.string(), perm_, unicharset_size_, num_edges_);
340  for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
341  }
342  return true;
343 }
344 
345 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
346  int32_t *num_nodes) const {
347  EDGE_REF edge;
348  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
349  int32_t node_counter;
350  int32_t num_edges;
351 
352  for (edge = 0; edge < num_edges_; edge++) // init all slots
353  node_map[edge] = -1;
354 
355  node_counter = num_forward_edges(0);
356 
357  *num_nodes = 0;
358  for (edge = 0; edge < num_edges_; edge++) { // search all slots
359 
360  if (forward_edge(edge)) {
361  (*num_nodes)++; // count nodes links
362  node_map[edge] = (edge ? node_counter : 0);
363  num_edges = num_forward_edges(edge);
364  if (edge != 0) node_counter += num_edges;
365  edge += num_edges;
366  if (edge >= num_edges_) break;
367  if (backward_edge(edge)) while (!last_edge(edge++));
368  edge--;
369  }
370  }
371  return node_map;
372 }
373 
375  EDGE_REF edge;
376  int32_t num_edges;
377  int32_t node_count = 0;
378  EDGE_REF old_index;
379  EDGE_RECORD temp_record;
380 
381  if (debug_level_) tprintf("write_squished_dawg\n");
382 
383  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
384 
385  // Write the magic number to help detecting a change in endianness.
386  int16_t magic = kDawgMagicNumber;
387  if (!file->Serialize(&magic)) return false;
388  if (!file->Serialize(&unicharset_size_)) return false;
389 
390  // Count the number of edges in this Dawg.
391  num_edges = 0;
392  for (edge=0; edge < num_edges_; edge++)
393  if (forward_edge(edge))
394  num_edges++;
395 
396  // Write edge count to file.
397  if (!file->Serialize(&num_edges)) return false;
398 
399  if (debug_level_) {
400  tprintf("%d nodes in DAWG\n", node_count);
401  tprintf("%d edges in DAWG\n", num_edges);
402  }
403 
404  for (edge = 0; edge < num_edges_; edge++) {
405  if (forward_edge(edge)) { // write forward edges
406  do {
407  old_index = next_node_from_edge_rec(edges_[edge]);
408  set_next_node(edge, node_map[old_index]);
409  temp_record = edges_[edge];
410  if (!file->Serialize(&temp_record)) return false;
411  set_next_node(edge, old_index);
412  } while (!last_edge(edge++));
413 
414  if (edge >= num_edges_) break;
415  if (backward_edge(edge)) // skip back links
416  while (!last_edge(edge++));
417 
418  edge--;
419  }
420  }
421  return true;
422 }
423 
424 } // namespace tesseract
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int next_node_start_bit_
Definition: dawg.h:311
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
void print_node(NODE_REF node, int max_num_edges) const
Definition: dawg.cpp:246
uint64_t letter_mask_
Definition: dawg.h:314
#define CHARS_PER_LINE
Definition: dict.h:35
const char * string() const
Definition: strngs.cpp:196
void print() const
Definition: ratngs.h:580
STRING lang_
Definition: dawg.h:302
int unicharset_size_
Definition: dawg.h:309
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:50
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
Definition: dawg.cpp:132
virtual ~Dawg()
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:478
#define REFFORMAT
Definition: dawg.h:93
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:304
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:122
virtual void Run(A1)=0
int64_t EDGE_REF
Definition: dawg.h:55
int64_t NODE_REF
Definition: dawg.h:56
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:374
void chomp_string(char *str)
Definition: helpers.h:83
uint64_t next_node_mask_
Definition: dawg.h:312
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:201
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
uint64_t flags_mask_
Definition: dawg.h:313
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:326
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:150
int length() const
Definition: ratngs.h:303
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
Definition: dawg.cpp:111
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
int flag_start_bit_
Definition: dawg.h:310
Definition: strngs.h:45
uint64_t EDGE_RECORD
Definition: dawg.h:53
DawgType type_
Definition: dawg.h:301
void init(int unicharset_size)
Definition: dawg.cpp:182
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
#define NUM_FLAG_BITS
Definition: dawg.h:92
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
virtual ~SquishedDawg()
Definition: dawg.cpp:199
int debug_level_
Definition: dawg.h:316
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:75
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:467
virtual bool end_of_word(EDGE_REF edge_ref) const =0
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:473
#define ASSERT_HOST(x)
Definition: errcode.h:84