tesseract  5.0.0-alpha-619-ge9db
dawg.h
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: dawg.h
4  * Description: Definition of a class that represents Directed Acyclic Word
5  * Graph (DAWG), functions to build and manipulate the DAWG.
6  * Author: Mark Seaman, SW Productivity
7  *
8  * (c) Copyright 1987, Hewlett-Packard Company.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  *****************************************************************************/
20 
21 #ifndef DICT_DAWG_H_
22 #define DICT_DAWG_H_
23 
24 /*----------------------------------------------------------------------
25  I n c l u d e s
26 ----------------------------------------------------------------------*/
27 
28 #include <cinttypes> // for PRId64
29 #include <functional> // for std::function
30 #include <memory>
31 #include "elst.h"
32 #include "params.h"
33 #include "ratngs.h"
34 
// Sentinel EDGE_REF / NODE_REF value meaning "no edge": every bit of the
// 64-bit value is set, which reads as -1 through the signed int64_t type.
// Defined unconditionally and without compiler-specific literal suffixes, so
// toolchains that are neither GNU-compatible nor Windows also get a
// definition. Parenthesized so it is safe inside any larger expression.
#define NO_EDGE (static_cast<int64_t>(-1))
42 
43 /*----------------------------------------------------------------------
44  T y p e s
45 ----------------------------------------------------------------------*/
46 class UNICHARSET;
47 
48 using EDGE_RECORD = uint64_t;
50 using EDGE_REF = int64_t;
51 using NODE_REF = int64_t;
52 using NODE_MAP = EDGE_REF *;
53 
54 namespace tesseract {
55 
56 struct NodeChild {
60  NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
61 };
62 
66 
67 enum DawgType {
72 
73  DAWG_TYPE_COUNT // number of enum entries
74 };
75 
76 /*----------------------------------------------------------------------
77  C o n s t a n t s
78 ----------------------------------------------------------------------*/
79 
80 #define FORWARD_EDGE (int32_t) 0
81 #define BACKWARD_EDGE (int32_t) 1
82 #define MAX_NODE_EDGES_DISPLAY (int64_t) 100
83 #define MARKER_FLAG (int64_t) 1
84 #define DIRECTION_FLAG (int64_t) 2
85 #define WERD_END_FLAG (int64_t) 4
86 #define LETTER_START_BIT 0
87 #define NUM_FLAG_BITS 3
88 #define REFFORMAT "%" PRId64
89 
90 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
91  { false, true, true, false }, // for DAWG_TYPE_PUNCTUATION
92  { true, false, false, false }, // for DAWG_TYPE_WORD
93  { true, false, false, false }, // for DAWG_TYPE_NUMBER
94  { false, false, false, false }, // for DAWG_TYPE_PATTERN
95 };
96 
97 static const char kWildcard[] = "*";
98 
99 
100 /*----------------------------------------------------------------------
101  C l a s s e s a n d S t r u c t s
102 ----------------------------------------------------------------------*/
103 //
113 //
114 class Dawg {
115  public:
117  static const int16_t kDawgMagicNumber = 42;
121  static const UNICHAR_ID kPatternUnicharID = 0;
122 
123  inline DawgType type() const { return type_; }
124  inline const STRING &lang() const { return lang_; }
125  inline PermuterType permuter() const { return perm_; }
126 
127  virtual ~Dawg();
128 
130  bool word_in_dawg(const WERD_CHOICE &word) const;
131 
132  // Returns true if the given word prefix is not contraindicated by the dawg.
133  // If requires_complete is true, then the exact complete word must be present.
134  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
135 
138  int check_for_words(const char *filename,
139  const UNICHARSET &unicharset,
140  bool enable_wildcard) const;
141 
142  // For each word in the Dawg, call the given (permanent) callback with the
143  // word as a WERD_CHOICE.
144  void iterate_words(const UNICHARSET& unicharset,
145  std::function<void(const WERD_CHOICE*)> cb) const;
146 
147  // For each word in the Dawg, call the given (permanent) callback with the
148  // text (UTF-8) version of the word.
149  void iterate_words(const UNICHARSET& unicharset,
150  std::function<void(const char*)> cb) const;
151 
152  // Pure virtual function that should be implemented by the derived classes.
153 
155  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
156  bool word_end) const = 0;
157 
160  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
161  bool word_end) const = 0;
162 
165  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
166 
169  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
170 
172  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
173 
176  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
177 
180  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
181  const UNICHARSET &unicharset,
182  GenericVector<UNICHAR_ID> *vec) const {
183  (void)unichar_id;
184  (void)unicharset;
185  (void)vec;
186  }
187 
191  virtual EDGE_REF pattern_loop_edge(
192  EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
193  (void)edge_ref;
194  (void)unichar_id;
195  (void)word_end;
196  return false;
197  }
198 
199  protected:
200  Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
201  : lang_(lang),
202  type_(type),
203  perm_(perm),
204  unicharset_size_(0),
205  debug_level_(debug_level) {}
206 
208  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
209  return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
210  }
212  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
213  return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
214  }
216  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
217  return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
219  }
221  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
222  return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
223  }
226  const EDGE_RECORD &edge_rec) const {
227  return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
228  }
230  inline void set_next_node_in_edge_rec(
231  EDGE_RECORD *edge_rec, EDGE_REF value) {
232  *edge_rec &= (~next_node_mask_);
233  *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
234  }
236  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
237  *edge_rec |= (MARKER_FLAG << flag_start_bit_);
238  }
247  bool word_end,
248  UNICHAR_ID unichar_id,
249  const EDGE_RECORD &edge_rec) const {
250  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
251  NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
252  bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
253  if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
254  curr_word_end, curr_unichar_id)) return 0;
255  if (unichar_id > curr_unichar_id) return 1;
256  if (unichar_id == curr_unichar_id) {
257  if (next_node > curr_next_node) return 1;
258  if (next_node == curr_next_node) {
259  if (word_end > curr_word_end) return 1;
260  }
261  }
262  return -1;
263  }
267  inline bool edge_rec_match(NODE_REF next_node,
268  bool word_end,
269  UNICHAR_ID unichar_id,
270  NODE_REF other_next_node,
271  bool other_word_end,
272  UNICHAR_ID other_unichar_id) const {
273  return ((unichar_id == other_unichar_id) &&
274  (next_node == NO_EDGE || next_node == other_next_node) &&
275  (!word_end || (word_end == other_word_end)));
276  }
277 
280  void init(int unicharset_size);
281 
287  bool match_words(WERD_CHOICE *word, int32_t index,
288  NODE_REF node, UNICHAR_ID wildcard) const;
289 
290  // Recursively iterate over all words in a dawg (see public iterate_words).
291  void iterate_words_rec(const WERD_CHOICE& word_so_far,
292  NODE_REF to_explore,
293  std::function<void(const WERD_CHOICE*)> cb) const;
294 
295  // Member Variables.
297  DawgType type_;
300  // Variables to construct various edge masks. Formerly:
301  // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64
302  // #define FLAGS_MASK (int64_t) 0x0000000700000000i64
303  // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64
304  uint64_t next_node_mask_ = 0;
305  uint64_t flags_mask_ = 0;
306  uint64_t letter_mask_ = 0;
309  int next_node_start_bit_ = 0;
310  // Level of debug statements to print to stdout.
311  int debug_level_;
312 };
313 
314 //
315 // DawgPosition keeps track of where we are in the primary dawg we're searching
316 // as well as where we may be in the "punctuation dawg" which may provide
317 // surrounding context.
318 //
319 // Example:
320 // punctuation dawg -- space is the "pattern character"
321 // " " // no punctuation
322 // "' '" // leading and trailing apostrophes
323 // " '" // trailing apostrophe
324 // word dawg:
325 // "cat"
326 // "cab"
327 // "cat's"
328 //
329 // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
330 //
331 // DawgPosition(-1, NO_EDGE, p, pe, false)
332 // We're in the punctuation dawg, no other dawg has been started.
333 // (1) If there's a pattern edge as a punc dawg child of us,
334 // for each punc-following dawg starting with ch, produce:
335 // Result: DawgPosition(k, w, p', false)
336 // (2) If there's a valid continuation in the punc dawg, produce:
337 // Result: DawgPosition(-k, NO_EDGE, p', false)
338 //
339 // DawgPosition(k, w, -1, NO_EDGE, false)
340 // We're in dawg k. Going back to punctuation dawg is not an option.
341 // Follow ch in dawg k.
342 //
343 // DawgPosition(k, w, p, pe, false)
344 // We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
345 // If ending, check that the punctuation dawg is also ok to end here.
346 //
347 // DawgPosition(k, w, p, pe, true)
348 // We're back in the punctuation dawg. Continuing there is the only option.
349 struct DawgPosition {
350  DawgPosition() = default;
351  DawgPosition(int dawg_idx, EDGE_REF dawgref,
352  int punc_idx, EDGE_REF puncref,
353  bool backtopunc)
354  : dawg_ref(dawgref), punc_ref(puncref),
355  dawg_index(dawg_idx), punc_index(punc_idx),
356  back_to_punc(backtopunc) {
357  }
358  bool operator==(const DawgPosition &other) {
359  return dawg_index == other.dawg_index &&
360  dawg_ref == other.dawg_ref &&
361  punc_index == other.punc_index &&
362  punc_ref == other.punc_ref &&
363  back_to_punc == other.back_to_punc;
364  }
365 
366  EDGE_REF dawg_ref = NO_EDGE;
367  EDGE_REF punc_ref = NO_EDGE;
368  int8_t dawg_index = -1;
369  int8_t punc_index = -1;
370  // Have we returned to the punc dawg at the end of the word?
371  bool back_to_punc = false;
372 };
373 
374 class DawgPositionVector : public GenericVector<DawgPosition> {
375  public:
378  void clear() { size_used_ = 0; }
382  inline bool add_unique(const DawgPosition &new_pos,
383  bool debug,
384  const char *debug_msg) {
385  for (int i = 0; i < size_used_; ++i) {
386  if (data_[i] == new_pos) return false;
387  }
388  push_back(new_pos);
389  if (debug) {
390  tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
391  debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
392  new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
393  }
394  return true;
395  }
396 };
397 
398 //
405 //
406 class SquishedDawg : public Dawg {
407  public:
409  int debug_level)
410  : Dawg(type, lang, perm, debug_level) {}
411  SquishedDawg(const char *filename, DawgType type, const STRING &lang,
412  PermuterType perm, int debug_level)
413  : Dawg(type, lang, perm, debug_level) {
414  TFile file;
415  ASSERT_HOST(file.Open(filename, nullptr));
416  ASSERT_HOST(read_squished_dawg(&file));
417  num_forward_edges_in_node0 = num_forward_edges(0);
418  }
419  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
420  const STRING &lang, PermuterType perm, int unicharset_size,
421  int debug_level)
422  : Dawg(type, lang, perm, debug_level),
423  edges_(edges),
424  num_edges_(num_edges) {
425  init(unicharset_size);
426  num_forward_edges_in_node0 = num_forward_edges(0);
427  if (debug_level > 3) print_all("SquishedDawg:");
428  }
429  ~SquishedDawg() override;
430 
431  // Loads using the given TFile. Returns false on failure.
432  bool Load(TFile *fp) {
433  if (!read_squished_dawg(fp)) return false;
434  num_forward_edges_in_node0 = num_forward_edges(0);
435  return true;
436  }
437 
438  int NumEdges() { return num_edges_; }
439 
441  EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
442  bool word_end) const override;
443 
446  void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
447  bool word_end) const override {
448  EDGE_REF edge = node;
449  if (!edge_occupied(edge) || edge == NO_EDGE) return;
450  assert(forward_edge(edge)); // we don't expect any backward edges to
451  do { // be present when this function is called
452  if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
453  vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
454  }
455  } while (!last_edge(edge++));
456  }
457 
460  NODE_REF next_node(EDGE_REF edge) const override {
461  return next_node_from_edge_rec((edges_[edge]));
462  }
463 
466  bool end_of_word(EDGE_REF edge_ref) const override {
467  return end_of_word_from_edge_rec((edges_[edge_ref]));
468  }
469 
471  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
472  return unichar_id_from_edge_rec((edges_[edge_ref]));
473  }
474 
477  void print_node(NODE_REF node, int max_num_edges) const override;
478 
480  bool write_squished_dawg(TFile *file);
481 
484  bool write_squished_dawg(const char *filename) {
485  TFile file;
486  file.OpenWrite(nullptr);
487  if (!this->write_squished_dawg(&file)) {
488  tprintf("Error serializing %s\n", filename);
489  return false;
490  }
491  if (!file.CloseWrite(filename, nullptr)) {
492  tprintf("Error writing file %s\n", filename);
493  return false;
494  }
495  return true;
496  }
497 
498  private:
500  inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
501  set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
502  }
504  inline void set_empty_edge(EDGE_REF edge_ref) {
505  (edges_[edge_ref] = next_node_mask_);
506  }
508  inline void clear_all_edges() {
509  for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
510  }
512  inline void clear_marker_flag(EDGE_REF edge_ref) {
513  (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
514  }
516  inline bool forward_edge(EDGE_REF edge_ref) const {
517  return (edge_occupied(edge_ref) &&
518  (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
519  }
521  inline bool backward_edge(EDGE_REF edge_ref) const {
522  return (edge_occupied(edge_ref) &&
523  (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
524  }
526  inline bool edge_occupied(EDGE_REF edge_ref) const {
527  return (edges_[edge_ref] != next_node_mask_);
528  }
530  inline bool last_edge(EDGE_REF edge_ref) const {
531  return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
532  }
533 
535  int32_t num_forward_edges(NODE_REF node) const;
536 
538  bool read_squished_dawg(TFile *file);
539 
541  void print_edge(EDGE_REF edge) const;
542 
544  void print_all(const char* msg) {
545  tprintf("\n__________________________\n%s\n", msg);
546  for (int i = 0; i < num_edges_; ++i) print_edge(i);
547  tprintf("__________________________\n");
548  }
550  std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;
551 
552  // Member variables.
553  EDGE_ARRAY edges_ = nullptr;
554  int32_t num_edges_ = 0;
555  int num_forward_edges_in_node0 = 0;
556 };
557 
558 } // namespace tesseract
559 
560 #endif // DICT_DAWG_H_
tesseract::Dawg::edge_letter
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
tesseract::NodeChild
Definition: dawg.h:55
tesseract::SquishedDawg::Load
bool Load(TFile *fp)
Definition: dawg.h:431
elst.h
tesseract::DawgPositionVector::clear
void clear()
Definition: dawg.h:377
tesseract::Dawg::prefix_in_dawg
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:57
tesseract::Dawg::unichar_id_to_patterns
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:179
tesseract::DAWG_TYPE_PUNCTUATION
Definition: dawg.h:67
tesseract::SquishedDawg
Definition: dawg.h:405
tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0
tesseract::SquishedDawg::next_node
NODE_REF next_node(EDGE_REF edge) const override
Definition: dawg.h:459
tesseract::Dawg::match_words
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:158
tesseract::NodeChild::edge_ref
EDGE_REF edge_ref
Definition: dawg.h:57
tesseract::SquishedDawg::edge_char_of
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:209
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::Dawg::flag_start_bit_
int flag_start_bit_
Definition: dawg.h:307
tesseract::DawgPosition
Definition: dawg.h:348
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Dawg::flags_mask_
uint64_t flags_mask_
Definition: dawg.h:304
tesseract::File::Open
static FILE * Open(const std::string &filename, const std::string &mode)
Definition: fileio.cpp:54
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dawg::iterate_words_rec
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
Definition: dawg.cpp:140
GenericVector< DawgPosition >::data_
DawgPosition * data_
Definition: genericvector.h:332
tesseract::Dawg::next_node_from_edge_rec
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:123
PermuterType
PermuterType
Definition: ratngs.h:230
params.h
tesseract::DAWG_TYPE_NUMBER
Definition: dawg.h:69
tesseract::SquishedDawg::NumEdges
int NumEdges()
Definition: dawg.h:437
tesseract::Dawg::end_of_word_from_edge_rec
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
tesseract::DawgPosition::operator==
bool operator==(const DawgPosition &other)
Definition: dawg.h:357
tesseract::Dawg::set_next_node_in_edge_rec
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:229
tesseract::NodeChildVector
GenericVector< NodeChild > NodeChildVector
Definition: dawg.h:62
STRING
Definition: strngs.h:45
EDGE_ARRAY
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:48
tesseract::DawgPosition::punc_ref
EDGE_REF punc_ref
Definition: dawg.h:366
tesseract::Dawg::unichar_ids_of
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
tesseract::Dawg::given_greater_than_edge_rec
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:245
tesseract::Dawg::print_node
virtual void print_node(NODE_REF node, int max_num_edges) const =0
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
FORWARD_EDGE
#define FORWARD_EDGE
Definition: dawg.h:79
tesseract::SquishedDawg::SquishedDawg
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:407
tesseract::Dawg::unichar_id_from_edge_rec
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
ratngs.h
tesseract::Dawg::perm_
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:298
tesseract::SquishedDawg::~SquishedDawg
~SquishedDawg() override
Definition: dawg.cpp:207
GenericVector< DawgPosition >::push_back
int push_back(DawgPosition object)
Definition: genericvector.h:799
tesseract::Dawg::edge_rec_match
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:266
tesseract::NodeChild::unichar_id
UNICHAR_ID unichar_id
Definition: dawg.h:56
file
Definition: include_gunit.h:22
tesseract::Dawg::init
void init(int unicharset_size)
Definition: dawg.cpp:190
tesseract::SquishedDawg::end_of_word
bool end_of_word(EDGE_REF edge_ref) const override
Definition: dawg.h:465
tesseract::Dawg::next_node_start_bit_
int next_node_start_bit_
Definition: dawg.h:308
EDGE_RECORD
uint64_t EDGE_RECORD
Definition: dawg.h:47
tesseract::Dawg::debug_level_
int debug_level_
Definition: dawg.h:310
tesseract::Dawg::marker_flag_from_edge_rec
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:211
tesseract::DawgPosition::DawgPosition
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
Definition: dawg.h:350
tesseract::TFile
Definition: serialis.h:75
DIRECTION_FLAG
#define DIRECTION_FLAG
Definition: dawg.h:83
tesseract::Dawg::Dawg
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:199
UNICHARSET
Definition: unicharset.h:145
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
tesseract::NodeChild::NodeChild
NodeChild()
Definition: dawg.h:59
tesseract::Dawg::pattern_loop_edge
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:190
tesseract
Definition: baseapi.h:65
tesseract::Dawg::iterate_words
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
Definition: dawg.cpp:118
tesseract::DawgPosition::dawg_index
int8_t dawg_index
Definition: dawg.h:367
tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
tesseract::DawgPosition::punc_index
int8_t punc_index
Definition: dawg.h:368
tesseract::DawgType
DawgType
Definition: dawg.h:66
GenericVector< DawgPosition >::size_used_
int32_t size_used_
Definition: genericvector.h:330
LETTER_START_BIT
#define LETTER_START_BIT
Definition: dawg.h:85
tesseract::DawgPositionVector::add_unique
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:381
tesseract::DawgPositionVector
Definition: dawg.h:373
tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector
Definition: baseapi.h:40
tesseract::DawgPosition::back_to_punc
bool back_to_punc
Definition: dawg.h:370
tesseract::Dawg::kDawgMagicNumber
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:116
tesseract::SquishedDawg::unichar_ids_of
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Definition: dawg.h:445
tesseract::Dawg::set_marker_flag_in_edge_rec
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:235
tesseract::Dawg::lang_
STRING lang_
Definition: dawg.h:295
tesseract::Dawg
Definition: dawg.h:113
tesseract::DawgPosition::DawgPosition
DawgPosition()=default
MARKER_FLAG
#define MARKER_FLAG
Definition: dawg.h:82
tesseract::Dawg::check_for_words
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:82
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
tesseract::Dawg::letter_mask_
uint64_t letter_mask_
Definition: dawg.h:305
tesseract::DAWG_TYPE_COUNT
Definition: dawg.h:72
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::DAWG_TYPE_PATTERN
Definition: dawg.h:70
WERD_END_FLAG
#define WERD_END_FLAG
Definition: dawg.h:84
tesseract::DawgPosition::dawg_ref
EDGE_REF dawg_ref
Definition: dawg.h:365
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::Dawg::direction_from_edge_rec
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:215
tesseract::SquishedDawg::write_squished_dawg
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:382
tesseract::SquishedDawg::edge_letter
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:470
tesseract::Dawg::unicharset_size_
int unicharset_size_
Definition: dawg.h:306
tesseract::Dawg::next_node_mask_
uint64_t next_node_mask_
Definition: dawg.h:303
BACKWARD_EDGE
#define BACKWARD_EDGE
Definition: dawg.h:80
tesseract::SquishedDawg::print_node
void print_node(NODE_REF node, int max_num_edges) const override
Definition: dawg.cpp:254
NODE_MAP
EDGE_REF * NODE_MAP
Definition: dawg.h:51
tesseract::Dawg::type_
DawgType type_
Definition: dawg.h:296
tesseract::Dawg::permuter
PermuterType permuter() const
Definition: dawg.h:124
tesseract::Dawg::~Dawg
virtual ~Dawg()
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50