tesseract  5.0.0-alpha-619-ge9db
tesseract::SquishedDawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::SquishedDawg:
tesseract::Dawg

Public Member Functions

 SquishedDawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
 ~SquishedDawg () override
 
bool Load (TFile *fp)
 
int NumEdges ()
 
EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
 Returns the edge that corresponds to the letter out of this node. More...
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const override
 
NODE_REF next_node (EDGE_REF edge) const override
 
bool end_of_word (EDGE_REF edge_ref) const override
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const override
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
 
void print_node (NODE_REF node, int max_num_edges) const override
 
bool write_squished_dawg (TFile *file)
 Writes the squished/reduced Dawg to a file. More...
 
bool write_squished_dawg (const char *filename)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRING & lang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const char *)> cb) const
 
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::Dawg
static const int16_t kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
 
- Protected Attributes inherited from tesseract::Dawg
STRING lang_
 
DawgType type_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
uint64_t next_node_mask_ = 0
 
uint64_t flags_mask_ = 0
 
uint64_t letter_mask_ = 0
 
int unicharset_size_
 
int flag_start_bit_ = 0
 
int next_node_start_bit_ = 0
 
int debug_level_
 

Detailed Description

Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words cannot be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).

Definition at line 405 of file dawg.h.

Constructor & Destructor Documentation

◆ SquishedDawg() [1/3]

tesseract::SquishedDawg::SquishedDawg ( DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 407 of file dawg.h.

407  :
409  int debug_level)

◆ SquishedDawg() [2/3]

tesseract::SquishedDawg::SquishedDawg ( const char *  filename,
DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 410 of file dawg.h.

410  : Dawg(type, lang, perm, debug_level) {}
411  SquishedDawg(const char *filename, DawgType type, const STRING &lang,
412  PermuterType perm, int debug_level)
413  : Dawg(type, lang, perm, debug_level) {
414  TFile file;
415  ASSERT_HOST(file.Open(filename, nullptr));
416  ASSERT_HOST(read_squished_dawg(&file));
417  num_forward_edges_in_node0 = num_forward_edges(0);

◆ SquishedDawg() [3/3]

tesseract::SquishedDawg::SquishedDawg ( EDGE_ARRAY  edges,
int  num_edges,
DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 418 of file dawg.h.

422  : Dawg(type, lang, perm, debug_level),
423  edges_(edges),
424  num_edges_(num_edges) {
425  init(unicharset_size);
426  num_forward_edges_in_node0 = num_forward_edges(0);
427  if (debug_level > 3) print_all("SquishedDawg:");

◆ ~SquishedDawg()

tesseract::SquishedDawg::~SquishedDawg ( )
override

Definition at line 207 of file dawg.cpp.

208 { // given > vec[k]

Member Function Documentation

◆ edge_char_of()

EDGE_REF tesseract::SquishedDawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
override virtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 209 of file dawg.cpp.

210  { // given < vec[k]
211  end = edge - 1;
212  }
213  }
214  } else { // linear search
215  if (edge != NO_EDGE && edge_occupied(edge)) {
216  do {
217  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
218  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
219  return (edge);
220  } while (!last_edge(edge++));
221  }
222  }
223  return (NO_EDGE); // not found
224 }
225 
226 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
227  EDGE_REF edge = node;
228  int32_t num = 0;
229 
230  if (forward_edge (edge)) {
231  do {
232  num++;
233  } while (!last_edge(edge++));
234  }
235 
236  return (num);
237 }
238 
239 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {

◆ edge_letter()

UNICHAR_ID tesseract::SquishedDawg::edge_letter ( EDGE_REF  edge_ref) const
inline override virtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 470 of file dawg.h.

471  {
472  return unichar_id_from_edge_rec((edges_[edge_ref]));

◆ end_of_word()

bool tesseract::SquishedDawg::end_of_word ( EDGE_REF  edge_ref) const
inline override virtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 465 of file dawg.h.

466  {
467  return end_of_word_from_edge_rec((edges_[edge_ref]));

◆ Load()

bool tesseract::SquishedDawg::Load ( TFile *  fp)
inline

Definition at line 431 of file dawg.h.

432  {
433  if (!read_squished_dawg(fp)) return false;
434  num_forward_edges_in_node0 = num_forward_edges(0);
435  return true;

◆ next_node()

NODE_REF tesseract::SquishedDawg::next_node ( EDGE_REF  edge) const
inline override virtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 459 of file dawg.h.

460  {
461  return next_node_from_edge_rec((edges_[edge]));

◆ NumEdges()

int tesseract::SquishedDawg::NumEdges ( )
inline

Definition at line 437 of file dawg.h.

438 { return num_edges_; }

◆ print_node()

void tesseract::SquishedDawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const
override virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 254 of file dawg.cpp.

258  {
259  do {
260  direction =
261  forward_edge(edge) ? forward_string : backward_string;
262  is_last = last_edge(edge) ? last_string : not_last_string;
263  eow = end_of_word(edge) ? eow_string : not_eow_string;
264 
265  unichar_id = edge_letter(edge);
266  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
267  edge, next_node(edge), unichar_id,
268  direction, is_last, eow);
269 
270  if (edge - node > max_num_edges) return;
271  } while (!last_edge(edge++));
272 
273  if (edge < num_edges_ &&
274  edge_occupied(edge) && backward_edge(edge)) {
275  do {
276  direction =
277  forward_edge(edge) ? forward_string : backward_string;
278  is_last = last_edge(edge) ? last_string : not_last_string;
279  eow = end_of_word(edge) ? eow_string : not_eow_string;
280 
281  unichar_id = edge_letter(edge);
282  tprintf(REFFORMAT " : next = " REFFORMAT
283  ", unichar_id = %d, %s %s %s\n",
284  edge, next_node(edge), unichar_id,
285  direction, is_last, eow);
286 
287  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
288  } while (!last_edge(edge++));
289  }
290  }
291  else {
292  tprintf(REFFORMAT " : no edges in this node\n", node);
293  }
294  tprintf("\n");
295 }
296 
297 void SquishedDawg::print_edge(EDGE_REF edge) const {
298  if (edge == NO_EDGE) {
299  tprintf("NO_EDGE\n");
300  } else {
301  tprintf(REFFORMAT " : next = " REFFORMAT
302  ", unichar_id = '%d', %s %s %s\n", edge,
303  next_node(edge), edge_letter(edge),
304  (forward_edge(edge) ? "FORWARD" : " "),
305  (last_edge(edge) ? "LAST" : " "),
306  (end_of_word(edge) ? "EOW" : ""));
307  }
308 }
309 
310 bool SquishedDawg::read_squished_dawg(TFile *file) {

◆ unichar_ids_of()

void tesseract::SquishedDawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector *  vec,
bool  word_end 
) const
inline override virtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 445 of file dawg.h.

447  {
448  EDGE_REF edge = node;
449  if (!edge_occupied(edge) || edge == NO_EDGE) return;
450  assert(forward_edge(edge)); // we don't expect any backward edges to
451  do { // be present when this function is called
452  if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
453  vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
454  }
455  } while (!last_edge(edge++));

◆ write_squished_dawg() [1/2]

bool tesseract::SquishedDawg::write_squished_dawg ( const char *  filename)
inline

Opens the file with the given filename and writes the squished/reduced Dawg to the file.

Definition at line 483 of file dawg.h.

484  {
485  TFile file;
486  file.OpenWrite(nullptr);
487  if (!this->write_squished_dawg(&file)) {
488  tprintf("Error serializing %s\n", filename);
489  return false;
490  }
491  if (!file.CloseWrite(filename, nullptr)) {
492  tprintf("Error writing file %s\n", filename);
493  return false;
494  }
495  return true;

◆ write_squished_dawg() [2/2]

bool tesseract::SquishedDawg::write_squished_dawg ( TFile *  file)

Writes the squished/reduced Dawg to a file.

Definition at line 382 of file dawg.cpp.

392  {
393  tprintf("%d nodes in DAWG\n", node_count);
394  tprintf("%d edges in DAWG\n", num_edges);
395  }
396 
397  for (edge = 0; edge < num_edges_; edge++) {
398  if (forward_edge(edge)) { // write forward edges
399  do {
400  old_index = next_node_from_edge_rec(edges_[edge]);
401  set_next_node(edge, node_map[old_index]);
402  temp_record = edges_[edge];
403  if (!file->Serialize(&temp_record)) return false;
404  set_next_node(edge, old_index);
405  } while (!last_edge(edge++));
406 
407  if (edge >= num_edges_) break;
408  if (backward_edge(edge)) // skip back links
409  while (!last_edge(edge++));
410 
411  edge--;
412  }
413  }
414  return true;
415 }
416 
417 } // namespace tesseract

The documentation for this class was generated from the following files:
dawg.h
dawg.cpp
tesseract::SquishedDawg::next_node
NODE_REF next_node(EDGE_REF edge) const override
Definition: dawg.h:459
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::File::Open
static FILE * Open(const std::string &filename, const std::string &mode)
Definition: fileio.cpp:54
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dawg::next_node_from_edge_rec
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:123
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::Dawg::end_of_word_from_edge_rec
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
STRING
Definition: strngs.h:45
tesseract::SquishedDawg::SquishedDawg
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:407
tesseract::Dawg::unichar_id_from_edge_rec
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
MAX_NODE_EDGES_DISPLAY
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:81
file
Definition: include_gunit.h:22
tesseract::Dawg::init
void init(int unicharset_size)
Definition: dawg.cpp:190
tesseract::SquishedDawg::end_of_word
bool end_of_word(EDGE_REF edge_ref) const override
Definition: dawg.h:465
tesseract::Dawg::Dawg
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:199
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
tesseract::DawgType
DawgType
Definition: dawg.h:66
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::SquishedDawg::write_squished_dawg
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:382
tesseract::SquishedDawg::edge_letter
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:470
tesseract::SquishedDawg::print_node
void print_node(NODE_REF node, int max_num_edges) const override
Definition: dawg.cpp:254
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50