tesseract  5.0.0-alpha-619-ge9db
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }
 

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
 ~Trie () override
 
void clear ()
 
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const override
 
NODE_REF next_node (EDGE_REF edge_ref) const override
 
bool end_of_word (EDGE_REF edge_ref) const override
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const override
 
void KillEdge (EDGE_RECORD *edge_rec) const
 
bool DeadEdge (const EDGE_RECORD &edge_rec) const
 
void print_node (NODE_REF node, int max_num_edges) const override
 
SquishedDawgtrie_to_dawg ()
 
bool read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
 
bool read_word_list (const char *filename, GenericVector< STRING > *words)
 
bool add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
 
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
 
void initialize_patterns (UNICHARSET *unicharset)
 
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const override
 
EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
 
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
 
bool add_word_to_dawg (const WERD_CHOICE &word)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRINGlang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const char *)> cb) const
 

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)
 

Static Public Attributes

static const int kSaneNumConcreteChars = 0
 
static const char kAlphaPatternUnicode [] = "\u2000"
 
static const char kDigitPatternUnicode [] = "\u2001"
 
static const char kAlphanumPatternUnicode [] = "\u2002"
 
static const char kPuncPatternUnicode [] = "\u2003"
 
static const char kLowerPatternUnicode [] = "\u2004"
 
static const char kUpperPatternUnicode [] = "\u2005"
 
- Static Public Attributes inherited from tesseract::Dawg
static const int16_t kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
 
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
 
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void print_edge_rec (const EDGE_RECORD &edge_rec) const
 
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
 
void print_all (const char *msg, int max_num_edges)
 
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
 
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
 
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
 
NODE_REF new_dawg_node ()
 
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
 
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
 
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
 
void sort_edges (EDGE_VECTOR *edges)
 
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
 
UNICHAR_ID character_class_to_pattern (char ch)
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
 

Protected Attributes

TRIE_NODES nodes_
 
GenericVector< EDGE_INDEXroot_back_freelist_
 
uint64_t num_edges_
 
uint64_t deref_direction_mask_
 
uint64_t deref_node_index_mask_
 
UNICHAR_ID alpha_pattern_
 
UNICHAR_ID digit_pattern_
 
UNICHAR_ID alphanum_pattern_
 
UNICHAR_ID punc_pattern_
 
UNICHAR_ID lower_pattern_
 
UNICHAR_ID upper_pattern_
 
bool initialized_patterns_
 
- Protected Attributes inherited from tesseract::Dawg
STRING lang_
 
DawgType type_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
uint64_t next_node_mask_ = 0
 
uint64_t flags_mask_ = 0
 
uint64_t letter_mask_ = 0
 
int unicharset_size_
 
int flag_start_bit_ = 0
 
int next_node_start_bit_ = 0
 
int debug_level_
 

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 54 of file trie.h.

Member Enumeration Documentation

◆ RTLReversePolicy

Enumerator
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 56 of file trie.h.

Constructor & Destructor Documentation

◆ Trie()

tesseract::Trie::Trie ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 81 of file trie.h.

84  : Dawg(type, lang, perm, debug_level) {
85  init(unicharset_size);
86  num_edges_ = 0;
88  new_dawg_node(); // need to allocate node 0
89  initialized_patterns_ = false;

◆ ~Trie()

tesseract::Trie::~Trie ( )
inlineoverride

Definition at line 90 of file trie.h.

Member Function Documentation

◆ add_edge_linkage()

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 130 of file trie.cpp.

134  {
135  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
136  (*vec)[edge_index] = edge_rec;
137  } else if (search_index < vec->size()) {
138  vec->insert(edge_rec, search_index);
139  } else {
140  vec->push_back(edge_rec);
141  }
142  if (debug_level_ > 1) {
143  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
144  print_edge_rec(edge_rec);
145  tprintf("\n");
146  }
147  num_edges_++;
148  return true;
149 }
150 
151 void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
152  NODE_REF the_next_node,
153  bool marker_flag,
154  UNICHAR_ID unichar_id) {
155  EDGE_RECORD *back_edge_ptr;
156  EDGE_INDEX back_edge_index;
157  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
158  unichar_id, &back_edge_ptr, &back_edge_index));
159  if (marker_flag) {
160  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
161  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
162  }
163  // Mark both directions as end of word.
164  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);

◆ add_new_edge()

bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 348 of file trie.h.

350  {
351  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
352  word_end, unichar_id) &&
353  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
354  word_end, unichar_id));

◆ add_word_ending()

void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 166 of file trie.cpp.

169  {
170  if (word.length() <= 0) return false; // can't add empty words
171  if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
172  // Make sure the word does not contain invalid unchar ids.
173  for (int i = 0; i < word.length(); ++i) {
174  if (word.unichar_id(i) < 0 ||
175  word.unichar_id(i) >= unicharset_size_) return false;
176  }
177 
178  EDGE_RECORD *edge_ptr;
179  NODE_REF last_node = 0;
180  NODE_REF the_next_node;
181  bool marker_flag = false;

◆ add_word_list()

bool tesseract::Trie::add_word_list ( const GenericVector< STRING > &  words,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse_policy 
)

Definition at line 327 of file trie.cpp.

327  : word '%s' not in DAWG after adding it\n",
328  words[i].c_str());
329  return false;
330  }
331  }
332  }
333  return true;
334 }
335 
336 void Trie::initialize_patterns(UNICHARSET *unicharset) {
337  unicharset->unichar_insert(kAlphaPatternUnicode);
338  alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
339  unicharset->unichar_insert(kDigitPatternUnicode);
340  digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
341  unicharset->unichar_insert(kAlphanumPatternUnicode);
342  alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
343  unicharset->unichar_insert(kPuncPatternUnicode);
344  punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
345  unicharset->unichar_insert(kLowerPatternUnicode);
346  lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
347  unicharset->unichar_insert(kUpperPatternUnicode);
348  upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
349  initialized_patterns_ = true;

◆ add_word_to_dawg() [1/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word)
inline

Definition at line 261 of file trie.h.

262  {
263  return add_word_to_dawg(word, nullptr);

◆ add_word_to_dawg() [2/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const GenericVector< bool > *  repetitions 
)

Definition at line 183 of file trie.cpp.

189  : ");
190 
191  UNICHAR_ID unichar_id;
192  for (i = 0; i < word.length() - 1; ++i) {
193  unichar_id = word.unichar_id(i);
194  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
195  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
196  if (still_finding_chars) {
197  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
198  unichar_id, &edge_ptr, &edge_index);
199  if (found && debug_level_ > 1) {
200  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
201  edge_index, last_node);
202  }
203  if (!found) {
204  still_finding_chars = false;
205  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
206  // We hit the end of an existing word, but the new word is longer.
207  // In this case we have to disconnect the existing word from the
208  // backwards root node, mark the current position as end-of-word
209  // and add new nodes for the increased length. Disconnecting the
210  // existing word from the backwards root node requires a linear
211  // search, so it is much faster to add the longest words first,
212  // to avoid having to come here.
213  word_end = true;
214  still_finding_chars = false;
215  remove_edge(last_node, 0, word_end, unichar_id);
216  } else {
217  // We have to add a new branch here for the new word.
218  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
219  last_node = next_node_from_edge_rec(*edge_ptr);
220  }
221  }
222  if (!still_finding_chars) {
223  the_next_node = new_dawg_node();
224  if (debug_level_ > 1)
225  tprintf("adding node " REFFORMAT "\n", the_next_node);
226  if (the_next_node == 0) {
227  add_failed = true;
228  break;
229  }
230  if (!add_new_edge(last_node, the_next_node,
231  marker_flag, word_end, unichar_id)) {
232  add_failed = true;
233  break;
234  }
235  word_end = false;
236  last_node = the_next_node;
237  }
238  }
239  the_next_node = 0;
240  unichar_id = word.unichar_id(i);
241  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
242  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
243  if (still_finding_chars &&
244  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
245  unichar_id, &edge_ptr, &edge_index)) {
246  // An extension of this word already exists in the trie, so we
247  // only have to add the ending flags in both directions.
248  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
249  marker_flag, unichar_id);
250  } else {
251  // Add a link to node 0. All leaves connect to node 0 so the back links can
252  // be used in reduction to a dawg. This root backward node has one edge
253  // entry for every word, (except prefixes of longer words) so it is huge.
254  if (!add_failed &&
255  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
256  add_failed = true;
257  }
258  if (add_failed) {
259  tprintf("Re-initializing document dictionary...\n");
260  clear();
261  return false;
262  } else {
263  return true;
264  }
265 }
266 
267 NODE_REF Trie::new_dawg_node() {
268  auto *node = new TRIE_NODE_RECORD();
269  nodes_.push_back(node);
270  return nodes_.size() - 1;
271 }
272 
273 // Sort function to sort words by decreasing order of length.
274 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
275  const auto *s1 = static_cast<const STRING *>(v1);
276  const auto *s2 = static_cast<const STRING *>(v2);
277  return s2->length() - s1->length();
278 }
279 
280 bool Trie::read_and_add_word_list(const char *filename,

◆ can_be_eliminated()

bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec)
inlineprotected

Definition at line 318 of file trie.h.

319  {
320  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321  return (node_ref != NO_EDGE &&
322  nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);

◆ character_class_to_pattern()

UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

Definition at line 390 of file trie.cpp.

394  {
395  if (!initialized_patterns_) {
396  tprintf("please call initialize_patterns() before read_pattern_list()\n");
397  return false;
398  }
399 
400  FILE *pattern_file = fopen(filename, "rb");
401  if (pattern_file == nullptr) {
402  tprintf("Error opening pattern file %s\n", filename);
403  return false;
404  }
405 
406  int pattern_count = 0;

◆ clear()

void tesseract::Trie::clear ( )

Definition at line 71 of file trie.cpp.

71  {
72  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
73  }
74  }
75  if (node_ref == NO_EDGE) return false;
76  assert(node_ref < nodes_.size());
77  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?

◆ DeadEdge()

bool tesseract::Trie::DeadEdge ( const EDGE_RECORD edge_rec) const
inline

Definition at line 150 of file trie.h.

151  {
152  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;

◆ deref_edge_ref()

EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inlineprotected

Definition at line 283 of file trie.h.

284  {
285  int edge_index = static_cast<int>(
286  (edge_ref & letter_mask_) >> LETTER_START_BIT);
287  int node_index = static_cast<int>(
288  (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
289  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
290  return &(node_rec->forward_edges[edge_index]);

◆ edge_char_of() [1/2]

bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const
protected

Definition at line 79 of file trie.cpp.

80  { // binary search
81  EDGE_INDEX start = 0;
82  EDGE_INDEX end = vec_size - 1;
83  EDGE_INDEX k;
84  int compare;
85  while (start <= end) {
86  k = (start + end) >> 1; // (start + end) / 2
87  compare = given_greater_than_edge_rec(next_node, word_end,
88  unichar_id, vec[k]);
89  if (compare == 0) { // given == vec[k]
90  *edge_ptr = &(vec[k]);
91  *edge_index = k;
92  return true;
93  } else if (compare == 1) { // given > vec[k]
94  start = k + 1;
95  } else { // given < vec[k]
96  end = k - 1;
97  }
98  }
99  } else { // linear search
100  for (int i = 0; i < vec_size; ++i) {
101  EDGE_RECORD &edge_rec = vec[i];
102  if (edge_rec_match(next_node, word_end, unichar_id,
103  next_node_from_edge_rec(edge_rec),
104  end_of_word_from_edge_rec(edge_rec),
105  unichar_id_from_edge_rec(edge_rec))) {
106  *edge_ptr = &(edge_rec);
107  *edge_index = i;
108  return true;
109  }
110  }
111  }
112  return false; // not found
113 }
114 
115 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
116  int direction, bool word_end,
117  UNICHAR_ID unichar_id) {
118  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
119  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
120  int search_index;
121  if (node1 == 0 && direction == FORWARD_EDGE) {
122  search_index = 0; // find the index to make the add sorted
123  while (search_index < vec->size() &&
124  given_greater_than_edge_rec(node2, word_end, unichar_id,
125  (*vec)[search_index]) == 1) {
126  search_index++;
127  }
128  } else {

◆ edge_char_of() [2/2]

EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlineoverridevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 96 of file trie.h.

98  {
99  EDGE_RECORD *edge_ptr;
100  EDGE_INDEX edge_index;
101  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
102  &edge_ptr, &edge_index)) return NO_EDGE;
103  return make_edge_ref(node_ref, edge_index);

◆ edge_letter()

UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 140 of file trie.h.

141  {
142  if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
143  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));

◆ eliminate_redundant_edges()

bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
)
protected

Definition at line 572 of file trie.cpp.

578  {
579  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
580  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
581  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
582  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
583  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
584  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
585  curr_word_end, curr_unichar_id);
586  // Relocate the corresponding forward edge in curr_next_node
587  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
588  curr_word_end, curr_unichar_id,
589  &edge_ptr, &edge_index));
590  set_next_node_in_edge_rec(edge_ptr, next_node1);
591  }
592  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
593  next_node2_ptr->backward_edges.size());
594  if (debug_level_ > 1) {
595  tprintf("removed %d edges from node " REFFORMAT "\n",
596  next_node2_num_edges, next_node2);
597  }
598  next_node2_ptr->forward_edges.clear();
599  next_node2_ptr->backward_edges.clear();
600  num_edges_ -= next_node2_num_edges;
601  return true;
602 }
603 
605  UNICHAR_ID unichar_id,
606  NODE_REF node,
607  EDGE_VECTOR* backward_edges,
608  NODE_MARKER reduced_nodes) {
609  if (debug_level_ > 1)
610  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
611  // Compare each of the edge pairs with the given unichar_id.
612  bool did_something = false;
613  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
614  // Find the first edge that can be eliminated.
615  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
616  while (i < backward_edges->size()) {
617  if (!DeadEdge((*backward_edges)[i])) {

◆ end_of_word()

bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 134 of file trie.h.

135  {
136  if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
137  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));

◆ get_reverse_policy_name()

const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

Definition at line 66 of file trie.cpp.

66  {
67  if (debug_level_ == 3) {
68  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT

◆ initialize_patterns()

void tesseract::Trie::initialize_patterns ( UNICHARSET unicharset)

Definition at line 351 of file trie.cpp.

355  {
356  bool is_alpha = unicharset.get_isalpha(unichar_id);
357  if (is_alpha) {
358  vec->push_back(alpha_pattern_);
359  vec->push_back(alphanum_pattern_);
360  if (unicharset.get_islower(unichar_id)) {
361  vec->push_back(lower_pattern_);
362  } else if (unicharset.get_isupper(unichar_id)) {
363  vec->push_back(upper_pattern_);
364  }
365  }
366  if (unicharset.get_isdigit(unichar_id)) {

◆ KillEdge()

void tesseract::Trie::KillEdge ( EDGE_RECORD edge_rec) const
inline

Definition at line 146 of file trie.h.

147  {
148  *edge_rec &= ~letter_mask_;
149  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);

◆ link_edge()

void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Sets up this edge record to the requested values.

Definition at line 298 of file trie.h.

300  {
301  EDGE_RECORD flags = 0;
302  if (repeats) flags |= MARKER_FLAG;
303  if (word_end) flags |= WERD_END_FLAG;
304  if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
305  *edge = ((nxt << next_node_start_bit_) |
306  (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
307  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));

◆ make_edge_ref()

EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inlineprotected

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 292 of file trie.h.

294  {
295  return ((node_index << flag_start_bit_) |
296  (edge_index << LETTER_START_BIT));

◆ new_dawg_node()

NODE_REF tesseract::Trie::new_dawg_node ( )
protected

Definition at line 282 of file trie.cpp.

282  {
283  GenericVector<STRING> word_list;
284  if (!read_word_list(filename, &word_list)) return false;
285  word_list.sort(sort_strings_by_dec_length);
286  return add_word_list(word_list, unicharset, reverse_policy);

◆ next_node()

NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 125 of file trie.h.

126  {
127  if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
128  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));

◆ pattern_loop_edge()

EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlineoverridevirtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 239 of file trie.h.

242  {
243  if (edge_ref == NO_EDGE) return NO_EDGE;
244  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245  return (marker_flag_from_edge_rec(*edge_rec) &&
246  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247  word_end == end_of_word_from_edge_rec(*edge_rec)) ?
248  edge_ref : NO_EDGE;

◆ print_all()

void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inlineprotected

Definition at line 326 of file trie.h.

327  {
328  tprintf("\n__________________________\n%s\n", msg);
329  for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
330  tprintf("__________________________\n");

◆ print_edge_rec()

void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const
inlineprotected

Prints the given EDGE_RECORD.

Definition at line 309 of file trie.h.

310  {
311  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
312  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
313  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
314  end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
315  unichar_id_from_edge_rec(edge_rec));

◆ print_node()

void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
overridevirtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 711 of file trie.cpp.

711  : i < num_bkw) &&
712  i < max_num_edges; ++i) {
713  if (DeadEdge((*vec)[i])) continue;
714  print_edge_rec((*vec)[i]);
715  tprintf(" ");
716  }
717  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
718  tprintf("\n");
719  }
720 }
721 
722 } // namespace tesseract

◆ read_and_add_word_list()

bool tesseract::Trie::read_and_add_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 295 of file trie.cpp.

298  {
299  chomp_string(line_str); // remove newline
300  STRING word_str(line_str);
301  ++word_count;
302  if (debug_level_ && word_count % 10000 == 0)

◆ read_pattern_list()

bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 408 of file trie.cpp.

408  {
409  chomp_string(string); // remove newline
410  // Parse the pattern and construct a unichar id vector.
411  // Record the number of repetitions of each unichar in the parallel vector.
412  WERD_CHOICE word(&unicharset);
413  GenericVector<bool> repetitions_vec;
414  const char *str_ptr = string;
415  int step = unicharset.step(str_ptr);
416  bool failed = false;
417  while (step > 0) {
418  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
419  if (step == 1 && *str_ptr == '\\') {
420  ++str_ptr;
421  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
422  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
423  } else {
424  if (word.length() < kSaneNumConcreteChars) {
425  tprintf("Please provide at least %d concrete characters at the"
426  " beginning of the pattern\n", kSaneNumConcreteChars);
427  failed = true;
428  break;
429  }
430  // Parse character class from expression.
431  curr_unichar_id = character_class_to_pattern(*str_ptr);
432  }
433  } else {
434  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
435  }
436  if (curr_unichar_id == INVALID_UNICHAR_ID) {
437  failed = true;
438  break; // failed to parse this pattern
439  }
440  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
441  repetitions_vec.push_back(false);
442  str_ptr += step;
443  step = unicharset.step(str_ptr);
444  // Check if there is a repetition pattern specified after this unichar.
445  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
446  repetitions_vec[repetitions_vec.size()-1] = true;
447  str_ptr += 2;
448  step = unicharset.step(str_ptr);
449  }
450  }
451  if (failed) {
452  tprintf("Invalid user pattern %s\n", string);
453  continue;
454  }
455  // Insert the pattern into the trie.
456  if (debug_level_ > 2) {
457  tprintf("Inserting expanded user pattern %s\n",
458  word.debug_string().c_str());
459  }
460  if (!this->word_in_dawg(word)) {
461  this->add_word_to_dawg(word, &repetitions_vec);
462  if (!this->word_in_dawg(word)) {
463  tprintf("Error: failed to insert pattern '%s'\n", string);
464  }
465  }
466  ++pattern_count;
467  }
468  if (debug_level_) {
469  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
470  }
471  fclose(pattern_file);
472  return true;
473 }
474 
475 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
476  bool word_end, UNICHAR_ID unichar_id) {
477  EDGE_RECORD *edge_ptr = nullptr;
478  EDGE_INDEX edge_index = 0;
479  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
480  unichar_id, &edge_ptr, &edge_index));
481  if (debug_level_ > 1) {
482  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
483  print_edge_rec(*edge_ptr);
484  tprintf("\n");
485  }
486  if (direction == FORWARD_EDGE) {
487  nodes_[node1]->forward_edges.remove(edge_index);
488  } else if (node1 == 0) {

◆ read_word_list()

bool tesseract::Trie::read_word_list ( const char *  filename,
GenericVector< STRING > *  words 
)

Definition at line 304 of file trie.cpp.

314  {
315  for (int i = 0; i < words.size(); ++i) {
316  WERD_CHOICE word(words[i].c_str(), unicharset);
317  if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
318  continue;
319  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
320  word.has_rtl_unichar_id()) ||
321  reverse_policy == RRP_FORCE_REVERSE) {
322  word.reverse_and_mirror_unichar_ids();
323  }
324  if (!word_in_dawg(word)) {
325  add_word_to_dawg(word);

◆ reduce_lettered_edges()

bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
EDGE_VECTOR backward_edges,
NODE_MARKER  reduced_nodes 
)
protected

Definition at line 619 of file trie.cpp.

627  {
628  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
629  if (DeadEdge(next_edge_rec)) continue;
630  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
631  if (next_id != unichar_id) break;
632  if (end_of_word_from_edge_rec(next_edge_rec) ==
633  end_of_word_from_edge_rec(edge_rec) &&
634  can_be_eliminated(next_edge_rec) &&
635  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
636  reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
637  did_something = true;
638  KillEdge(&(*backward_edges)[j]);
639  }
640  }
641  }
642  return did_something;
643 }
644 
645 void Trie::sort_edges(EDGE_VECTOR *edges) {
646  int num_edges = edges->size();
647  if (num_edges <= 1) return;
649  sort_vec.reserve(num_edges);
650  for (int i = 0; i < num_edges; ++i) {
651  sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
652  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
653  }
654  sort_vec.sort();
655  for (int i = 0; i < num_edges; ++i)
656  (*edges)[i] = sort_vec[i].data;
657 }
658 

◆ reduce_node_input()

void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

Definition at line 674 of file trie.cpp.

675  {
676  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
677  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
678  }
679  }
680  reduced_nodes[node] = true; // mark as reduced
681 
682  if (debug_level_ > 1) {
683  tprintf("Node " REFFORMAT " after reduction:\n", node);
685  }
686 
687  for (int i = 0; i < backward_edges.size(); ++i) {
688  if (DeadEdge(backward_edges[i])) continue;
689  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
690  if (next_node != 0 && !reduced_nodes[next_node]) {
691  reduce_node_input(next_node, reduced_nodes);
692  }
693  }
694 }
695 
696 void Trie::print_node(NODE_REF node, int max_num_edges) const {
697  if (node == NO_EDGE) return; // nothing to print
698  TRIE_NODE_RECORD *node_ptr = nodes_[node];
699  int num_fwd = node_ptr->forward_edges.size();
700  int num_bkw = node_ptr->backward_edges.size();
701  EDGE_VECTOR *vec;
702  for (int dir = 0; dir < 2; ++dir) {
703  if (dir == 0) {
704  vec = &(node_ptr->forward_edges);
705  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
706  } else {
707  vec = &(node_ptr->backward_edges);
708  tprintf("\t");
709  }

◆ remove_edge()

void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 373 of file trie.h.

375  {
376  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
377  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);

◆ remove_edge_linkage()

void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 490 of file trie.cpp.

491  {
492  nodes_[node1]->backward_edges.remove(edge_index);
493  }
494  --num_edges_;
495 }
496 
497 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
498 // 1 Avoid insertion sorting or bubble sorting the tail root node
499 // (back links on node 0, a list of all the leaves.). The node is
500 // huge, and sorting it with n^2 time is terrible.
501 // 2 Avoid using GenericVector::remove on the tail root node.
502 // (a) During add of words to the trie, zero-out the unichars and
503 // keep a freelist of spaces to re-use.
504 // (b) During reduction, just zero-out the unichars of deleted back
505 // links, skipping zero entries while searching.
506 // 3 Avoid linear search of the tail root node. This has to be done when
507 // a suffix is added to an existing word. Adding words by decreasing
508 // length avoids this problem entirely. Words can still be added in
509 // any order, but it is faster to add the longest first.
510 SquishedDawg *Trie::trie_to_dawg() {

◆ sort_edges()

void tesseract::Trie::sort_edges ( EDGE_VECTOR edges)
protected

Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 660 of file trie.cpp.

660  {
661  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
662  sort_edges(&backward_edges);
663  if (debug_level_ > 1) {
664  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
666  }
667 
668  EDGE_INDEX edge_index = 0;
669  while (edge_index < backward_edges.size()) {
670  if (DeadEdge(backward_edges[edge_index])) continue;
671  UNICHAR_ID unichar_id =
672  unichar_id_from_edge_rec(backward_edges[edge_index]);

◆ trie_to_dawg()

SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 525 of file trie.cpp.

528  {
529  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
530  }
531  int num_forward_edges = node_ref_map[i];
532 
533  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
534  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
535  auto edge_array = new EDGE_RECORD[num_forward_edges];
536  EDGE_ARRAY edge_array_ptr = edge_array;
537  for (i = 0; i < nodes_.size(); ++i) {
538  TRIE_NODE_RECORD *node_ptr = nodes_[i];
539  int end = node_ptr->forward_edges.size();
540  for (j = 0; j < end; ++j) {
541  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
542  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
543  ASSERT_HOST(node_ref < nodes_.size());
544  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
545  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
546  end_of_word_from_edge_rec(edge_rec), unichar_id);
547  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
548  ++edge_array_ptr;
549  }
550  }
551  delete[] node_ref_map;
552 
553  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
555 }
556 
558  const EDGE_RECORD &edge1,
559  const EDGE_RECORD &edge2) {
560  if (debug_level_ > 1) {
561  tprintf("\nCollapsing node %" PRIi64 ":\n", node);
563  tprintf("Candidate edges: ");
564  print_edge_rec(edge1);
565  tprintf(", ");
566  print_edge_rec(edge2);
567  tprintf("\n\n");
568  }
569  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
570  NODE_REF next_node2 = next_node_from_edge_rec(edge2);

◆ unichar_id_to_patterns()

void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const
overridevirtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 368 of file trie.cpp.

370  {
371  vec->push_back(punc_pattern_);
372  }
373 }
374 
376  if (ch == 'c') {
377  return alpha_pattern_;
378  } else if (ch == 'd') {
379  return digit_pattern_;
380  } else if (ch == 'n') {
381  return alphanum_pattern_;
382  } else if (ch == 'p') {
383  return punc_pattern_;
384  } else if (ch == 'a') {
385  return lower_pattern_;
386  } else if (ch == 'A') {
387  return upper_pattern_;
388  } else {

◆ unichar_ids_of()

void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlineoverridevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 109 of file trie.h.

111  {
112  const EDGE_VECTOR &forward_edges =
113  nodes_[static_cast<int>(node)]->forward_edges;
114  for (int i = 0; i < forward_edges.size(); ++i) {
115  if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
116  vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
117  make_edge_ref(node, i)));
118  }
119  }

Member Data Documentation

◆ alpha_pattern_

UNICHAR_ID tesseract::Trie::alpha_pattern_
protected

Definition at line 419 of file trie.h.

◆ alphanum_pattern_

UNICHAR_ID tesseract::Trie::alphanum_pattern_
protected

Definition at line 421 of file trie.h.

◆ deref_direction_mask_

uint64_t tesseract::Trie::deref_direction_mask_
protected

Definition at line 415 of file trie.h.

◆ deref_node_index_mask_

uint64_t tesseract::Trie::deref_node_index_mask_
protected

Definition at line 416 of file trie.h.

◆ digit_pattern_

UNICHAR_ID tesseract::Trie::digit_pattern_
protected

Definition at line 420 of file trie.h.

◆ initialized_patterns_

bool tesseract::Trie::initialized_patterns_
protected

Definition at line 425 of file trie.h.

◆ kAlphanumPatternUnicode

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002"
static

Definition at line 69 of file trie.h.

◆ kAlphaPatternUnicode

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000"
static

Definition at line 67 of file trie.h.

◆ kDigitPatternUnicode

const char tesseract::Trie::kDigitPatternUnicode = "\u2001"
static

Definition at line 68 of file trie.h.

◆ kLowerPatternUnicode

const char tesseract::Trie::kLowerPatternUnicode = "\u2004"
static

Definition at line 71 of file trie.h.

◆ kPuncPatternUnicode

const char tesseract::Trie::kPuncPatternUnicode = "\u2003"
static

Definition at line 70 of file trie.h.

◆ kSaneNumConcreteChars

const int tesseract::Trie::kSaneNumConcreteChars = 0
static

Definition at line 63 of file trie.h.

◆ kUpperPatternUnicode

const char tesseract::Trie::kUpperPatternUnicode = "\u2005"
static

Definition at line 72 of file trie.h.

◆ lower_pattern_

UNICHAR_ID tesseract::Trie::lower_pattern_
protected

Definition at line 423 of file trie.h.

◆ nodes_

TRIE_NODES tesseract::Trie::nodes_
protected

Definition at line 411 of file trie.h.

◆ num_edges_

uint64_t tesseract::Trie::num_edges_
protected

Definition at line 414 of file trie.h.

◆ punc_pattern_

UNICHAR_ID tesseract::Trie::punc_pattern_
protected

Definition at line 422 of file trie.h.

◆ root_back_freelist_

GenericVector<EDGE_INDEX> tesseract::Trie::root_back_freelist_
protected

Definition at line 413 of file trie.h.

◆ upper_pattern_

UNICHAR_ID tesseract::Trie::upper_pattern_
protected

Definition at line 424 of file trie.h.


The documentation for this class was generated from the following files:
string
std::string string
Definition: equationdetect_test.cc:21
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
GenericVector::remove
void remove(int index)
Definition: genericvector.h:765
TRIE_NODE_RECORD::forward_edges
EDGE_VECTOR forward_edges
Definition: trie.h:41
tesseract::Trie::edge_char_of
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:96
tesseract::Trie::upper_pattern_
UNICHAR_ID upper_pattern_
Definition: trie.h:424
tesseract::Trie::deref_node_index_mask_
uint64_t deref_node_index_mask_
Definition: trie.h:416
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::Trie::character_class_to_pattern
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:390
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::Dawg::flag_start_bit_
int flag_start_bit_
Definition: dawg.h:307
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
EDGE_INDEX
int64_t EDGE_INDEX
Definition: trie.h:36
tesseract::Dawg::next_node_from_edge_rec
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:123
tesseract::Dawg::end_of_word_from_edge_rec
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
tesseract::Dawg::set_next_node_in_edge_rec
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:229
STRING
Definition: strngs.h:45
EDGE_ARRAY
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:48
tesseract::Trie::print_node
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:711
tesseract::Trie::alphanum_pattern_
UNICHAR_ID alphanum_pattern_
Definition: trie.h:421
tesseract::Trie::next_node
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:125
tesseract::Trie::add_word_ending
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:166
tesseract::Dawg::given_greater_than_edge_rec
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:245
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
tesseract::Trie::reduce_node_input
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:674
FORWARD_EDGE
#define FORWARD_EDGE
Definition: dawg.h:79
tesseract::Dawg::unichar_id_from_edge_rec
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
tesseract::Trie::kSaneNumConcreteChars
static const int kSaneNumConcreteChars
Definition: trie.h:63
tesseract::Dawg::perm_
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:298
tesseract::Trie::read_word_list
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:304
MAX_NODE_EDGES_DISPLAY
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:81
tesseract::Trie::make_edge_ref
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:292
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::Dawg::edge_rec_match
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:266
tesseract::Trie::alpha_pattern_
UNICHAR_ID alpha_pattern_
Definition: trie.h:419
tesseract::Trie::initialized_patterns_
bool initialized_patterns_
Definition: trie.h:425
tesseract::Trie::punc_pattern_
UNICHAR_ID punc_pattern_
Definition: trie.h:422
chomp_string
void chomp_string(char *str)
Definition: helpers.h:75
tesseract::Trie::digit_pattern_
UNICHAR_ID digit_pattern_
Definition: trie.h:420
tesseract::Trie::sort_edges
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:660
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
tesseract::Trie::RRP_DO_NO_REVERSE
Definition: trie.h:57
tesseract::Dawg::init
void init(int unicharset_size)
Definition: dawg.cpp:190
tesseract::Trie::link_edge
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:298
tesseract::Dawg::next_node_start_bit_
int next_node_start_bit_
Definition: dawg.h:308
EDGE_RECORD
uint64_t EDGE_RECORD
Definition: dawg.h:47
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::Dawg::debug_level_
int debug_level_
Definition: dawg.h:310
tesseract::Dawg::marker_flag_from_edge_rec
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:211
DIRECTION_FLAG
#define DIRECTION_FLAG
Definition: dawg.h:83
tesseract::Dawg::Dawg
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:199
tesseract::Trie::trie_to_dawg
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:525
tesseract::Trie::eliminate_redundant_edges
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:572
tesseract::Trie::add_word_to_dawg
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:183
tesseract::Trie::RTLReversePolicy
RTLReversePolicy
Definition: trie.h:56
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
GenericVector::pop_back
T pop_back()
Definition: genericvector.h:734
LETTER_START_BIT
#define LETTER_START_BIT
Definition: dawg.h:85
tesseract::Trie::KillEdge
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:146
tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TRIE_NODE_RECORD
Definition: trie.h:40
GenericVector< EDGE_RECORD >
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
tesseract::Trie::nodes_
TRIE_NODES nodes_
Definition: trie.h:411
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
tesseract::Dawg::set_marker_flag_in_edge_rec
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:235
tesseract::Dawg::lang_
STRING lang_
Definition: dawg.h:295
tesseract::Trie::remove_edge_linkage
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:490
MARKER_FLAG
#define MARKER_FLAG
Definition: dawg.h:82
tesseract::Trie::add_edge_linkage
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:130
tesseract::Dawg::letter_mask_
uint64_t letter_mask_
Definition: dawg.h:305
tesseract::Trie::root_back_freelist_
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:413
tesseract::Trie::add_word_list
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:327
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Trie::print_edge_rec
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:309
NODE_MARKER
bool * NODE_MARKER
Definition: trie.h:37
WERD_END_FLAG
#define WERD_END_FLAG
Definition: dawg.h:84
tesseract::Dawg::direction_from_edge_rec
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:215
tesseract::Trie::can_be_eliminated
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:318
tesseract::Trie::reduce_lettered_edges
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:619
tesseract::Trie::RRP_FORCE_REVERSE
Definition: trie.h:59
tesseract::Dawg::unicharset_size_
int unicharset_size_
Definition: dawg.h:306
GenericVector::sort
void sort()
Definition: genericvector.h:1102
BACKWARD_EDGE
#define BACKWARD_EDGE
Definition: dawg.h:80
TRIE_NODE_RECORD::backward_edges
EDGE_VECTOR backward_edges
Definition: trie.h:42
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Trie::num_edges_
uint64_t num_edges_
Definition: trie.h:414
tesseract::Dawg::type_
DawgType type_
Definition: dawg.h:296
tesseract::Trie::deref_edge_ref
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:283
tesseract::Trie::DeadEdge
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:150
tesseract::Trie::lower_pattern_
UNICHAR_ID lower_pattern_
Definition: trie.h:423
tesseract::Trie::new_dawg_node
NODE_REF new_dawg_node()
Definition: trie.cpp:282
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50