tesseract
4.0.0-1-g2a2b
|
#include <trie.h>
Public Types | |
enum | RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE } |
Public Member Functions | |
Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level) | |
virtual | ~Trie () |
void | clear () |
EDGE_REF | edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const |
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const |
NODE_REF | next_node (EDGE_REF edge_ref) const |
bool | end_of_word (EDGE_REF edge_ref) const |
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const |
void | KillEdge (EDGE_RECORD *edge_rec) const |
bool | DeadEdge (const EDGE_RECORD &edge_rec) const |
void | print_node (NODE_REF node, int max_num_edges) const |
SquishedDawg * | trie_to_dawg () |
bool | read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse) |
bool | read_word_list (const char *filename, GenericVector< STRING > *words) |
bool | add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy) |
bool | read_pattern_list (const char *filename, const UNICHARSET &unicharset) |
void | initialize_patterns (UNICHARSET *unicharset) |
void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const |
virtual EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const |
bool | add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions) |
bool | add_word_to_dawg (const WERD_CHOICE &word) |
Public Member Functions inherited from tesseract::Dawg | |
DawgType | type () const |
const STRING & | lang () const |
PermuterType | permuter () const |
virtual | ~Dawg () |
bool | word_in_dawg (const WERD_CHOICE &word) const |
Returns true if the given word is in the Dawg. More... | |
bool | prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const |
int | check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const |
void | iterate_words (const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const |
void | iterate_words (const UNICHARSET &unicharset, TessCallback1< const char *> *cb) const |
Static Public Member Functions | |
static const char * | get_reverse_policy_name (RTLReversePolicy reverse_policy) |
Static Public Attributes | |
static const int | kSaneNumConcreteChars = 0 |
static const char | kAlphaPatternUnicode [] = "\u2000" |
static const char | kDigitPatternUnicode [] = "\u2001" |
static const char | kAlphanumPatternUnicode [] = "\u2002" |
static const char | kPuncPatternUnicode [] = "\u2003" |
static const char | kLowerPatternUnicode [] = "\u2004" |
static const char | kUpperPatternUnicode [] = "\u2005" |
Static Public Attributes inherited from tesseract::Dawg | |
static const int16_t | kDawgMagicNumber = 42 |
Magic number to determine endianness when reading the Dawg from file. More... | |
static const UNICHAR_ID | kPatternUnicharID = 0 |
Protected Member Functions | |
EDGE_RECORD * | deref_edge_ref (EDGE_REF edge_ref) const |
EDGE_REF | make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const |
void | link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | print_edge_rec (const EDGE_RECORD &edge_rec) const |
bool | can_be_eliminated (const EDGE_RECORD &edge_rec) |
void | print_all (const char *msg, int max_num_edges) |
bool | edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const |
bool | add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
bool | add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id) |
void | add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id) |
NODE_REF | new_dawg_node () |
void | remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) |
bool | eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2) |
bool | reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes) |
void | sort_edges (EDGE_VECTOR *edges) |
void | reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes) |
UNICHAR_ID | character_class_to_pattern (char ch) |
Protected Member Functions inherited from tesseract::Dawg | |
Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level) | |
NODE_REF | next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the next node visited by following this edge. More... | |
bool | marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the marker flag of this edge. More... | |
int | direction_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the direction flag of this edge. More... | |
bool | end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns true if this edge marks the end of a word. More... | |
UNICHAR_ID | unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns UNICHAR_ID recorded in this edge. More... | |
void | set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value) |
Sets the next node link for this edge in the Dawg. More... | |
void | set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec) |
Sets this edge record to be the last one in a sequence of edges. More... | |
int | given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const |
bool | edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const |
void | init (int unicharset_size) |
bool | match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const |
void | iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const |
Protected Attributes | |
TRIE_NODES | nodes_ |
uint64_t | num_edges_ |
uint64_t | deref_direction_mask_ |
uint64_t | deref_node_index_mask_ |
GenericVector< EDGE_INDEX > | root_back_freelist_ |
bool | initialized_patterns_ |
UNICHAR_ID | alpha_pattern_ |
UNICHAR_ID | digit_pattern_ |
UNICHAR_ID | alphanum_pattern_ |
UNICHAR_ID | punc_pattern_ |
UNICHAR_ID | lower_pattern_ |
UNICHAR_ID | upper_pattern_ |
Protected Attributes inherited from tesseract::Dawg | |
DawgType | type_ |
STRING | lang_ |
PermuterType | perm_ |
Permuter code that should be used if the word is found in this Dawg. More... | |
int | unicharset_size_ |
int | flag_start_bit_ |
int | next_node_start_bit_ |
uint64_t | next_node_mask_ |
uint64_t | flags_mask_ |
uint64_t | letter_mask_ |
int | debug_level_ |
Concrete class for the Trie data structure (extends the Dawg base class) that stores a list of words and allows new words to be added dynamically. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.
|
inline |
Definition at line 88 of file trie.h.
|
protected |
Definition at line 121 of file trie.cpp.
|
inlineprotected |
Definition at line 355 of file trie.h.
|
protected |
Definition at line 157 of file trie.cpp.
bool tesseract::Trie::add_word_list | ( | const GenericVector< STRING > & | words, |
const UNICHARSET & | unicharset, | ||
Trie::RTLReversePolicy | reverse_policy | ||
) |
Definition at line 318 of file trie.cpp.
bool tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word, |
const GenericVector< bool > * | repetitions | ||
) |
Definition at line 174 of file trie.cpp.
|
inline |
Definition at line 268 of file trie.h.
|
inlineprotected |
Definition at line 325 of file trie.h.
|
protected |
void tesseract::Trie::clear | ( | ) |
Definition at line 62 of file trie.cpp.
|
inline |
Definition at line 157 of file trie.h.
|
inlineprotected |
|
inlinevirtual |
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
Definition at line 103 of file trie.h.
|
protected |
Definition at line 70 of file trie.cpp.
|
inlinevirtual |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 147 of file trie.h.
|
protected |
Definition at line 563 of file trie.cpp.
|
inlinevirtual |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
Definition at line 141 of file trie.h.
|
static |
void tesseract::Trie::initialize_patterns | ( | UNICHARSET * | unicharset | ) |
Definition at line 342 of file trie.cpp.
|
inline |
|
inlineprotected |
|
inlineprotected |
|
protected |
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 132 of file trie.h.
|
inlinevirtual |
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD. Returns NO_EDGE otherwise.
Reimplemented from tesseract::Dawg.
Definition at line 246 of file trie.h.
|
inlineprotected |
Definition at line 333 of file trie.h.
|
inlineprotected |
Prints the given EDGE_RECORD.
Definition at line 316 of file trie.h.
|
virtual |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges edges will be printed.
Implements tesseract::Dawg.
Definition at line 702 of file trie.cpp.
bool tesseract::Trie::read_and_add_word_list | ( | const char * | filename, |
const UNICHARSET & | unicharset, | ||
Trie::RTLReversePolicy | reverse | ||
) |
Definition at line 286 of file trie.cpp.
bool tesseract::Trie::read_pattern_list | ( | const char * | filename, |
const UNICHARSET & | unicharset | ||
) |
Definition at line 399 of file trie.cpp.
bool tesseract::Trie::read_word_list | ( | const char * | filename, |
GenericVector< STRING > * | words | ||
) |
Definition at line 295 of file trie.cpp.
|
protected |
Definition at line 610 of file trie.cpp.
|
protected |
Eliminates any redundant edges from this node in the Trie.
Definition at line 665 of file trie.cpp.
|
inlineprotected |
Definition at line 380 of file trie.h.
|
protected |
Definition at line 481 of file trie.cpp.
|
protected |
Orders num_edges consecutive EDGE_RECORDs in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since the number of edges in each node is usually quite small, selection sort is used.
Definition at line 651 of file trie.cpp.
SquishedDawg * tesseract::Trie::trie_to_dawg | ( | ) |
Definition at line 516 of file trie.cpp.
|
virtual |
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented from tesseract::Dawg.
Definition at line 359 of file trie.cpp.
|
inlinevirtual |
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
Definition at line 116 of file trie.h.
|
protected |
|
protected |
|
protected |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
protected |
|
protected |
|
protected |
|
protected |
|
protected |