tesseract
5.0.0-alpha-619-ge9db
|
Go to the documentation of this file.
37 #define NO_EDGE (int64_t) 0xffffffffffffffffi64
40 #define NO_EDGE (int64_t) 0xffffffffffffffffll
80 #define FORWARD_EDGE (int32_t) 0
81 #define BACKWARD_EDGE (int32_t) 1
82 #define MAX_NODE_EDGES_DISPLAY (int64_t) 100
83 #define MARKER_FLAG (int64_t) 1
84 #define DIRECTION_FLAG (int64_t) 2
85 #define WERD_END_FLAG (int64_t) 4
86 #define LETTER_START_BIT 0
87 #define NUM_FLAG_BITS 3
88 #define REFFORMAT "%" PRId64
91 {
false,
true,
true,
false },
92 {
true,
false,
false,
false },
93 {
true,
false,
false,
false },
94 {
false,
false,
false,
false },
97 static const char kWildcard[] =
"*";
140 bool enable_wildcard)
const;
150 std::function<
void(
const char*)> cb)
const;
156 bool word_end)
const = 0;
161 bool word_end)
const = 0;
254 curr_word_end, curr_unichar_id))
return 0;
255 if (unichar_id > curr_unichar_id)
return 1;
256 if (unichar_id == curr_unichar_id) {
257 if (
next_node > curr_next_node)
return 1;
259 if (word_end > curr_word_end)
return 1;
273 return ((unichar_id == other_unichar_id) &&
275 (!word_end || (word_end == other_word_end)));
280 void init(
int unicharset_size);
384 const char *debug_msg) {
386 if (
data_[i] == new_pos)
return false;
417 num_forward_edges_in_node0 = num_forward_edges(0);
424 num_edges_(num_edges) {
425 init(unicharset_size);
426 num_forward_edges_in_node0 = num_forward_edges(0);
427 if (debug_level > 3) print_all(
"SquishedDawg:");
433 if (!read_squished_dawg(fp))
return false;
434 num_forward_edges_in_node0 = num_forward_edges(0);
438 int NumEdges() {
return num_edges_; }
442 bool word_end)
const override;
447 bool word_end)
const override {
449 if (!edge_occupied(edge) || edge == NO_EDGE)
return;
450 assert(forward_edge(edge));
455 }
while (!last_edge(edge++));
486 file.OpenWrite(
nullptr);
488 tprintf(
"Error serializing %s\n", filename);
491 if (!
file.CloseWrite(filename,
nullptr)) {
492 tprintf(
"Error writing file %s\n", filename);
504 inline void set_empty_edge(
EDGE_REF edge_ref) {
508 inline void clear_all_edges() {
509 for (
int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
512 inline void clear_marker_flag(
EDGE_REF edge_ref) {
516 inline bool forward_edge(
EDGE_REF edge_ref)
const {
517 return (edge_occupied(edge_ref) &&
521 inline bool backward_edge(
EDGE_REF edge_ref)
const {
522 return (edge_occupied(edge_ref) &&
526 inline bool edge_occupied(
EDGE_REF edge_ref)
const {
530 inline bool last_edge(
EDGE_REF edge_ref)
const {
535 int32_t num_forward_edges(
NODE_REF node)
const;
538 bool read_squished_dawg(TFile *
file);
541 void print_edge(
EDGE_REF edge)
const;
544 void print_all(
const char* msg) {
545 tprintf(
"\n__________________________\n%s\n", msg);
546 for (
int i = 0; i < num_edges_; ++i) print_edge(i);
547 tprintf(
"__________________________\n");
550 std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes)
const;
554 int32_t num_edges_ = 0;
555 int num_forward_edges_in_node0 = 0;
560 #endif // DICT_DAWG_H_
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual bool end_of_word(EDGE_REF edge_ref) const =0
NODE_REF next_node(EDGE_REF edge) const override
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
static FILE * Open(const std::string &filename, const std::string &mode)
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
const STRING & lang() const
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
bool operator==(const DawgPosition &other)
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
GenericVector< NodeChild > NodeChildVector
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
virtual void print_node(NODE_REF node, int max_num_edges) const =0
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
int push_back(DawgPosition object)
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
void init(int unicharset_size)
bool end_of_word(EDGE_REF edge_ref) const override
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
DLLSYM void tprintf(const char *format,...)
static const UNICHAR_ID kPatternUnicharID
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void print_node(NODE_REF node, int max_num_edges) const override
PermuterType permuter() const