30 #pragma warning(disable:4244) // Conversion warnings
31 #pragma warning(disable:4800) // int/bool warnings
50 bool requires_complete)
const {
51 if (word.
length() == 0)
return !requires_complete;
53 int end_index = word.
length() - 1;
54 for (
int i = 0; i < end_index; i++) {
56 if (edge == NO_EDGE) {
76 bool enable_wildcard)
const {
77 if (filename ==
NULL)
return 0;
92 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
93 tprintf(
"Missing word: %s\n",
string);
97 tprintf(
"Failed to create a valid word from %s\n",
string);
132 for (
int i = 0; i < children.
size(); i++) {
150 if (wildcard != INVALID_UNICHAR_ID && word->
unichar_id(index) == wildcard) {
151 bool any_matched =
false;
154 for (
int i = 0; i < vec.
size(); ++i) {
162 word_end = index == word->
length() - 1;
164 if (edge != NO_EDGE) {
169 }
else if (node != 0) {
178 PermuterType perm,
int unicharset_size,
int debug_level) {
204 bool word_end)
const {
208 EDGE_REF end = num_forward_edges_in_node0 - 1;
210 while (start <= end) {
211 edge = (start + end) >> 1;
213 unichar_id, edges_[edge]);
216 }
else if (compare == 1) {
223 if (edge != NO_EDGE && edge_occupied(edge)) {
228 }
while (!last_edge(edge++));
238 if (forward_edge (edge)) {
241 }
while (!last_edge(edge++));
248 if (node == NO_EDGE)
return;
251 const char *forward_string =
"FORWARD";
252 const char *backward_string =
" ";
254 const char *last_string =
"LAST";
255 const char *not_last_string =
" ";
257 const char *eow_string =
"EOW";
258 const char *not_eow_string =
" ";
266 if (edge_occupied(edge)) {
269 forward_edge(edge) ? forward_string : backward_string;
270 is_last = last_edge(edge) ? last_string : not_last_string;
271 eow =
end_of_word(edge) ? eow_string : not_eow_string;
276 direction, is_last, eow);
278 if (edge - node > max_num_edges)
return;
279 }
while (!last_edge(edge++));
281 if (edge < num_edges_ &&
282 edge_occupied(edge) && backward_edge(edge)) {
285 forward_edge(edge) ? forward_string : backward_string;
286 is_last = last_edge(edge) ? last_string : not_last_string;
287 eow =
end_of_word(edge) ? eow_string : not_eow_string;
291 ", unichar_id = %d, %s %s %s\n",
293 direction, is_last, eow);
296 }
while (!last_edge(edge++));
305 void SquishedDawg::print_edge(
EDGE_REF edge)
const {
306 if (edge == NO_EDGE) {
310 ", unichar_id = '%d', %s %s %s\n", edge,
312 (forward_edge(edge) ?
"FORWARD" :
" "),
313 (last_edge(edge) ?
"LAST" :
" "),
318 void SquishedDawg::read_squished_dawg(FILE *file,
323 if (debug_level)
tprintf(
"Reading squished dawg\n");
328 fread(&magic,
sizeof(
inT16), 1, file);
332 fread(&unicharset_size,
sizeof(
inT32), 1, file);
333 fread(&num_edges_,
sizeof(
inT32), 1, file);
336 ReverseN(&unicharset_size,
sizeof(unicharset_size));
337 ReverseN(&num_edges_,
sizeof(num_edges_));
340 Dawg::init(type, lang, perm, unicharset_size, debug_level);
343 fread(&edges_[0],
sizeof(
EDGE_RECORD), num_edges_, file);
346 for (edge = 0; edge < num_edges_; ++edge) {
347 ReverseN(&edges_[edge],
sizeof(edges_[edge]));
350 if (debug_level > 2) {
351 tprintf(
"type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
353 for (edge = 0; edge < num_edges_; ++edge)
358 NODE_MAP SquishedDawg::build_node_map(
inT32 *num_nodes)
const {
366 for (edge = 0; edge < num_edges_; edge++)
367 node_map [edge] = -1;
369 node_counter = num_forward_edges(0);
372 for (edge = 0; edge < num_edges_; edge++) {
374 if (forward_edge(edge)) {
376 node_map[edge] = (edge ? node_counter : 0);
377 num_edges = num_forward_edges(edge);
378 if (edge != 0) node_counter += num_edges;
380 if (edge >= num_edges_)
break;
381 if (backward_edge(edge))
while (!last_edge(edge++));
391 inT32 node_count = 0;
398 node_map = build_node_map(&node_count);
402 fwrite(&magic,
sizeof(
inT16), 1, file);
403 fwrite(&unicharset_size_,
sizeof(
inT32), 1, file);
407 for (edge=0; edge < num_edges_; edge++)
408 if (forward_edge(edge))
411 fwrite(&num_edges,
sizeof(
inT32), 1, file);
414 tprintf(
"%d nodes in DAWG\n", node_count);
415 tprintf(
"%d edges in DAWG\n", num_edges);
418 for (edge = 0; edge < num_edges_; edge++) {
419 if (forward_edge(edge)) {
422 set_next_node(edge, node_map[old_index]);
423 temp_record = edges_[edge];
424 fwrite(&(temp_record),
sizeof(
EDGE_RECORD), 1, file);
425 set_next_node(edge, old_index);
426 }
while (!last_edge(edge++));
428 if (edge >= num_edges_)
break;
429 if (backward_edge(edge))
430 while (!last_edge(edge++));
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
const STRING & lang() const
void memfree(void *element)
void set_unichar_id(UNICHAR_ID unichar_id, int index)
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
virtual bool end_of_word(EDGE_REF edge_ref) const =0
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void print_node(NODE_REF node, int max_num_edges) const
NODE_REF next_node(EDGE_REF edge) const
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE * > *cb) const
int direction(EDGEPT *point)
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
void CallWithUTF8(TessCallback1< const char * > *cb, const WERD_CHOICE *wc)
bool end_of_word(EDGE_REF edge_ref) const
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
Returns the edge that corresponds to the letter out of this node.
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
void write_squished_dawg(FILE *file)
Writes the squished/reduced Dawg to a file.
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE * > *cb) const
void chomp_string(char *str)
static const inT16 kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
const UNICHAR_ID unichar_id(int index) const
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
bool match_words(WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
#define MAX_NODE_EDGES_DISPLAY
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
void ReverseN(void *ptr, int num_bytes)
FILE * open_file(const char *filename, const char *mode)
const char * string() const
void init(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0