65 int direction,
bool word_end,
UNICHAR_ID unichar_id,
69 " direction %d word_end %d unichar_id %d, exploring node:\n",
70 node_ref,
next_node, direction, word_end, unichar_id);
71 if (node_ref != NO_EDGE) {
75 if (node_ref == NO_EDGE)
return false;
78 nodes_[node_ref]->forward_edges :
nodes_[node_ref]->backward_edges;
85 while (start <= end) {
86 k = (start + end) >> 1;
90 *edge_ptr = &(vec[k]);
93 }
else if (compare == 1) {
100 for (
int i = 0; i < vec_size; ++i) {
106 *edge_ptr = &(edge_rec);
116 int direction,
bool word_end,
119 &(
nodes_[node1]->forward_edges) : &(
nodes_[node1]->backward_edges);
123 while (search_index < vec->size() &&
125 (*vec)[search_index]) == 1) {
129 search_index = vec->
size();
132 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
136 (*vec)[edge_index] = edge_rec;
137 }
else if (search_index < vec->size()) {
138 vec->
insert(edge_rec, search_index);
158 unichar_id, &back_edge_ptr, &back_edge_index));
170 if (word.
length() <= 0)
return false;
173 for (
int i = 0; i < word.
length(); ++i) {
181 bool marker_flag =
false;
184 int32_t still_finding_chars =
true;
185 int32_t word_end =
false;
186 bool add_failed =
false;
192 for (i = 0; i < word.
length() - 1; ++i) {
194 marker_flag = (repetitions !=
nullptr) ? (*repetitions)[i] :
false;
196 if (still_finding_chars) {
198 unichar_id, &edge_ptr, &edge_index);
201 edge_index, last_node);
204 still_finding_chars =
false;
214 still_finding_chars =
false;
222 if (!still_finding_chars) {
226 if (the_next_node == 0) {
231 marker_flag, word_end, unichar_id)) {
236 last_node = the_next_node;
241 marker_flag = (repetitions !=
nullptr) ? (*repetitions)[i] :
false;
243 if (still_finding_chars &&
245 unichar_id, &edge_ptr, &edge_index)) {
249 marker_flag, unichar_id);
255 !
add_new_edge(last_node, the_next_node, marker_flag,
true, unichar_id))
259 tprintf(
"Re-initializing document dictionary...\n");
274 static int sort_strings_by_dec_length(
const void* v1,
const void* v2) {
275 const auto *s1 = static_cast<const STRING *>(v1);
276 const auto *s2 = static_cast<const STRING *>(v2);
277 return s2->length() - s1->length();
285 word_list.
sort(sort_strings_by_dec_length);
295 word_file = fopen(filename,
"rb");
296 if (word_file ==
nullptr)
return false;
298 while (fgets(line_str,
sizeof(line_str), word_file) !=
nullptr) {
300 STRING word_str(line_str);
303 tprintf(
"Read %d words so far\n", word_count);
307 tprintf(
"Read %d words total.\n", word_count);
315 for (
int i = 0; i < words.
size(); ++i) {
327 tprintf(
"Error: word '%s' not in DAWG after adding it\n",
356 bool is_alpha = unicharset.
get_isalpha(unichar_id);
378 }
else if (ch ==
'd') {
380 }
else if (ch ==
'n') {
382 }
else if (ch ==
'p') {
384 }
else if (ch ==
'a') {
386 }
else if (ch ==
'A') {
389 return INVALID_UNICHAR_ID;
396 tprintf(
"please call initialize_patterns() before read_pattern_list()\n");
400 FILE *pattern_file = fopen(filename,
"rb");
401 if (pattern_file ==
nullptr) {
402 tprintf(
"Error opening pattern file %s\n", filename);
406 int pattern_count = 0;
414 const char *str_ptr =
string;
415 int step = unicharset.
step(str_ptr);
418 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
419 if (step == 1 && *str_ptr ==
'\\') {
421 if (*str_ptr ==
'\\') {
425 tprintf(
"Please provide at least %d concrete characters at the"
436 if (curr_unichar_id == INVALID_UNICHAR_ID) {
443 step = unicharset.
step(str_ptr);
445 if (step == 1 && *str_ptr ==
'\\' && *(str_ptr+1) ==
'*') {
446 repetitions_vec[repetitions_vec.
size()-1] =
true;
448 step = unicharset.
step(str_ptr);
452 tprintf(
"Invalid user pattern %s\n",
string);
457 tprintf(
"Inserting expanded user pattern %s\n",
463 tprintf(
"Error: failed to insert pattern '%s'\n",
string);
469 tprintf(
"Read %d valid patterns from %s\n", pattern_count, filename);
471 fclose(pattern_file);
480 unichar_id, &edge_ptr, &edge_index));
488 }
else if (node1 == 0) {
516 for (
int i = 0; i <
nodes_.
size(); i++) reduced_nodes[i] =
false;
518 delete[] reduced_nodes;
529 node_ref_map[i+1] = node_ref_map[i] +
nodes_[i]->forward_edges.
size();
531 int num_forward_edges = node_ref_map[i];
535 auto edge_array =
new EDGE_RECORD[num_forward_edges];
540 for (j = 0; j < end; ++j) {
551 delete[] node_ref_map;
553 return new SquishedDawg(edge_array, num_forward_edges,
type_,
lang_,
561 tprintf(
"\nCollapsing node %" PRIi64
":\n", node);
585 curr_word_end, curr_unichar_id);
588 curr_word_end, curr_unichar_id,
589 &edge_ptr, &edge_index));
596 next_node2_num_edges, next_node2);
612 bool did_something =
false;
613 for (
int i = edge_index; i < backward_edges->
size() - 1; ++i) {
615 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
616 while (i < backward_edges->size()) {
617 if (!
DeadEdge((*backward_edges)[i])) {
619 if (curr_unichar_id != unichar_id)
return did_something;
624 if (i == backward_edges->
size())
break;
625 const EDGE_RECORD &edge_rec = (*backward_edges)[i];
627 for (
int j = i + 1; j < backward_edges->
size(); ++j) {
628 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
629 if (
DeadEdge(next_edge_rec))
continue;
631 if (next_id != unichar_id)
break;
637 did_something =
true;
642 return did_something;
646 int num_edges = edges->
size();
647 if (num_edges <= 1)
return;
650 for (
int i = 0; i < num_edges; ++i) {
651 sort_vec.
push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
655 for (
int i = 0; i < num_edges; ++i)
656 (*edges)[i] = sort_vec[i].data;
669 while (edge_index < backward_edges.
size()) {
670 if (
DeadEdge(backward_edges[edge_index]))
continue;
674 &backward_edges, reduced_nodes));
675 while (++edge_index < backward_edges.
size()) {
677 if (!
DeadEdge(backward_edges[edge_index]) &&
id != unichar_id)
break;
680 reduced_nodes[node] =
true;
687 for (
int i = 0; i < backward_edges.
size(); ++i) {
688 if (
DeadEdge(backward_edges[i]))
continue;
697 if (node == NO_EDGE)
return;
702 for (
int dir = 0; dir < 2; ++dir) {
711 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
712 i < max_num_edges; ++i) {
717 if (dir == 0 ? i < num_fwd : i < num_bkw)
tprintf(
"...");