tesseract  5.0.0-alpha-619-ge9db
tfacepp.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tfacepp.cpp (Formerly tface++.c)
3  * Description: C++ side of the C/C++ Tess/Editor interface.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <cmath>
20 
21 #include "blamer.h"
22 #include "errcode.h"
23 #include "ratngs.h"
24 #include "reject.h"
25 #include "tesseractclass.h"
26 #include "werd.h"
27 
28 #define MAX_UNDIVIDED_LENGTH 24
29 
30 
31 
32 /**********************************************************************
33  * recog_word
34  *
35  * Convert the word to tess form and pass it to the tess segmenter.
36  * Convert the output back to editor form.
37  **********************************************************************/
38 namespace tesseract {
// NOTE(review): this text is a numbered source listing, and listing lines
// 41, 46-47, 63, 72, 77 and 90 are absent from it, so some conditions and
// statements below are incomplete. Comments describe only the visible code.
39 void Tesseract::recog_word(WERD_RES *word) {
    // Optional early exit: with wordrec_skip_no_truth_words set, a word whose
    // blamer_bundle is null is flagged tess_failed and is not processed any
    // further. The second half of the condition is on missing listing line 41.
40  if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
42  if (classify_debug_level) tprintf("No truth for word - skipping\n");
43  word->tess_failed = true;
44  return;
45  }
    // NOTE(review): listing lines 46-47 are missing here; presumably they run
    // the actual recognition before box_word is set up - TODO confirm.
48  word->SetupBoxWord();
    // best_choice and box_word must have the same length. The mismatch is
    // printed first so the details are on record before the assert fires.
49  if (word->best_choice->length() != word->box_word->length()) {
50  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
51  "Strlen=%d; #Blobs=%d\n",
52  word->best_choice->debug_string().c_str(),
53  word->best_choice->length(), word->box_word->length());
54  }
55  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
56  // Check that the ratings matrix size matches the sum of all the
57  // segmentation states.
58  if (!word->StatesAllValid()) {
59  tprintf("Not all words have valid states relative to ratings matrix!!");
60  word->DebugWordChoices(true, nullptr);
    // Evaluated a second time so the failure is raised through ASSERT_HOST
    // only after the debug output above has been produced.
61  ASSERT_HOST(word->StatesAllValid());
62  }
    // NOTE(review): listing line 63 is missing; the closing brace at listing
    // line 82 suggests it opened a guarding `if` - TODO confirm.
64  /* Override the permuter type if a straight dictionary check disagrees. */
    // perm_type keeps the permuter as it was before any override so that it
    // can be compared with the (possibly changed) permuter further down.
65  uint8_t perm_type = word->best_choice->permuter();
    // Only choices that are not already tagged with one of the three dawg
    // permuter types are re-checked with dict_word().
66  if ((perm_type != SYSTEM_DAWG_PERM) &&
67  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
68  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
    // Adopt the dictionary's permuter type when dict_word() reports one of the
    // dawg types. The start of the last sub-condition (listing line 72) is
    // missing; its visible tail requires some count to be greater than zero.
69  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
70  (real_dict_perm_type == FREQ_DAWG_PERM) ||
71  (real_dict_perm_type == USER_DAWG_PERM)) &&
73  word->best_choice->unichar_lengths().c_str()) > 0)) {
74  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
75  }
76  }
    // NOTE(review): the start of this `if` (listing line 77) is missing. The
    // visible part reports when the permuter differs from the saved perm_type.
78  perm_type != word->best_choice->permuter()) {
79  tprintf("Permuter Type Flipped from %d to %d\n",
80  perm_type, word->best_choice->permuter());
81  }
82  }
83  // Factored out from control.cpp
    // best_choice and raw_choice must be both null or both non-null.
84  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
    // The word counts as a recognition failure when best_choice is null, is
    // empty, or when the leading run of spaces in its unichar string is as
    // long as the choice itself. best_choice has already been dereferenced
    // above, so the null test here is purely defensive.
85  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
86  static_cast<int>(strspn(word->best_choice->unichar_string().c_str(),
87  " ")) == word->best_choice->length()) {
88  word->tess_failed = true;
89  word->reject_map.initialise(word->box_word->length());
    // NOTE(review): listing line 90 is missing here.
91  } else {
92  word->tess_failed = false;
93  }
94 }
95 
96 
97 /**********************************************************************
98  * recog_word_recursive
99  *
100  * Convert the word to tess form and pass it to the tess segmenter.
101  * Convert the output back to editor form.
102  **********************************************************************/
// NOTE(review): the signature line (listing line 103) is missing from this
// listing; the index at the end of the page gives it as
// `void recog_word_recursive(WERD_RES *word)`.
    // Words with more than MAX_UNDIVIDED_LENGTH blobs in chopped_word are not
    // recognized directly; they are handed to split_and_recog_word() instead.
104  int word_length = word->chopped_word->NumBlobs(); // no of blobs
105  if (word_length > MAX_UNDIVIDED_LENGTH) {
106  return split_and_recog_word(word);
107  }
108  cc_recog(word);
    // From here on word_length is the blob count of rebuild_word, which is
    // what best_choice is compared against below.
109  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
110 
111  // Do sanity checks and minor fixes on best_choice.
    // More characters than blobs: reset best_choice to its bad defaults and
    // report the string, both counts and the word's bounding box.
112  if (word->best_choice->length() > word_length) {
113  word->best_choice->make_bad(); // should never happen
114  tprintf("recog_word: Discarded long string \"%s\""
115  " (%d characters vs %d blobs)\n",
116  word->best_choice->unichar_string().c_str(),
117  word->best_choice->length(), word_length);
118  tprintf("Word is at:");
119  word->word->bounding_box().print();
120  }
    // Fewer characters than blobs: pad best_choice with the unichar id of " "
    // until the lengths match. Each pad entry is appended with a blob count
    // of 1, a rating of 0.0 and the choice's current certainty.
121  if (word->best_choice->length() < word_length) {
122  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
123  while (word->best_choice->length() < word_length) {
124  word->best_choice->append_unichar_id(space_id, 1, 0.0,
125  word->best_choice->certainty());
126  }
127  }
128 }
129 
130 
131 /**********************************************************************
132  * split_and_recog_word
133  *
134  * Split the word into 2 smaller pieces at the largest gap.
135  * Recognize the pieces and stick the results back together.
136  **********************************************************************/
// NOTE(review): the signature line (listing line 137) is missing from this
// listing; the index at the end of the page gives it as
// `void split_and_recog_word(WERD_RES *word)`.
138  // Find the biggest blob gap in the chopped_word.
    // bestgap starts at -INT32_MAX so that the first gap examined always wins,
    // even when it is negative (next blob's left edge before the previous
    // blob's right edge).
139  int bestgap = -INT32_MAX;
140  int split_index = 0;
    // The loop starts at 1 because each gap is measured between blob b-1 and
    // blob b; split_index ends up as the index of the blob after the widest
    // gap. On ties the earliest gap is kept (strict `>` comparison).
141  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
142  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
143  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
144  int gap = blob_box.left() - prev_box.right();
145  if (gap > bestgap) {
146  bestgap = gap;
147  split_index = b;
148  }
149  }
    // split_index stays 0 only if the loop never ran, i.e. there were fewer
    // than two blobs to split between.
150  ASSERT_HOST(split_index > 0);
151 
    // split_word() fills in word2 (the right-hand piece) and orig_bb through
    // its output parameters; both are passed on to join_words() below.
152  WERD_RES *word2 = nullptr;
153  BlamerBundle *orig_bb = nullptr;
154  split_word(word, split_index, &word2, &orig_bb);
155 
156  // Recognize the first part of the word.
157  recog_word_recursive(word);
158  // Recognize the second part of the word.
159  recog_word_recursive(word2);
160 
    // Merge the results of both pieces back into word.
161  join_words(word, word2, orig_bb);
162 }
163 
164 
165 /**********************************************************************
166  * split_word
167  *
168  * Split a given WERD_RES in place into two smaller words for recognition.
169  * split_pt is the index of the first blob to go in the second word.
170  * The underlying word is left alone, only the TWERD (and subsequent data)
171  * are split up. orig_blamer_bundle is set to the original blamer bundle,
172  * and will now be owned by the caller. New blamer bundles are forged for the
173  * two pieces.
174  **********************************************************************/
// NOTE(review): the first signature line (listing line 175) is missing from
// this listing; the index at the end of the page gives the full signature as
// `void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
//                  BlamerBundle **orig_blamer_bundle) const`.
176  int split_pt,
177  WERD_RES **right_piece,
178  BlamerBundle **orig_blamer_bundle) const {
    // The split point must leave at least one blob on each side.
179  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
180 
181  // Save a copy of the blamer bundle so we can try to reconstruct it below.
    // orig_bb is a heap copy (or nullptr when the word has no bundle); it is
    // handed to the caller through *orig_blamer_bundle at the end.
182  BlamerBundle *orig_bb =
183  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
184 
    // word2 starts out as a copy of word and becomes the right-hand piece.
185  auto *word2 = new WERD_RES(*word);
186 
187  // blow away the copied chopped_word, as we want to work with
188  // the blobs from the input chopped_word so seam_arrays can be merged.
189  TWERD *chopped = word->chopped_word;
190  auto *chopped2 = new TWERD;
    // Hand the blob pointers from split_pt onwards to chopped2, then cut them
    // off the end of the original vector.
    // NOTE(review): assumes truncate() drops the pointers without deleting the
    // blobs they point to - TODO confirm.
191  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
192  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
193  chopped2->blobs.push_back(chopped->blobs[i]);
194  }
195  chopped->blobs.truncate(split_pt);
    // word's chopped_word is detached and word2's copied one is deleted before
    // ClearResults() runs; both pointers are re-attached right afterwards.
    // NOTE(review): presumably ClearResults() would otherwise free
    // chopped_word - TODO confirm.
196  word->chopped_word = nullptr;
197  delete word2->chopped_word;
198  word2->chopped_word = nullptr;
199 
    // The unicharset reference is taken before ClearResults() is called.
    // NOTE(review): presumably because ClearResults() resets uch_set - confirm.
200  const UNICHARSET &unicharset = *word->uch_set;
201  word->ClearResults();
202  word2->ClearResults();
203  word->chopped_word = chopped;
204  word2->chopped_word = chopped2;
    // NOTE(review): listing line 205 is missing here; presumably it is the
    // matching SetupBasicsFromChoppedWord() call for word - TODO confirm.
206  word2->SetupBasicsFromChoppedWord(unicharset);
207 
208  // Try to adjust the blamer bundle.
209  if (orig_bb != nullptr) {
210  // TODO(rays) Looks like a leak to me.
211  // orig_bb should take, rather than copy.
212  word->blamer_bundle = new BlamerBundle();
213  word2->blamer_bundle = new BlamerBundle();
    // SplitBundle() is given the right edge of the last blob kept in word and
    // the left edge of the first blob of word2, plus the two new bundles.
    // One further argument (listing line 216) is missing from this listing.
214  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
215  word2->chopped_word->blobs[0]->bounding_box().left(),
217  word->blamer_bundle, word2->blamer_bundle);
218  }
219 
    // Output parameters: the new right-hand word and the saved bundle copy.
220  *right_piece = word2;
221  *orig_blamer_bundle = orig_bb;
222 }
223 
224 
225 /**********************************************************************
226  * join_words
227  *
228  * The opposite of split_word():
229  * join word2 (including any recognized data / seam array / etc)
230  * onto the right of word and then delete word2.
231  * Also, if orig_bb is provided, stitch it back into word.
232  **********************************************************************/
// NOTE(review): the first signature line (listing line 233) is missing from
// this listing; the index at the end of the page gives the full signature as
// `void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const`.
234  WERD_RES *word2,
235  BlamerBundle *orig_bb) const {
    // Capture the boxes on either side of the join before the blob vectors are
    // merged; they are used below to place the synthetic seam.
236  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
237  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
238  // Tack the word2 outputs onto the end of the word outputs.
239  word->chopped_word->blobs += word2->chopped_word->blobs;
240  word->rebuild_word->blobs += word2->rebuild_word->blobs;
    // word2's blob vectors are emptied after their contents were appended to
    // word. NOTE(review): presumably so the blobs are not freed again when
    // word2 is deleted at the end - TODO confirm.
241  word2->chopped_word->blobs.clear();
242  word2->rebuild_word->blobs.clear();
    // The seam point sits horizontally midway between the two boxes and at the
    // average of the four top/bottom coordinates vertically.
243  TPOINT split_pt;
244  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
245  split_pt.y = (prev_box.top() + prev_box.bottom() +
246  blob_box.top() + blob_box.bottom()) / 4;
247  // Move the word2 seams onto the end of the word1 seam_array.
248  // Since the seam list is one element short, an empty seam marking the
249  // end of the last blob in the first word is needed first.
250  word->seam_array.push_back(new SEAM(0.0f, split_pt));
251  word->seam_array += word2->seam_array;
252  word2->seam_array.truncate(0);
253  // Fix widths and gaps.
254  word->blob_widths += word2->blob_widths;
255  word->blob_gaps += word2->blob_gaps;
256  // Fix the ratings matrix.
    // The dimensions are recorded first so the assert can verify that the
    // combined matrix has exactly the sum of the two original dimensions.
257  int rat1 = word->ratings->dimension();
258  int rat2 = word2->ratings->dimension();
259  word->ratings->AttachOnCorner(word2->ratings);
260  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
261  word->best_state += word2->best_state;
262  // Append the word choices.
263  *word->raw_choice += *word2->raw_choice;
264 
265  // How many alt choices from each should we try to get?
266  const int kAltsPerPiece = 2;
267  // When do we start throwing away extra alt choices?
268  const int kTooManyAltChoices = 100;
269 
270  // Construct the cartesian product of the best_choices of word(1) and word2.
271  WERD_CHOICE_LIST joined_choices;
272  WERD_CHOICE_IT jc_it(&joined_choices);
273  WERD_CHOICE_IT bc1_it(&word->best_choices);
274  WERD_CHOICE_IT bc2_it(&word2->best_choices);
    // The count is taken once up front and used as the inner loop bound.
    // total_joined_choices starts at this count because the existing word1
    // choices themselves remain in the final list.
275  int num_word1_choices = word->best_choices.length();
276  int total_joined_choices = num_word1_choices;
277  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
278  // word2 choices, and put them in the joined_choices list. The 1st word2
279  // choice gets added to the original word1 choices in-place after we have
280  // finished with them.
281  int bc2_index = 1;
    // The outer loop steps to the second word2 choice and stops when the
    // iterator is back at the first element.
282  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
    // Once the total has reached kTooManyAltChoices, only indices up to
    // kAltsPerPiece are still combined (same rule in the inner loop).
283  if (total_joined_choices >= kTooManyAltChoices &&
284  bc2_index > kAltsPerPiece)
285  break;
286  int bc1_index = 0;
287  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
288  ++bc1_index, bc1_it.forward()) {
289  if (total_joined_choices >= kTooManyAltChoices &&
290  bc1_index > kAltsPerPiece)
291  break;
    // New choice = copy of the word1 choice with the word2 choice appended.
292  auto *wc = new WERD_CHOICE(*bc1_it.data());
293  *wc += *bc2_it.data();
294  jc_it.add_after_then_move(wc);
295  ++total_joined_choices;
296  }
297  }
298  // Now that we've filled in as many alternates as we want, paste the best
299  // choice for word2 onto the original word alt_choices.
300  bc1_it.move_to_first();
301  bc2_it.move_to_first();
302  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
303  *bc1_it.data() += *bc2_it.data();
304  }
    // The extra combinations are added after the last of the original choices.
305  bc1_it.move_to_last();
306  bc1_it.add_list_after(&joined_choices);
307 
308  // Restore the pointer to original blamer bundle and combine blamer
309  // information recorded in the splits.
310  if (orig_bb != nullptr) {
    // NOTE(review): the last argument and closing parenthesis of this call
    // (listing line 312) are missing; the index at the end of the page lists
    // a third `bool debug` parameter for JoinBlames().
311  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
    // word's own bundle is deleted and replaced by orig_bb.
313  delete word->blamer_bundle;
314  word->blamer_bundle = orig_bb;
315  }
    // box_word and the reject map are rebuilt for the joined word, then word2
    // is deleted.
316  word->SetupBoxWord();
317  word->reject_map.initialise(word->box_word->length());
318  delete word2;
319 }
320 
321 
322 } // namespace tesseract
WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:845
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::Tesseract::split_and_recog_word
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:137
tesseract::Wordrec::wordrec_skip_no_truth_words
bool wordrec_skip_no_truth_words
Definition: wordrec.h:230
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
TPOINT
Definition: blobs.h:49
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
TWERD
Definition: blobs.h:416
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Tesseract::recog_word
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:41
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:431
tesseract::Wordrec::cc_recog
void cc_recog(WERD_RES *word)
Definition: tface.cpp:139
TBOX::print
void print() const
Definition: rect.h:277
tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:103
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
TBOX::top
int16_t top() const
Definition: rect.h:57
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
WERD_RES
Definition: pageres.h:160
MAX_UNDIVIDED_LENGTH
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:27
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:339
BlamerBundle::JoinBlames
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:231
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
GenericVector::back
T & back() const
Definition: genericvector.h:728
SEAM
Definition: seam.h:36
ratngs.h
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
werd.h
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:476
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1026
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
TPOINT::x
int16_t x
Definition: blobs.h:91
WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1100
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
TPOINT::y
int16_t y
Definition: blobs.h:92
tesseract::Tesseract::tessedit_override_permuter
bool tessedit_override_permuter
Definition: tesseractclass.h:1049
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
BandTriMatrix::AttachOnCorner
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:549
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
REJMAP::rej_word_tess_failure
void rej_word_tess_failure()
Definition: rejctmap.cpp:351
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract
Definition: baseapi.h:65
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
IRR_NO_TRUTH
Definition: blamer.h:94
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
tesseract::Tesseract::split_word
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:174
WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:208
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:454
tesseract::Tesseract::recog_word_recursive
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
reject.h
tesseract::Wordrec::wordrec_debug_blamer
bool wordrec_debug_blamer
Definition: wordrec.h:231
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
TBOX::left
int16_t left() const
Definition: rect.h:71
GenericVector::clear
void clear()
Definition: genericvector.h:857
TBOX::right
int16_t right() const
Definition: rect.h:78
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_RES::blob_gaps
GenericVector< int > blob_gaps
Definition: pageres.h:213
errcode.h
WERD_RES::word
WERD * word
Definition: pageres.h:180
BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:121
BlamerBundle
Definition: blamer.h:103
tesseract::Tesseract::join_words
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:231
blamer.h
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:430
WERD_CHOICE::append_unichar_id
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:470
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
USER_DAWG_PERM
Definition: ratngs.h:241
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
tesseract::Tesseract::alpha_count
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:494
TBOX
Definition: rect.h:33