tesseract  4.0.0-1-g2a2b
recogtraining.cpp
Go to the documentation of this file.
1 // File: recogtraining.cpp
3 // Description: Functions for ambiguity and parameter training.
4 // Author: Daria Antonova
5 // Created: Mon Aug 13 11:26:43 PDT 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "tesseractclass.h"
21 
22 #include "boxread.h"
23 #include "control.h"
24 #include "host.h"
25 #include "ratngs.h"
26 #include "reject.h"
27 #include "stopper.h"
28 
29 namespace tesseract {
30 
31 const int16_t kMaxBoxEdgeDiff = 2;
32 
33 // Sets flags necessary for recognition in the training mode.
34 // Opens and returns the pointer to the output file.
37  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
38  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
39  // Explore all segmentations.
41  }
42 
43  STRING output_fname = fname;
44  const char *lastdot = strrchr(output_fname.string(), '.');
45  if (lastdot != nullptr) output_fname[lastdot - output_fname.string()] = '\0';
46  output_fname += ".txt";
47  FILE *output_file = fopen(output_fname.string(), "a+");
48  if (output_file == nullptr) {
49  tprintf("Error: Could not open file %s\n", output_fname.string());
50  ASSERT_HOST(output_file);
51  }
52  return output_file;
53 }
54 
55 // Copies the bounding box from page_res_it->word() to the given TBOX.
56 static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
57  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
58  page_res_it->forward();
59 
60  if (page_res_it->word() != nullptr) {
61  *tbox = page_res_it->word()->word->bounding_box();
62 
63  // If tbox->left() is negative, the training image has vertical text and
64  // all the coordinates of bounding boxes of page_res are rotated by 90
65  // degrees in a counterclockwise direction. We need to rotate the TBOX back
66  // in order to compare with the TBOXes of box files.
67  if (tbox->left() < 0) {
68  tbox->rotate(FCOORD(0.0, -1.0));
69  }
70 
71  return true;
72  } else {
73  return false;
74  }
75 }
76 
77 // This function takes a tif/box pair of files and runs recognition on the
78 // image, while making sure that the word bounds that tesseract identified
79 // roughly match those specified by the input box file. For each word (ngram
80 // in a single bounding box from the input box file) it outputs the OCRed
81 // result, the correct label, rating and certainty.
83  PAGE_RES *page_res,
84  volatile ETEXT_DESC *monitor,
85  FILE *output_file) {
86  STRING box_fname = fname;
87  const char *lastdot = strrchr(box_fname.string(), '.');
88  if (lastdot != nullptr) box_fname[lastdot - box_fname.string()] = '\0';
89  box_fname += ".box";
90  // ReadNextBox() will close box_file
91  FILE *box_file = fopen(box_fname.string(), "r");
92  if (box_file == nullptr) {
93  tprintf("Error: Could not open file %s\n", box_fname.string());
94  ASSERT_HOST(box_file);
95  }
96 
97  PAGE_RES_IT page_res_it;
98  page_res_it.page_res = page_res;
99  page_res_it.restart_page();
100  STRING label;
101 
102  // Process all the words on this page.
103  TBOX tbox; // tesseract-identified box
104  TBOX bbox; // box from the box file
105  bool keep_going;
106  int line_number = 0;
107  int examined_words = 0;
108  do {
109  keep_going = read_t(&page_res_it, &tbox);
110  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
111  &bbox);
112  // Align bottom left points of the TBOXes.
113  while (keep_going &&
114  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
115  if (bbox.bottom() < tbox.bottom()) {
116  page_res_it.forward();
117  keep_going = read_t(&page_res_it, &tbox);
118  } else {
119  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
120  &bbox);
121  }
122  }
123  while (keep_going &&
124  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125  if (bbox.left() > tbox.left()) {
126  page_res_it.forward();
127  keep_going = read_t(&page_res_it, &tbox);
128  } else {
129  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
130  &bbox);
131  }
132  }
133  // OCR the word if top right points of the TBOXes are similar.
134  if (keep_going &&
135  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
136  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
137  ambigs_classify_and_output(label.string(), &page_res_it, output_file);
138  examined_words++;
139  }
140  page_res_it.forward();
141  } while (keep_going);
142 
143  // Set up scripts on all of the words that did not get sent to
144  // ambigs_classify_and_output. They all should have, but if all the
145  // werd_res's don't get uch_sets, tesseract will crash when you try
146  // to iterate over them. :-(
147  int total_words = 0;
148  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
149  page_res_it.forward()) {
150  if (page_res_it.word()) {
151  if (page_res_it.word()->uch_set == nullptr)
152  page_res_it.word()->SetupFake(unicharset);
153  total_words++;
154  }
155  }
156  if (examined_words < 0.85 * total_words) {
157  tprintf("TODO(antonova): clean up recog_training_segmented; "
158  " It examined only a small fraction of the ambigs image.\n");
159  }
160  tprintf("recog_training_segmented: examined %d / %d words.\n",
161  examined_words, total_words);
162 }
163 
164 // Helper prints the given set of blob choices.
165 static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
166  const UNICHARSET& unicharset,
167  const char *label, FILE *output_file) {
168  float rating = 0.0f;
169  float certainty = 0.0f;
170  for (int i = 0; i < length; ++i) {
171  const BLOB_CHOICE* blob_choice = blob_choices[i];
172  fprintf(output_file, "%s",
173  unicharset.id_to_unichar(blob_choice->unichar_id()));
174  rating += blob_choice->rating();
175  if (certainty > blob_choice->certainty())
176  certainty = blob_choice->certainty();
177  }
178  fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
179  label, rating, certainty);
180 }
181 
182 // Helper recursively prints all paths through the ratings matrix, starting
183 // at column col.
184 static void PrintMatrixPaths(int col, int dim,
185  const MATRIX& ratings,
186  int length, const BLOB_CHOICE** blob_choices,
187  const UNICHARSET& unicharset,
188  const char *label, FILE *output_file) {
189  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
190  if (ratings.get(col, row) != NOT_CLASSIFIED) {
191  BLOB_CHOICE_IT bc_it(ratings.get(col, row));
192  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
193  blob_choices[length] = bc_it.data();
194  if (row + 1 < dim) {
195  PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
196  unicharset, label, output_file);
197  } else {
198  PrintPath(length + 1, blob_choices, unicharset, label, output_file);
199  }
200  }
201  }
202  }
203 }
204 
205 // Runs classify_word_pass1() on the current word. Outputs Tesseract's
206 // raw choice as a result of the classification. For words labeled with a
207 // single unichar also outputs all alternatives from blob_choices of the
208 // best choice.
210  PAGE_RES_IT* pr_it,
211  FILE *output_file) {
212  // Classify word.
213  fflush(stdout);
214  WordData word_data(*pr_it);
215  SetupWordPassN(1, &word_data);
216  classify_word_and_language(1, pr_it, &word_data);
217  WERD_RES* werd_res = word_data.word;
218  WERD_CHOICE *best_choice = werd_res->best_choice;
219  ASSERT_HOST(best_choice != nullptr);
220 
221  // Compute the number of unichars in the label.
222  GenericVector<UNICHAR_ID> encoding;
223  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
224  tprintf("Not outputting illegal unichar %s\n", label);
225  return;
226  }
227 
228  // Dump all paths through the ratings matrix (which is normally small).
229  int dim = werd_res->ratings->dimension();
230  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
231  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
232  unicharset, label, output_file);
233  delete [] blob_choices;
234 }
235 
236 } // namespace tesseract
BLOCK_RES * block() const
Definition: pageres.h:757
float certainty() const
Definition: ratngs.h:83
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
void rotate(const FCOORD &vec)
Definition: rect.h:197
Dict & getDict() override
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: werd.cpp:159
Definition: rect.h:34
bool stopper_no_acceptable_choices
Definition: dict.h:625
int bandwidth() const
Definition: matrix.h:535
#define NOT_CLASSIFIED
Definition: matrix.h:44
WERD_RES * restart_page()
Definition: pageres.h:698
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
UNICHARSET unicharset
Definition: ccutil.h:68
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
WERD_RES * word() const
Definition: pageres.h:751
int dimension() const
Definition: matrix.h:533
const int16_t kMaxBoxEdgeDiff
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
PAGE_RES * page_res
Definition: pageres.h:677
FILE * init_recog_training(const STRING &fname)
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358
float rating() const
Definition: ratngs.h:80
Definition: strngs.h:45
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:126
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET * uch_set
Definition: pageres.h:206
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
Definition: points.h:189
int16_t right() const
Definition: rect.h:79
WERD_RES * forward()
Definition: pageres.h:731
Definition: matrix.h:575
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
int16_t bottom() const
Definition: rect.h:65
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
WERD_CHOICE * best_choice
Definition: pageres.h:235
T get(ICOORD pos) const
Definition: matrix.h:228
#define ASSERT_HOST(x)
Definition: errcode.h:84
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189