tesseract  5.0.0-alpha-619-ge9db
recogtraining.cpp
Go to the documentation of this file.
1 // File: recogtraining.cpp
3 // Description: Functions for ambiguity and parameter training.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "tesseractclass.h"
20 
21 #include "boxread.h"
22 #include "control.h"
23 #include "host.h" // for NearlyEqual
24 #include "ratngs.h"
25 #ifndef DISABLED_LEGACY_ENGINE
26 #include "reject.h"
27 #endif
28 #include "stopper.h"
29 
30 namespace tesseract {
31 
32 const int16_t kMaxBoxEdgeDiff = 2;
33 
34 // Sets flags necessary for recognition in the training mode.
35 // Opens and returns the pointer to the output file.
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40  // Explore all segmentations.
42  }
43 
44  STRING output_fname = fname;
45  const char* lastdot = strrchr(output_fname.c_str(), '.');
46  if (lastdot != nullptr)
47  output_fname[lastdot - output_fname.c_str()] = '\0';
48  output_fname += ".txt";
49  FILE* output_file = fopen(output_fname.c_str(), "a+");
50  if (output_file == nullptr) {
51  tprintf("Error: Could not open file %s\n", output_fname.c_str());
52  ASSERT_HOST(output_file);
53  }
54  return output_file;
55 }
56 
57 // Copies the bounding box from page_res_it->word() to the given TBOX.
58 static bool read_t(PAGE_RES_IT* page_res_it, TBOX* tbox) {
59  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
60  page_res_it->forward();
61 
62  if (page_res_it->word() != nullptr) {
63  *tbox = page_res_it->word()->word->bounding_box();
64 
65  // If tbox->left() is negative, the training image has vertical text and
66  // all the coordinates of bounding boxes of page_res are rotated by 90
67  // degrees in a counterclockwise direction. We need to rotate the TBOX back
68  // in order to compare with the TBOXes of box files.
69  if (tbox->left() < 0) {
70  tbox->rotate(FCOORD(0.0, -1.0));
71  }
72 
73  return true;
74  } else {
75  return false;
76  }
77 }
78 
79 // This function takes a tif/box pair of files and runs recognition on the
80 // image, while making sure that the word bounds that tesseract identified
81 // roughly match those specified by the input box file. For each word (ngram
82 // in a single bounding box from the input box file) it outputs the OCRed
83 // result, the correct label, rating and certainty.
85  PAGE_RES* page_res,
86  volatile ETEXT_DESC* monitor,
87  FILE* output_file) {
88  STRING box_fname = fname;
89  const char* lastdot = strrchr(box_fname.c_str(), '.');
90  if (lastdot != nullptr)
91  box_fname[lastdot - box_fname.c_str()] = '\0';
92  box_fname += ".box";
93  // ReadNextBox() will close box_file
94  FILE* box_file = fopen(box_fname.c_str(), "r");
95  if (box_file == nullptr) {
96  tprintf("Error: Could not open file %s\n", box_fname.c_str());
97  ASSERT_HOST(box_file);
98  }
99 
100  PAGE_RES_IT page_res_it;
101  page_res_it.page_res = page_res;
102  page_res_it.restart_page();
103  STRING label;
104 
105  // Process all the words on this page.
106  TBOX tbox; // tesseract-identified box
107  TBOX bbox; // box from the box file
108  bool keep_going;
109  int line_number = 0;
110  int examined_words = 0;
111  do {
112  keep_going = read_t(&page_res_it, &tbox);
113  keep_going &=
114  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
115  // Align bottom left points of the TBOXes.
116  while (keep_going &&
117  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
118  if (bbox.bottom() < tbox.bottom()) {
119  page_res_it.forward();
120  keep_going = read_t(&page_res_it, &tbox);
121  } else {
122  keep_going =
123  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
124  }
125  }
126  while (keep_going &&
127  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
128  if (bbox.left() > tbox.left()) {
129  page_res_it.forward();
130  keep_going = read_t(&page_res_it, &tbox);
131  } else {
132  keep_going =
133  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
134  }
135  }
136  // OCR the word if top right points of the TBOXes are similar.
137  if (keep_going &&
138  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
139  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
140  ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
141  examined_words++;
142  }
143  page_res_it.forward();
144  } while (keep_going);
145 
146  // Set up scripts on all of the words that did not get sent to
147  // ambigs_classify_and_output. They all should have, but if all the
148  // werd_res's don't get uch_sets, tesseract will crash when you try
149  // to iterate over them. :-(
150  int total_words = 0;
151  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
152  page_res_it.forward()) {
153  if (page_res_it.word()) {
154  if (page_res_it.word()->uch_set == nullptr)
155  page_res_it.word()->SetupFake(unicharset);
156  total_words++;
157  }
158  }
159  if (examined_words < 0.85 * total_words) {
160  tprintf(
161  "TODO(antonova): clean up recog_training_segmented; "
162  " It examined only a small fraction of the ambigs image.\n");
163  }
164  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
165  total_words);
166 }
167 
168 // Helper prints the given set of blob choices.
169 static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
170  const UNICHARSET& unicharset, const char* label,
171  FILE* output_file) {
172  float rating = 0.0f;
173  float certainty = 0.0f;
174  for (int i = 0; i < length; ++i) {
175  const BLOB_CHOICE* blob_choice = blob_choices[i];
176  fprintf(output_file, "%s",
177  unicharset.id_to_unichar(blob_choice->unichar_id()));
178  rating += blob_choice->rating();
179  if (certainty > blob_choice->certainty())
180  certainty = blob_choice->certainty();
181  }
182  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
183 }
184 
185 // Helper recursively prints all paths through the ratings matrix, starting
186 // at column col.
187 static void PrintMatrixPaths(int col, int dim, const MATRIX& ratings,
188  int length, const BLOB_CHOICE** blob_choices,
189  const UNICHARSET& unicharset, const char* label,
190  FILE* output_file) {
191  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
192  if (ratings.get(col, row) != NOT_CLASSIFIED) {
193  BLOB_CHOICE_IT bc_it(ratings.get(col, row));
194  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
195  blob_choices[length] = bc_it.data();
196  if (row + 1 < dim) {
197  PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
198  unicharset, label, output_file);
199  } else {
200  PrintPath(length + 1, blob_choices, unicharset, label, output_file);
201  }
202  }
203  }
204  }
205 }
206 
207 // Runs pass 1 classification (classify_word_and_language()) on the current
208 // word. If the label cannot be encoded with the unicharset, reports that and
209 // outputs nothing; otherwise outputs every path through the word's ratings
210 // matrix, each with the label, its summed rating and its lowest certainty.
212  PAGE_RES_IT* pr_it,
213  FILE* output_file) {
214  // Classify word.
215  fflush(stdout);
216  WordData word_data(*pr_it);
217  SetupWordPassN(1, &word_data);
218  classify_word_and_language(1, pr_it, &word_data);
219  WERD_RES* werd_res = word_data.word;
220  WERD_CHOICE* best_choice = werd_res->best_choice;
221  ASSERT_HOST(best_choice != nullptr);
222 
223  // Compute the number of unichars in the label.
224  GenericVector<UNICHAR_ID> encoding;
225  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
226  tprintf("Not outputting illegal unichar %s\n", label);
227  return;
228  }
229 
230  // Dump all paths through the ratings matrix (which is normally small).
231  int dim = werd_res->ratings->dimension();
232  const auto** blob_choices = new const BLOB_CHOICE*[dim];
233  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
234  label, output_file);
235  delete[] blob_choices;
236 }
237 
238 } // namespace tesseract
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
boxread.h
host.h
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Tesseract::recog_training_segmented
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
Definition: recogtraining.cpp:84
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
control.h
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
MATRIX
Definition: matrix.h:574
PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:695
TBOX::top
int16_t top() const
Definition: rect.h:57
STRING
Definition: strngs.h:45
WERD_RES
Definition: pageres.h:160
tesseract::Tesseract::ambigs_classify_and_output
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
Definition: recogtraining.cpp:211
stopper.h
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
ETEXT_DESC
Definition: ocrclass.h:95
FCOORD
Definition: points.h:187
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
tesseract::WordData
Definition: tesseractclass.h:144
ReadNextBox
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:134
TBOX::rotate
void rotate(const FCOORD &vec)
Definition: rect.h:196
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
ratngs.h
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:641
tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
tesseract::WordData::word
WERD_RES * word
Definition: tesseractclass.h:155
tesseract::Tesseract::tessedit_tess_adaption_mode
int tessedit_tess_adaption_mode
Definition: tesseractclass.h:883
tesseract::Tesseract::tessedit_enable_doc_dict
bool tessedit_enable_doc_dict
Definition: tesseractclass.h:844
NOT_CLASSIFIED
#define NOT_CLASSIFIED
Definition: matrix.h:40
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
tesseract::kMaxBoxEdgeDiff
const int16_t kMaxBoxEdgeDiff
Definition: recogtraining.cpp:32
UNICHARSET
Definition: unicharset.h:145
tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:564
tesseract::Tesseract::tessedit_ambigs_training
bool tessedit_ambigs_training
Definition: tesseractclass.h:809
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::applybox_page
int applybox_page
Definition: tesseractclass.h:824
tesseract
Definition: baseapi.h:65
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
PAGE_RES
Definition: pageres.h:73
tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1318
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
GenericVector< UNICHAR_ID >
PAGE_RES_IT
Definition: pageres.h:668
reject.h
BandTriMatrix::bandwidth
int bandwidth() const
Definition: matrix.h:534
WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:348
BLOB_CHOICE
Definition: ratngs.h:49
TBOX::left
int16_t left() const
Definition: rect.h:71
TBOX::right
int16_t right() const
Definition: rect.h:78
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_RES::word
WERD * word
Definition: pageres.h:180
tesseract::Tesseract::init_recog_training
FILE * init_recog_training(const STRING &fname)
Definition: recogtraining.cpp:36
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
TBOX
Definition: rect.h:33