tesseract  5.0.0-alpha-619-ge9db
boxchar.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: boxchar.cpp
3  * Description: Simple class to associate a Tesseract classification unit with
4  * its bounding box so that the boxes can be rotated as the image
5  * is rotated for degradation. Also includes routines to output
6  * the character-tagged boxes to a boxfile.
7  * Author: Ray Smith
8  *
9  * (C) Copyright 2013, Google Inc.
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  * http://www.apache.org/licenses/LICENSE-2.0
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  **********************************************************************/
21 
22 #include "boxchar.h"
23 
24 #include <cstddef>
25 #include <algorithm>
26 #include <vector>
27 
28 #include "fileio.h"
30 #include "normstrngs.h"
31 #include "tprintf.h"
32 #include "unicharset.h"
33 #include "unicode/uchar.h" // from libicu
34 
// Minimum factor by which one of |dx|, |dy| must exceed the other for a pair
// of consecutive boxes to be counted when deciding the text orientation.
constexpr int kMinNewlineRatio = 5;
37 
38 namespace tesseract {
39 
40 BoxChar::BoxChar(const char* utf8_str, int len)
41  : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
42 
43 BoxChar::~BoxChar() { boxDestroy(&box_); }
44 
45 void BoxChar::AddBox(int x, int y, int width, int height) {
46  box_ = boxCreate(x, y, width, height);
47 }
48 
49 // Increments *num_rtl and *num_ltr according to the directionality of
50 // characters in the box.
51 void BoxChar::GetDirection(int* num_rtl, int* num_ltr) const {
52  // Convert the unichar to UTF32 representation
53  std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());
54  if (uni_vector.empty()) {
55  tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str());
56  for (size_t c = 0; c < ch_.size(); ++c) {
57  tprintf(" 0x%x", ch_[c]);
58  }
59  tprintf("\n");
60  return;
61  }
62  for (char32 ch : uni_vector) {
63  UCharDirection dir = u_charDirection(ch);
64  if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
65  dir == U_RIGHT_TO_LEFT_ISOLATE) {
66  ++*num_rtl;
67  } else if ((dir == U_ARABIC_NUMBER) ||
68  (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
69  ++*num_ltr;
70  }
71  }
72 }
73 
74 // Reverses the order of unicodes within the box. If Pango generates a
75 // ligature, these will get reversed on output, so reverse now.
77  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());
78  std::reverse(unicodes.begin(), unicodes.end());
79  ch_ = UNICHAR::UTF32ToUTF8(unicodes);
80 }
81 
82 /* static */
83 void BoxChar::TranslateBoxes(int xshift, int yshift,
84  std::vector<BoxChar*>* boxes) {
85  for (size_t i = 0; i < boxes->size(); ++i) {
86  BOX* box = (*boxes)[i]->box_;
87  if (box != nullptr) {
88  box->x += xshift;
89  box->y += yshift;
90  }
91  }
92 }
93 
94 // Prepares for writing the boxes to a file by inserting newlines, spaces,
95 // and re-ordering so the boxes are strictly left-to-right.
96 /* static */
97 void BoxChar::PrepareToWrite(std::vector<BoxChar*>* boxes) {
98  bool rtl_rules = ContainsMostlyRTL(*boxes);
99  bool vertical_rules = MostlyVertical(*boxes);
100  InsertNewlines(rtl_rules, vertical_rules, boxes);
101  InsertSpaces(rtl_rules, vertical_rules, boxes);
102  for (size_t i = 0; i < boxes->size(); ++i) {
103  if ((*boxes)[i]->box_ == nullptr) tprintf("Null box at index %zu\n", i);
104  }
105  if (rtl_rules) {
106  ReorderRTLText(boxes);
107  }
108 }
109 
110 // Inserts newline (tab) characters into the vector at newline positions.
111 /* static */
112 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
113  std::vector<BoxChar*>* boxes) {
114  size_t prev_i = SIZE_MAX;
115  int max_shift = 0;
116  for (size_t i = 0; i < boxes->size(); ++i) {
117  Box* box = (*boxes)[i]->box_;
118  if (box == nullptr) {
119  if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {
120  // Erase null boxes at the start of a line and after another null box.
121  do {
122  delete (*boxes)[i];
123  boxes->erase(boxes->begin() + i);
124  if (i == 0) break;
125  } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);
126  }
127  continue;
128  }
129  if (prev_i != SIZE_MAX) {
130  Box* prev_box = (*boxes)[prev_i]->box_;
131  int shift = box->x - prev_box->x;
132  if (vertical_rules) {
133  shift = box->y - prev_box->y;
134  } else if (rtl_rules) {
135  shift = -shift;
136  }
137  if (-shift > max_shift) {
138  // This is a newline. Since nothing cares about the size of the box,
139  // except the out-of-bounds checker, minimize the chance of creating
140  // a box outside the image by making the width and height 1.
141  int width = 1;
142  int height = 1;
143  int x = prev_box->x + prev_box->w;
144  int y = prev_box->y;
145  if (vertical_rules) {
146  x = prev_box->x;
147  y = prev_box->y + prev_box->h;
148  } else if (rtl_rules) {
149  x = prev_box->x - width;
150  if (x < 0) {
151  tprintf("prev x = %d, width=%d\n", prev_box->x, width);
152  x = 0;
153  }
154  }
155  if (prev_i + 1 == i) {
156  // New character needed.
157  BoxChar* new_box = new BoxChar("\t", 1);
158  new_box->AddBox(x, y, width, height);
159  new_box->page_ = (*boxes)[i]->page_;
160  boxes->insert(boxes->begin() + i, new_box);
161  ++i;
162  } else {
163  (*boxes)[i - 1]->AddBox(x, y, width, height);
164  (*boxes)[i - 1]->ch_ = "\t";
165  }
166  max_shift = 0;
167  } else if (shift > max_shift) {
168  max_shift = shift;
169  }
170  }
171  prev_i = i;
172  }
173 }
174 
175 // Converts nullptr boxes to space characters, with appropriate bounding boxes.
176 /* static */
177 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
178  std::vector<BoxChar*>* boxes) {
179  // After InsertNewlines, any remaining null boxes are not newlines, and are
180  // singletons, so add a box to each remaining null box.
181  for (size_t i = 1; i + 1 < boxes->size(); ++i) {
182  Box* box = (*boxes)[i]->box_;
183  if (box == nullptr) {
184  Box* prev = (*boxes)[i - 1]->box_;
185  Box* next = (*boxes)[i + 1]->box_;
186  ASSERT_HOST(prev != nullptr && next != nullptr);
187  int top = std::min(prev->y, next->y);
188  int bottom = std::max(prev->y + prev->h, next->y + next->h);
189  int left = prev->x + prev->w;
190  int right = next->x;
191  if (vertical_rules) {
192  top = prev->y + prev->h;
193  bottom = next->y;
194  left = std::min(prev->x, next->x);
195  right = std::max(prev->x + prev->w, next->x + next->w);
196  } else if (rtl_rules) {
197  // With RTL we have to account for BiDi.
198  // Right becomes the min left of all prior boxes back to the first
199  // space or newline.
200  right = prev->x;
201  left = next->x + next->w;
202  for (int j = i - 2;
203  j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
204  --j) {
205  prev = (*boxes)[j]->box_;
206  ASSERT_HOST(prev != nullptr);
207  if (prev->x < right) {
208  right = prev->x;
209  }
210  }
211  // Left becomes the max right of all next boxes forward to the first
212  // space or newline.
213  for (size_t j = i + 2;
214  j < boxes->size() && (*boxes)[j]->box_ != nullptr &&
215  (*boxes)[j]->ch_ != "\t";
216  ++j) {
217  next = (*boxes)[j]->box_;
218  if (next->x + next->w > left) {
219  left = next->x + next->w;
220  }
221  }
222  }
223  // Italic and stylized characters can produce negative spaces, which
224  // Leptonica doesn't like, so clip to a positive size.
225  if (right <= left) right = left + 1;
226  if (bottom <= top) bottom = top + 1;
227  (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
228  (*boxes)[i]->ch_ = " ";
229  }
230  }
231 }
232 
233 // Reorders text in a right-to-left script in left-to-right order.
234 /* static */
235 void BoxChar::ReorderRTLText(std::vector<BoxChar*>* boxes) {
236  // Ideally we need the inverse of the algorithm used by ResultIterator.
237  // For now, let's try a sort that reverses original positions for RTL
238  // characters, otherwise by x-position. This should be much closer to
239  // correct than just sorting by x-position.
240  size_t num_boxes = boxes->size();
241  for (size_t i = 0; i < num_boxes; ++i) {
242  int num_rtl = 0, num_ltr = 0;
243  (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
244  if (num_rtl > num_ltr) {
245  (*boxes)[i]->set_rtl_index(i);
246  (*boxes)[i]->ReverseUnicodesInBox();
247  }
248  }
249  BoxCharPtrSort sorter;
250  size_t end = 0;
251  for (size_t start = 0; start < boxes->size(); start = end + 1) {
252  end = start + 1;
253  while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
254  std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
255  }
256 }
257 
258 // Returns true if the vector contains mostly RTL characters.
259 /* static */
260 bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar*>& boxes) {
261  int num_rtl = 0, num_ltr = 0;
262  for (size_t i = 0; i < boxes.size(); ++i) {
263  boxes[i]->GetDirection(&num_rtl, &num_ltr);
264  }
265  return num_rtl > num_ltr;
266 }
267 
268 // Returns true if the text is mostly laid out vertically.
269 /* static */
270 bool BoxChar::MostlyVertical(const std::vector<BoxChar*>& boxes) {
271  int64_t total_dx = 0, total_dy = 0;
272  for (size_t i = 1; i < boxes.size(); ++i) {
273  if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
274  boxes[i - 1]->page_ == boxes[i]->page_) {
275  int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
276  int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
277  if (abs(dx) > abs(dy) * kMinNewlineRatio ||
278  abs(dy) > abs(dx) * kMinNewlineRatio) {
279  total_dx += dx * dx;
280  total_dy += dy * dy;
281  }
282  }
283  }
284  return total_dy > total_dx;
285 }
286 
287 // Returns the total length of all the strings in the boxes.
288 /* static */
289 int BoxChar::TotalByteLength(const std::vector<BoxChar*>& boxes) {
290  int total_length = 0;
291  for (size_t i = 0; i < boxes.size(); ++i)
292  total_length += boxes[i]->ch_.size();
293  return total_length;
294 }
295 
296 // Rotate the boxes in [start_box, end_box) by the given rotation.
297 // The rotation is in radians clockwise about the given center.
298 /* static */
299 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter,
300  int start_box, int end_box,
301  std::vector<BoxChar*>* boxes) {
302  Boxa* orig = boxaCreate(0);
303  for (int i = start_box; i < end_box; ++i) {
304  BOX* box = (*boxes)[i]->box_;
305  if (box) boxaAddBox(orig, box, L_CLONE);
306  }
307  Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
308  boxaDestroy(&orig);
309  for (int i = start_box, box_ind = 0; i < end_box; ++i) {
310  if ((*boxes)[i]->box_) {
311  boxDestroy(&((*boxes)[i]->box_));
312  (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
313  }
314  }
315  boxaDestroy(&rotated);
316 }
317 
318 const int kMaxLineLength = 1024;
319 /* static */
320 void BoxChar::WriteTesseractBoxFile(const std::string& filename, int height,
321  const std::vector<BoxChar*>& boxes) {
322  std::string output = GetTesseractBoxStr(height, boxes);
323  File::WriteStringToFileOrDie(output, filename);
324 }
325 
326 /* static */
328  const std::vector<BoxChar*>& boxes) {
329  std::string output;
330  char buffer[kMaxLineLength];
331  for (size_t i = 0; i < boxes.size(); ++i) {
332  const Box* box = boxes[i]->box_;
333  if (box == nullptr) {
334  tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
335  return "";
336  }
337  int nbytes =
338  snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
339  boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
340  box->x + box->w, height - box->y, boxes[i]->page_);
341  output.append(buffer, nbytes);
342  }
343  return output;
344 }
345 
346 } // namespace tesseract
tesseract::BoxChar::TranslateBoxes
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::BoxChar::~BoxChar
~BoxChar()
Definition: boxchar.cpp:43
kMinNewlineRatio
const int kMinNewlineRatio
Definition: boxchar.cpp:36
tesseract::BoxChar::ch
const std::string & ch() const
Definition: boxchar.h:42
tesseract::UNICHAR::UTF8ToUTF32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
tesseract::BoxChar::ReorderRTLText
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:235
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::BoxChar::MostlyVertical
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:270
tesseract::BoxCharPtrSort
Definition: boxchar.h:121
tesseract::BoxChar::GetTesseractBoxStr
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:327
tesseract::BoxChar::GetDirection
void GetDirection(int *num_rtl, int *num_ltr) const
Definition: boxchar.cpp:51
tesseract::BoxChar::WriteTesseractBoxFile
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:320
tesseract::BoxChar::InsertNewlines
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:112
tesseract::BoxChar::RotateBoxes
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:299
fileio.h
boxchar.h
genericvector.h
tesseract::BoxChar::InsertSpaces
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:177
tesseract::BoxChar::AddBox
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:45
unicharset.h
tesseract::char32
signed int char32
Definition: unichar.h:53
tesseract::UNICHAR::UTF32ToUTF8
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
tesseract::File::WriteStringToFileOrDie
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:68
tesseract::BoxChar::TotalByteLength
static int TotalByteLength(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:289
tesseract::BoxChar
Definition: boxchar.h:35
tesseract::kMaxLineLength
const int kMaxLineLength
Definition: boxchar.cpp:318
tesseract::BoxChar::PrepareToWrite
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:97
tesseract::BoxChar::box
const Box * box() const
Definition: boxchar.h:43
tesseract
Definition: baseapi.h:65
tprintf.h
normstrngs.h
tesseract::BoxChar::BoxChar
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:40
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::BoxChar::ContainsMostlyRTL
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:260
tesseract::BoxChar::ReverseUnicodesInBox
void ReverseUnicodesInBox()
Definition: boxchar.cpp:76