tesseract  5.0.0-alpha-619-ge9db
boxchar.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: boxchar.h
3  * Description: Simple class to associate a Tesseract classification unit with
4  * its bounding box so that the boxes can be rotated as the image
5  * is rotated for degradation. Also includes routines to output
6  * the character-tagged boxes to a boxfile.
7  * Author: Ray Smith
8  * Created: Mon Nov 18 2013
9  *
10  * (C) Copyright 2013, Google Inc.
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  * http://www.apache.org/licenses/LICENSE-2.0
15  * Unless required by applicable law or agreed to in writing, software
16  * distributed under the License is distributed on an "AS IS" BASIS,
17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18  * See the License for the specific language governing permissions and
19  * limitations under the License.
20  *
21  **********************************************************************/
22 
23 #ifndef TESSERACT_TRAINING_BOXCHAR_H_
24 #define TESSERACT_TRAINING_BOXCHAR_H_
25 
26 #include <string>
27 #include <vector>
28 
29 #include "allheaders.h" // from Leptonica
30 #include <tesseract/platform.h>
31 
32 struct Box;
33 
34 namespace tesseract {
35 
36 class BoxChar {
37  public:
38  BoxChar(const char* utf8_str, int len);
39 
40  ~BoxChar();
41 
42  // Accessors.
43  const std::string& ch() const { return ch_; }
44  const Box* box() const { return box_; }
45  const int& page() const { return page_; }
46  void set_rtl_index(int index) { rtl_index_ = index; }
47  const int& rtl_index() const { return rtl_index_; }
48 
49  // Set the box_ member.
50  void AddBox(int x, int y, int width, int height);
51 
52  void set_page(int page) { page_ = page; }
53 
54  std::string* mutable_ch() { return &ch_; }
55  Box* mutable_box() { return box_; }
56 
57  // Sort function for sorting by left edge of box. Note that this will not
58  // work properly until after InsertNewlines and InsertSpaces.
59  bool operator<(const BoxChar& other) const {
60  if (box_ == nullptr) return true;
61  if (other.box_ == nullptr) return false;
62  return box_->x < other.box_->x;
63  }
64  // Increments *num_rtl and *num_ltr according to the directionality of
65  // characters in the box.
66  void GetDirection(int* num_rtl, int* num_ltr) const;
67  // Reverses the order of unicodes within the box. If Pango generates a
68  // ligature, these will get reversed on output, so reverse now.
69  void ReverseUnicodesInBox();
70 
71  static void TranslateBoxes(int xshift, int yshift,
72  std::vector<BoxChar*>* boxes);
73 
74  // Prepares for writing the boxes to a file by inserting newlines, spaces,
75  // and re-ordering so the boxes are strictly left-to-right.
76  static void PrepareToWrite(std::vector<BoxChar*>* boxes);
77  // Inserts newline (tab) characters into the vector at newline positions.
78  static void InsertNewlines(bool rtl_rules, bool vertical_rules,
79  std::vector<BoxChar*>* boxes);
80  // Converts nullptr boxes to space characters, with appropriate bounding
81  // boxes.
82  static void InsertSpaces(bool rtl_rules, bool vertical_rules,
83  std::vector<BoxChar*>* boxes);
84  // Reorders text in a right-to-left script in left-to-right order.
85  static void ReorderRTLText(std::vector<BoxChar*>* boxes);
86  // Returns true if the vector contains mostly RTL characters.
87  static bool ContainsMostlyRTL(const std::vector<BoxChar*>& boxes);
88  // Returns true if the text is mostly laid out vertically.
89  static bool MostlyVertical(const std::vector<BoxChar*>& boxes);
90 
91  // Returns the total length of all the strings in the boxes.
92  static int TotalByteLength(const std::vector<BoxChar*>& boxes);
93 
94  // Rotate the vector of boxes between start and end by the given rotation.
95  // The rotation is in radians clockwise about the given center.
96  static void RotateBoxes(float rotation,
97  int xcenter,
98  int ycenter,
99  int start_box,
100  int end_box,
101  std::vector<BoxChar*>* boxes);
102 
103  // Create a tesseract box file from the vector of boxes. The image height
104  // is needed to convert to tesseract coordinates.
105  static void WriteTesseractBoxFile(const std::string& name, int height,
106  const std::vector<BoxChar*>& boxes);
107  // Gets the tesseract box file as a string from the vector of boxes.
108  // The image height is needed to convert to tesseract coordinates.
109  static std::string GetTesseractBoxStr(int height,
110  const std::vector<BoxChar*>& boxes);
111 
112  private:
113  std::string ch_;
114  Box* box_;
115  int page_;
116  // If the box is an RTL character, contains the original position in the
117  // array of boxes (before reversal), otherwise -1.
118  int rtl_index_;
119 };
120 
121 // Sort predicate to sort a vector of BoxChar*.
123  bool operator()(const BoxChar* box1, const BoxChar* box2) const {
124  if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0)
125  return box2->rtl_index() < box1->rtl_index();
126  return *box1 < *box2;
127  }
128 };
129 
130 } // namespace tesseract
131 
132 #endif // TESSERACT_TRAINING_BOXCHAR_H_
tesseract::BoxChar::TranslateBoxes
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::BoxChar::~BoxChar
~BoxChar()
Definition: boxchar.cpp:43
tesseract::BoxChar::mutable_ch
std::string * mutable_ch()
Definition: boxchar.h:53
tesseract::BoxChar::ch
const std::string & ch() const
Definition: boxchar.h:42
tesseract::BoxChar::ReorderRTLText
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:235
tesseract::BoxChar::set_page
void set_page(int page)
Definition: boxchar.h:51
tesseract::BoxChar::operator<
bool operator<(const BoxChar &other) const
Definition: boxchar.h:58
platform.h
tesseract::BoxChar::MostlyVertical
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:270
tesseract::BoxCharPtrSort
Definition: boxchar.h:121
tesseract::BoxChar::GetTesseractBoxStr
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:327
tesseract::BoxChar::GetDirection
void GetDirection(int *num_rtl, int *num_ltr) const
Definition: boxchar.cpp:51
tesseract::BoxChar::WriteTesseractBoxFile
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:320
tesseract::BoxChar::set_rtl_index
void set_rtl_index(int index)
Definition: boxchar.h:45
tesseract::BoxChar::InsertNewlines
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:112
tesseract::BoxChar::RotateBoxes
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:299
tesseract::BoxChar::InsertSpaces
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:177
tesseract::BoxChar::AddBox
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:45
tesseract::BoxChar::TotalByteLength
static int TotalByteLength(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:289
tesseract::BoxChar
Definition: boxchar.h:35
tesseract::BoxChar::PrepareToWrite
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:97
tesseract::BoxChar::box
const Box * box() const
Definition: boxchar.h:43
tesseract
Definition: baseapi.h:65
tesseract::BoxChar::rtl_index
const int & rtl_index() const
Definition: boxchar.h:46
tesseract::BoxChar::BoxChar
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:40
tesseract::BoxCharPtrSort::operator()
bool operator()(const BoxChar *box1, const BoxChar *box2) const
Definition: boxchar.h:122
tesseract::BoxChar::mutable_box
Box * mutable_box()
Definition: boxchar.h:54
tesseract::BoxChar::ContainsMostlyRTL
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:260
tesseract::BoxChar::ReverseUnicodesInBox
void ReverseUnicodesInBox()
Definition: boxchar.cpp:76
tesseract::BoxChar::page
const int & page() const
Definition: boxchar.h:44