All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
boxchar.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: boxchar.h
3  * Description: Simple class to associate a Tesseract classification unit with
4  * its bounding box so that the boxes can be rotated as the image
5  * is rotated for degradation. Also includes routines to output
6  * the character-tagged boxes to a boxfile.
7  * Author: Ray Smith
8  * Created: Mon Nov 18 2013
9  *
10  * (C) Copyright 2013, Google Inc.
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  * http://www.apache.org/licenses/LICENSE-2.0
15  * Unless required by applicable law or agreed to in writing, software
16  * distributed under the License is distributed on an "AS IS" BASIS,
17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18  * See the License for the specific language governing permissions and
19  * limitations under the License.
20  *
21  **********************************************************************/
22 
23 #ifndef TESSERACT_TRAINING_BOXCHAR_H_
24 #define TESSERACT_TRAINING_BOXCHAR_H_
25 
26 #include <string>
27 #include <vector>
28 
29 #include "allheaders.h" // from Leptonica
30 
31 #ifdef USE_STD_NAMESPACE
32 using std::string;
33 using std::vector;
34 #endif
35 
36 struct Box;
37 
38 namespace tesseract {
39 
40 class BoxChar {
41  public:
42  BoxChar(const char* utf8_str, int len);
43 
44  ~BoxChar();
45 
46  // Accessors.
47  const string& ch() const { return ch_; }
48  const Box* box() const { return box_; }
49  const int& page() const { return page_; }
50 
51 
52  // Set the box_ member.
53  void AddBox(int x, int y, int width, int height);
54 
55  void set_page(int page) { page_ = page; }
56 
57  string* mutable_ch() { return &ch_; }
58  Box* mutable_box() { return box_; }
59 
60  // Sort function for sorting by left edge of box. Note that this will not
61  // work properly until after InsertNewlines and InsertSpaces.
62  bool operator<(const BoxChar& other) const {
63  if (box_ == NULL) return true;
64  if (other.box_ == NULL) return false;
65  return box_->x < other.box_->x;
66  }
67 
68  static void TranslateBoxes(int xshift, int yshift,
69  vector<BoxChar*>* boxes);
70 
71  // Prepares for writing the boxes to a file by inserting newlines, spaces,
72  // and re-ordering so the boxes are strictly left-to-right.
73  static void PrepareToWrite(vector<BoxChar*>* boxes);
74  // Inserts newline (tab) characters into the vector at newline positions.
75  static void InsertNewlines(bool rtl_rules, bool vertical_rules,
76  vector<BoxChar*>* boxes);
77  // Converts NULL boxes to space characters, with appropriate bounding boxes.
78  static void InsertSpaces(bool rtl_rules, bool vertical_rules,
79  vector<BoxChar*>* boxes);
80  // Reorders text in a right-to-left script in left-to-right order.
81  static void ReorderRTLText(vector<BoxChar*>* boxes);
82  // Returns true if the vector contains mostly RTL characters.
83  static bool ContainsMostlyRTL(const vector<BoxChar*>& boxes);
84  // Returns true if the text is mostly laid out vertically.
85  static bool MostlyVertical(const vector<BoxChar*>& boxes);
86 
87  // Returns the total length of all the strings in the boxes.
88  static int TotalByteLength(const vector<BoxChar*>& boxes);
89 
90  // Rotate the vector of boxes between start and end by the given rotation.
91  // The rotation is in radians clockwise about the given center.
92  static void RotateBoxes(float rotation,
93  int xcenter,
94  int ycenter,
95  int start_box,
96  int end_box,
97  vector<BoxChar*>* boxes);
98 
99  // Create a tesseract box file from the vector of boxes. The image height
100  // is needed to convert to tesseract coordinates.
101  static void WriteTesseractBoxFile(const string& name, int height,
102  const vector<BoxChar*>& boxes);
103 
104  private:
105  string ch_;
106  Box* box_;
107  int page_;
108 };
109 
110 // Sort predicate to sort a vector of BoxChar*.
112  bool operator()(const BoxChar* box1, const BoxChar* box2) const {
113  return *box1 < *box2;
114  }
115 };
116 
117 } // namespace tesseract
118 
119 #endif // TESSERACT_TRAINING_BOXCHAR_H_
static void ReorderRTLText(vector< BoxChar * > *boxes)
Definition: boxchar.cpp:201
static void TranslateBoxes(int xshift, int yshift, vector< BoxChar * > *boxes)
Definition: boxchar.cpp:52
const string & ch() const
Definition: boxchar.h:47
bool operator()(const BoxChar *box1, const BoxChar *box2) const
Definition: boxchar.h:112
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:41
static void InsertSpaces(bool rtl_rules, bool vertical_rules, vector< BoxChar * > *boxes)
Definition: boxchar.cpp:144
string * mutable_ch()
Definition: boxchar.h:57
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:47
Box * mutable_box()
Definition: boxchar.h:58
const Box * box() const
Definition: boxchar.h:48
static void WriteTesseractBoxFile(const string &name, int height, const vector< BoxChar * > &boxes)
Definition: boxchar.cpp:292
name_table name
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, vector< BoxChar * > *boxes)
Definition: boxchar.cpp:272
const int & page() const
Definition: boxchar.h:49
static void PrepareToWrite(vector< BoxChar * > *boxes)
Definition: boxchar.cpp:65
void set_page(int page)
Definition: boxchar.h:55
bool operator<(const BoxChar &other) const
Definition: boxchar.h:62
#define NULL
Definition: host.h:144
static void InsertNewlines(bool rtl_rules, bool vertical_rules, vector< BoxChar * > *boxes)
Definition: boxchar.cpp:81
static int TotalByteLength(const vector< BoxChar * > &boxes)
Definition: boxchar.cpp:263
static bool MostlyVertical(const vector< BoxChar * > &boxes)
Definition: boxchar.cpp:244
static bool ContainsMostlyRTL(const vector< BoxChar * > &boxes)
Definition: boxchar.cpp:215