33 #include "unicode/uchar.h"
41 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
46 box_ = boxCreate(x, y, width, height);
54 if (uni_vector.empty()) {
55 tprintf(
"Illegal utf8 in boxchar string:%s = ", ch_.c_str());
56 for (
size_t c = 0; c < ch_.size(); ++c) {
63 UCharDirection dir = u_charDirection(
ch);
64 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
65 dir == U_RIGHT_TO_LEFT_ISOLATE) {
67 }
else if ((dir == U_ARABIC_NUMBER) ||
68 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
78 std::reverse(unicodes.begin(), unicodes.end());
84 std::vector<BoxChar*>* boxes) {
85 for (
size_t i = 0; i < boxes->size(); ++i) {
86 BOX*
box = (*boxes)[i]->box_;
102 for (
size_t i = 0; i < boxes->size(); ++i) {
103 if ((*boxes)[i]->box_ ==
nullptr)
tprintf(
"Null box at index %zu\n", i);
113 std::vector<BoxChar*>* boxes) {
114 size_t prev_i = SIZE_MAX;
116 for (
size_t i = 0; i < boxes->size(); ++i) {
117 Box*
box = (*boxes)[i]->box_;
118 if (
box ==
nullptr) {
119 if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {
123 boxes->erase(boxes->begin() + i);
125 }
while (i-- == boxes->size() && (*boxes)[i]->box_ ==
nullptr);
129 if (prev_i != SIZE_MAX) {
130 Box* prev_box = (*boxes)[prev_i]->box_;
131 int shift =
box->x - prev_box->x;
132 if (vertical_rules) {
133 shift =
box->y - prev_box->y;
134 }
else if (rtl_rules) {
137 if (-shift > max_shift) {
143 int x = prev_box->x + prev_box->w;
145 if (vertical_rules) {
147 y = prev_box->y + prev_box->h;
148 }
else if (rtl_rules) {
149 x = prev_box->x - width;
151 tprintf(
"prev x = %d, width=%d\n", prev_box->x, width);
155 if (prev_i + 1 == i) {
158 new_box->
AddBox(x, y, width, height);
159 new_box->page_ = (*boxes)[i]->page_;
160 boxes->insert(boxes->begin() + i, new_box);
163 (*boxes)[i - 1]->AddBox(x, y, width, height);
164 (*boxes)[i - 1]->ch_ =
"\t";
167 }
else if (shift > max_shift) {
178 std::vector<BoxChar*>* boxes) {
181 for (
size_t i = 1; i + 1 < boxes->size(); ++i) {
182 Box*
box = (*boxes)[i]->box_;
183 if (
box ==
nullptr) {
184 Box* prev = (*boxes)[i - 1]->box_;
185 Box* next = (*boxes)[i + 1]->box_;
187 int top = std::min(prev->y, next->y);
188 int bottom = std::max(prev->y + prev->h, next->y + next->h);
189 int left = prev->x + prev->w;
191 if (vertical_rules) {
192 top = prev->y + prev->h;
194 left = std::min(prev->x, next->x);
195 right = std::max(prev->x + prev->w, next->x + next->w);
196 }
else if (rtl_rules) {
201 left = next->x + next->w;
203 j >= 0 && (*boxes)[j]->ch_ !=
" " && (*boxes)[j]->ch_ !=
"\t";
205 prev = (*boxes)[j]->box_;
207 if (prev->x < right) {
213 for (
size_t j = i + 2;
214 j < boxes->size() && (*boxes)[j]->box_ !=
nullptr &&
215 (*boxes)[j]->ch_ !=
"\t";
217 next = (*boxes)[j]->box_;
218 if (next->x + next->w > left) {
219 left = next->x + next->w;
225 if (right <= left) right = left + 1;
226 if (bottom <= top) bottom = top + 1;
227 (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
228 (*boxes)[i]->ch_ =
" ";
240 size_t num_boxes = boxes->size();
241 for (
size_t i = 0; i < num_boxes; ++i) {
242 int num_rtl = 0, num_ltr = 0;
243 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
244 if (num_rtl > num_ltr) {
245 (*boxes)[i]->set_rtl_index(i);
246 (*boxes)[i]->ReverseUnicodesInBox();
251 for (
size_t start = 0; start < boxes->size(); start = end + 1) {
253 while (end < boxes->size() && (*boxes)[end]->ch_ !=
"\t") ++end;
254 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
261 int num_rtl = 0, num_ltr = 0;
262 for (
size_t i = 0; i < boxes.size(); ++i) {
263 boxes[i]->GetDirection(&num_rtl, &num_ltr);
265 return num_rtl > num_ltr;
271 int64_t total_dx = 0, total_dy = 0;
272 for (
size_t i = 1; i < boxes.size(); ++i) {
273 if (boxes[i - 1]->box_ !=
nullptr && boxes[i]->box_ !=
nullptr &&
274 boxes[i - 1]->page_ == boxes[i]->page_) {
275 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
276 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
284 return total_dy > total_dx;
290 int total_length = 0;
291 for (
size_t i = 0; i < boxes.size(); ++i)
292 total_length += boxes[i]->ch_.size();
300 int start_box,
int end_box,
301 std::vector<BoxChar*>* boxes) {
302 Boxa* orig = boxaCreate(0);
303 for (
int i = start_box; i < end_box; ++i) {
304 BOX*
box = (*boxes)[i]->box_;
305 if (
box) boxaAddBox(orig,
box, L_CLONE);
307 Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
309 for (
int i = start_box, box_ind = 0; i < end_box; ++i) {
310 if ((*boxes)[i]->box_) {
311 boxDestroy(&((*boxes)[i]->box_));
312 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
315 boxaDestroy(&rotated);
321 const std::vector<BoxChar*>& boxes) {
328 const std::vector<BoxChar*>& boxes) {
331 for (
size_t i = 0; i < boxes.size(); ++i) {
332 const Box*
box = boxes[i]->box_;
333 if (
box ==
nullptr) {
334 tprintf(
"Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
339 boxes[i]->ch_.c_str(),
box->x, height -
box->y -
box->h,
340 box->x +
box->w, height -
box->y, boxes[i]->page_);
341 output.append(buffer, nbytes);