tesseract  4.0.0-1-g2a2b
stringrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: stringrenderer.cpp
3  * Description: Class for rendering UTF-8 text to an image, and retrieving
4  * bounding boxes around each grapheme cluster.
5  * Author: Ranjith Unnikrishnan
6  * Created: Mon Nov 18 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "stringrenderer.h"
22 
23 #include <cassert>
24 #include <cstdio>
25 #include <cstring>
26 #include <algorithm>
27 #include <map>
28 #include <utility>
29 #include <vector>
30 
31 #include "allheaders.h" // from leptonica
32 #include "boxchar.h"
33 #include "ligature_table.h"
34 #include "normstrngs.h"
35 #include "pango/pango-font.h"
36 #include "pango/pango-glyph-item.h"
37 #include "tlog.h"
38 #include "unichar.h"
39 #include "unicode/uchar.h" // from libicu
40 #include "util.h"
41 
42 namespace tesseract {
43 
// Resolution handed to set_resolution() by the StringRenderer constructor.
static const int kDefaultOutputResolution = 300;

// Word joiner (U+2060) inserted after letters in ngram mode, as per
// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
// hyphens and other non-alpha characters.
// Both the characters and the pointer are const so that the constant cannot
// be re-seated by accident.
static const char* const kWordJoinerUTF8 = "\u2060";
50 
51 static bool IsCombiner(int ch) {
52  const int char_type = u_charType(ch);
53  return ((char_type == U_NON_SPACING_MARK) ||
54  (char_type == U_ENCLOSING_MARK) ||
55  (char_type == U_COMBINING_SPACING_MARK));
56 }
57 
58 static std::string EncodeAsUTF8(const char32 ch32) {
59  UNICHAR uni_ch(ch32);
60  return std::string(uni_ch.utf8(), uni_ch.utf8_len());
61 }
62 
63 // Returns true with probability 'prob'.
64 static bool RandBool(const double prob, TRand* rand) {
65  if (prob == 1.0) return true;
66  if (prob == 0.0) return false;
67  return rand->UnsignedRand(1.0) < prob;
68 }
69 
70 /* static */
71 static Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
72  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
73  printf("Unexpected surface format %d\n",
74  cairo_image_surface_get_format(surface));
75  return nullptr;
76  }
77  const int width = cairo_image_surface_get_width(surface);
78  const int height = cairo_image_surface_get_height(surface);
79  Pix* pix = pixCreate(width, height, 32);
80  int byte_stride = cairo_image_surface_get_stride(surface);
81 
82  for (int i = 0; i < height; ++i) {
83  memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
84  cairo_image_surface_get_data(surface) + i * byte_stride,
85  byte_stride - ((i == height - 1) ? 1 : 0));
86  }
87  return pix;
88 }
89 
90 StringRenderer::StringRenderer(const std::string& font_desc, int page_width,
91  int page_height)
92  : font_(font_desc),
93  page_width_(page_width),
94  page_height_(page_height),
95  h_margin_(50),
96  v_margin_(50),
97  pen_color_{0.0, 0.0, 0.0},
98  char_spacing_(0),
99  leading_(0),
100  vertical_text_(false),
101  gravity_hint_strong_(false),
102  render_fullwidth_latin_(false),
103  underline_start_prob_(0),
104  underline_continuation_prob_(0),
105  underline_style_(PANGO_UNDERLINE_SINGLE),
106  features_(nullptr),
107  drop_uncovered_chars_(true),
108  strip_unrenderable_words_(false),
109  add_ligatures_(false),
110  output_word_boxes_(false),
111  surface_(nullptr),
112  cr_(nullptr),
113  layout_(nullptr),
114  start_box_(0),
115  page_(0),
116  box_padding_(0),
117  page_boxes_(nullptr),
118  total_chars_(0),
119  font_index_(0),
120  last_offset_(0) {
121  set_resolution(kDefaultOutputResolution);
122  set_font(font_desc);
123 }
124 
125 bool StringRenderer::set_font(const std::string& desc) {
126  bool success = font_.ParseFontDescriptionName(desc);
128  return success;
129 }
130 
131 void StringRenderer::set_resolution(const int resolution) {
132  resolution_ = resolution;
133  font_.set_resolution(resolution);
134 }
135 
137  underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
138 }
139 
141  underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
142 }
143 
145  free(features_);
146  ClearBoxes();
147  FreePangoCairo();
148 }
149 
151  FreePangoCairo();
152  surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
153  page_height_);
154  cr_ = cairo_create(surface_);
155  {
157  layout_ = pango_cairo_create_layout(cr_);
158  }
159 
160  if (vertical_text_) {
161  PangoContext* context = pango_layout_get_context(layout_);
162  pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
163  if (gravity_hint_strong_) {
164  pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
165  }
166  pango_layout_context_changed(layout_);
167  }
168 
170 }
171 
173  std::string font_desc = font_.DescriptionName();
174  // Specify the font via a description name
175  PangoFontDescription *desc =
176  pango_font_description_from_string(font_desc.c_str());
177  // Assign the font description to the layout
178  pango_layout_set_font_description(layout_, desc);
179  pango_font_description_free(desc); // free the description
180  pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
181  resolution_);
182 
183  int max_width = page_width_ - 2 * h_margin_;
184  int max_height = page_height_ - 2 * v_margin_;
185  tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
186  if (vertical_text_) {
187  using std::swap;
188  swap(max_width, max_height);
189  }
190  pango_layout_set_width(layout_, max_width * PANGO_SCALE);
191  // Ultra-wide Thai strings need to wrap at char level.
192  pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
193 
194  // Adjust character spacing
195  PangoAttrList* attr_list = pango_attr_list_new();
196  if (char_spacing_) {
197  PangoAttribute* spacing_attr =
198  pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
199  spacing_attr->start_index = 0;
200  spacing_attr->end_index = static_cast<guint>(-1);
201  pango_attr_list_change(attr_list, spacing_attr);
202  }
203 #if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38)
204  if (add_ligatures_) {
205  set_features("liga, clig, dlig, hlig");
206  PangoAttribute* feature_attr = pango_attr_font_features_new(features_);
207  pango_attr_list_change(attr_list, feature_attr);
208  }
209 #endif
210  pango_layout_set_attributes(layout_, attr_list);
211  pango_attr_list_unref(attr_list);
212  // Adjust line spacing
213  if (leading_) {
214  pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
215  }
216 }
217 
219  if (layout_) {
220  g_object_unref(layout_);
221  layout_ = nullptr;
222  }
223  if (cr_) {
224  cairo_destroy(cr_);
225  cr_ = nullptr;
226  }
227  if (surface_) {
228  cairo_surface_destroy(surface_);
229  surface_ = nullptr;
230  }
231 }
232 
233 void StringRenderer::SetWordUnderlineAttributes(const std::string& page_text) {
234  if (underline_start_prob_ == 0) return;
235  PangoAttrList* attr_list = pango_layout_get_attributes(layout_);
236 
237  const char* text = page_text.c_str();
238  size_t offset = 0;
239  TRand rand;
240  bool started_underline = false;
241  PangoAttribute* und_attr = nullptr;
242 
243  while (offset < page_text.length()) {
244  offset += SpanUTF8Whitespace(text + offset);
245  if (offset == page_text.length()) break;
246 
247  int word_start = offset;
248  int word_len = SpanUTF8NotWhitespace(text + offset);
249  offset += word_len;
250  if (started_underline) {
251  // Should we continue the underline to the next word?
252  if (RandBool(underline_continuation_prob_, &rand)) {
253  // Continue the current underline to this word.
254  und_attr->end_index = word_start + word_len;
255  } else {
256  // Otherwise end the current underline attribute at the end of the
257  // previous word.
258  pango_attr_list_insert(attr_list, und_attr);
259  started_underline = false;
260  und_attr = nullptr;
261  }
262  }
263  if (!started_underline && RandBool(underline_start_prob_, &rand)) {
264  // Start a new underline attribute
265  und_attr = pango_attr_underline_new(underline_style_);
266  und_attr->start_index = word_start;
267  und_attr->end_index = word_start + word_len;
268  started_underline = true;
269  }
270  }
271  // Finish the current underline attribute at the end of the page.
272  if (started_underline) {
273  und_attr->end_index = page_text.length();
274  pango_attr_list_insert(attr_list, und_attr);
275  }
276 }
277 
278 // Returns offset in utf8 bytes to first page.
280  int text_length) {
281  if (!text_length) return 0;
282  const int max_height = (page_height_ - 2 * v_margin_);
283  const int max_width = (page_width_ - 2 * h_margin_);
284  const int max_layout_height = vertical_text_ ? max_width : max_height;
285 
286  UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
287  const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
288  const int kMaxUnicodeBufLength = 15000;
289  for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
290  int buf_length = it.utf8_data() - text;
291  tlog(1, "len = %d buf_len = %d\n", text_length, buf_length);
292  pango_layout_set_text(layout_, text, buf_length);
293 
294  PangoLayoutIter* line_iter = nullptr;
295  { // Fontconfig caches some info here that is not freed before exit.
297  line_iter = pango_layout_get_iter(layout_);
298  }
299  bool first_page = true;
300  int page_top = 0;
301  int offset = buf_length;
302  do {
303  // Get bounding box of the current line
304  PangoRectangle line_ink_rect;
305  pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);
306  pango_extents_to_pixels(&line_ink_rect, nullptr);
307  PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
308  if (first_page) {
309  page_top = line_ink_rect.y;
310  first_page = false;
311  }
312  int line_bottom = line_ink_rect.y + line_ink_rect.height;
313  if (line_bottom - page_top > max_layout_height) {
314  offset = line->start_index;
315  tlog(1, "Found offset = %d\n", offset);
316  break;
317  }
318  } while (pango_layout_iter_next_line(line_iter));
319  pango_layout_iter_free(line_iter);
320  return offset;
321 }
322 
323 const std::vector<BoxChar*>& StringRenderer::GetBoxes() const {
324  return boxchars_;
325 }
326 
328  return page_boxes_;
329 }
330 
331 void StringRenderer::RotatePageBoxes(float rotation) {
332  BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2,
333  start_box_, boxchars_.size(), &boxchars_);
334 }
335 
336 
338  for (size_t i = 0; i < boxchars_.size(); ++i) delete boxchars_[i];
339  boxchars_.clear();
340  boxaDestroy(&page_boxes_);
341 }
342 
346 }
347 
348 void StringRenderer::WriteAllBoxes(const std::string& filename) {
351 }
352 
353 // Returns cluster strings in logical order.
354 bool StringRenderer::GetClusterStrings(std::vector<std::string>* cluster_text) {
355  std::map<int, std::string> start_byte_to_text;
356  PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
357  const char* full_text = pango_layout_get_text(layout_);
358  do {
359  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
360  if (!run) {
361  // End of line nullptr run marker
362  tlog(2, "Found end of line marker\n");
363  continue;
364  }
365  PangoGlyphItemIter cluster_iter;
366  gboolean have_cluster;
367  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
368  run, full_text);
369  have_cluster;
370  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
371  const int start_byte_index = cluster_iter.start_index;
372  const int end_byte_index = cluster_iter.end_index;
373  std::string text = std::string(full_text + start_byte_index,
374  end_byte_index - start_byte_index);
375  if (IsUTF8Whitespace(text.c_str())) {
376  tlog(2, "Found whitespace\n");
377  text = " ";
378  }
379  tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
380  end_byte_index, text.c_str());
381  if (add_ligatures_) {
382  // Make sure the output box files have ligatured text in case the font
383  // decided to use an unmapped glyph.
384  text = LigatureTable::Get()->AddLigatures(text, nullptr);
385  }
386  start_byte_to_text[start_byte_index] = text;
387  }
388  } while (pango_layout_iter_next_run(run_iter));
389  pango_layout_iter_free(run_iter);
390 
391  cluster_text->clear();
392  for (std::map<int, std::string>::const_iterator it = start_byte_to_text.begin();
393  it != start_byte_to_text.end(); ++it) {
394  cluster_text->push_back(it->second);
395  }
396  return !cluster_text->empty();
397 }
398 
399 // Merges an array of BoxChars into words based on the identification of
400 // BoxChars containing the space character as inter-word separators.
401 //
402 // Sometime two adjacent characters in the sequence may be detected as lying on
403 // different lines based on their spatial positions. This may be the result of a
404 // newline character at end of the last word on a line in the source text, or of
405 // a discretionary line-break created by Pango at intra-word locations like
406 // hyphens. When this is detected the word is split at that location into
407 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
408 // its bounding box.
409 static void MergeBoxCharsToWords(std::vector<BoxChar*>* boxchars) {
410  std::vector<BoxChar*> result;
411  bool started_word = false;
412  for (size_t i = 0; i < boxchars->size(); ++i) {
413  if (boxchars->at(i)->ch() == " " || boxchars->at(i)->box() == nullptr) {
414  result.push_back(boxchars->at(i));
415  boxchars->at(i) = nullptr;
416  started_word = false;
417  continue;
418  }
419 
420  if (!started_word) {
421  // Begin new word
422  started_word = true;
423  result.push_back(boxchars->at(i));
424  boxchars->at(i) = nullptr;
425  } else {
426  BoxChar* last_boxchar = result.back();
427  // Compute bounding box union
428  const Box* box = boxchars->at(i)->box();
429  Box* last_box = last_boxchar->mutable_box();
430  int left = std::min(last_box->x, box->x);
431  int right = std::max(last_box->x + last_box->w, box->x + box->w);
432  int top = std::min(last_box->y, box->y);
433  int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
434  // Conclude that the word was broken to span multiple lines based on the
435  // size of the merged bounding box in relation to those of the individual
436  // characters seen so far.
437  if (right - left > last_box->w + 5 * box->w) {
438  tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
439  // Insert a fake interword space and start a new word with the current
440  // boxchar.
441  result.push_back(new BoxChar(" ", 1));
442  result.push_back(boxchars->at(i));
443  boxchars->at(i) = nullptr;
444  continue;
445  }
446  // Append to last word
447  last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
448  last_box->x = left;
449  last_box->w = right - left;
450  last_box->y = top;
451  last_box->h = bottom - top;
452  delete boxchars->at(i);
453  boxchars->at(i) = nullptr;
454  }
455  }
456  boxchars->swap(result);
457 }
458 
459 
461  const char* text = pango_layout_get_text(layout_);
462  PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
463 
464  // Do a first pass to store cluster start indexes.
465  std::vector<int> cluster_start_indices;
466  do {
467  cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
468  tlog(3, "Added %d\n", cluster_start_indices.back());
469  } while (pango_layout_iter_next_cluster(cluster_iter));
470  pango_layout_iter_free(cluster_iter);
471  cluster_start_indices.push_back(strlen(text));
472  tlog(3, "Added last index %d\n", cluster_start_indices.back());
473  // Sort the indices and create a map from start to end indices.
474  std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
475  std::map<int, int> cluster_start_to_end_index;
476  for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
477  cluster_start_to_end_index[cluster_start_indices[i]]
478  = cluster_start_indices[i + 1];
479  }
480 
481  // Iterate again to compute cluster boxes and their text with the obtained
482  // cluster extent information.
483  cluster_iter = pango_layout_get_iter(layout_);
484  // Store BoxChars* sorted by their byte start positions
485  std::map<int, BoxChar*> start_byte_to_box;
486  do {
487  PangoRectangle cluster_rect;
488  pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
489  pango_extents_to_pixels(&cluster_rect, nullptr);
490  const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491  const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492  std::string cluster_text = std::string(text + start_byte_index,
493  end_byte_index - start_byte_index);
494  if (!cluster_text.empty() && cluster_text[0] == '\n') {
495  tlog(2, "Skipping newlines at start of text.\n");
496  continue;
497  }
498  if (!cluster_rect.width || !cluster_rect.height ||
499  IsUTF8Whitespace(cluster_text.c_str())) {
500  tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
501  cluster_rect.width, cluster_rect.height, cluster_text.c_str());
502  BoxChar* boxchar = new BoxChar(" ", 1);
503  boxchar->set_page(page_);
504  start_byte_to_box[start_byte_index] = boxchar;
505  continue;
506  }
507  // Prepare a boxchar for addition at this byte position.
508  tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
509  cluster_rect.x, cluster_rect.y,
510  cluster_rect.width, cluster_rect.height,
511  start_byte_index, end_byte_index,
512  cluster_text.c_str());
513  ASSERT_HOST_MSG(cluster_rect.width,
514  "cluster_text:%s start_byte_index:%d\n",
515  cluster_text.c_str(), start_byte_index);
516  ASSERT_HOST_MSG(cluster_rect.height,
517  "cluster_text:%s start_byte_index:%d\n",
518  cluster_text.c_str(), start_byte_index);
519  if (box_padding_) {
520  cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
521  cluster_rect.width += 2 * box_padding_;
522  cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
523  cluster_rect.height += 2 * box_padding_;
524  }
525  if (add_ligatures_) {
526  // Make sure the output box files have ligatured text in case the font
527  // decided to use an unmapped glyph.
528  cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
529  }
530  BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
531  boxchar->set_page(page_);
532  boxchar->AddBox(cluster_rect.x, cluster_rect.y,
533  cluster_rect.width, cluster_rect.height);
534  start_byte_to_box[start_byte_index] = boxchar;
535  } while (pango_layout_iter_next_cluster(cluster_iter));
536  pango_layout_iter_free(cluster_iter);
537 
538  // There is a subtle bug in the cluster text reported by the PangoLayoutIter
539  // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
540  // around this, we use text reported using the PangoGlyphIter which is
541  // accurate.
542  // TODO(ranjith): Revisit whether this is still needed in newer versions of
543  // pango.
544  std::vector<std::string> cluster_text;
545  if (GetClusterStrings(&cluster_text)) {
546  ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
547  int ind = 0;
548  for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
549  it != start_byte_to_box.end(); ++it, ++ind) {
550  it->second->mutable_ch()->swap(cluster_text[ind]);
551  }
552  }
553 
554  // Append to the boxchars list in byte order.
555  std::vector<BoxChar*> page_boxchars;
556  page_boxchars.reserve(start_byte_to_box.size());
557  std::string last_ch;
558  for (std::map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
559  it != start_byte_to_box.end(); ++it) {
560  if (it->second->ch() == kWordJoinerUTF8) {
561  // Skip zero-width joiner characters (ZWJs) here.
562  delete it->second;
563  } else {
564  page_boxchars.push_back(it->second);
565  }
566  }
567  CorrectBoxPositionsToLayout(&page_boxchars);
568 
570  for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
571  it != start_byte_to_box.end(); ++it) {
572  // Convert fullwidth Latin characters to their halfwidth forms.
573  std::string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
574  it->second->mutable_ch()->swap(half);
575  }
576  }
577 
578  // Merge the character boxes into word boxes if we are rendering n-grams.
579  if (output_word_boxes_) {
580  MergeBoxCharsToWords(&page_boxchars);
581  }
582 
583  boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
584 
585  // Compute the page bounding box
586  Box* page_box = nullptr;
587  Boxa* all_boxes = nullptr;
588  for (size_t i = 0; i < page_boxchars.size(); ++i) {
589  if (page_boxchars[i]->box() == nullptr) continue;
590  if (all_boxes == nullptr) all_boxes = boxaCreate(0);
591  boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
592  }
593  if (all_boxes != nullptr) {
594  boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
595  boxaDestroy(&all_boxes);
596  if (page_boxes_ == nullptr) page_boxes_ = boxaCreate(0);
597  boxaAddBox(page_boxes_, page_box, L_INSERT);
598  }
599 }
600 
601 
603  std::vector<BoxChar*>* boxchars) {
604  if (vertical_text_) {
605  const double rotation = - pango_gravity_to_rotation(
606  pango_context_get_base_gravity(pango_layout_get_context(layout_)));
609  0, boxchars->size(), boxchars);
610  } else {
612  }
613 }
614 
615 int StringRenderer::StripUnrenderableWords(std::string* utf8_text) const {
616  std::string output_text;
617  const char* text = utf8_text->c_str();
618  size_t offset = 0;
619  int num_dropped = 0;
620  while (offset < utf8_text->length()) {
621  int space_len = SpanUTF8Whitespace(text + offset);
622  output_text.append(text + offset, space_len);
623  offset += space_len;
624  if (offset == utf8_text->length()) break;
625 
626  int word_len = SpanUTF8NotWhitespace(text + offset);
627  if (font_.CanRenderString(text + offset, word_len)) {
628  output_text.append(text + offset, word_len);
629  } else {
630  ++num_dropped;
631  }
632  offset += word_len;
633  }
634  utf8_text->swap(output_text);
635 
636  if (num_dropped > 0) {
637  tprintf("Stripped %d unrenderable words\n", num_dropped);
638  }
639  return num_dropped;
640 }
641 
642 int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length,
643  Pix** pix) {
644  Pix* orig_pix = nullptr;
645  int offset = RenderToImage(text, text_length, &orig_pix);
646  if (orig_pix) {
647  *pix = pixConvertTo8(orig_pix, false);
648  pixDestroy(&orig_pix);
649  }
650  return offset;
651 }
652 
653 int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
654  int threshold, Pix** pix) {
655  Pix* orig_pix = nullptr;
656  int offset = RenderToImage(text, text_length, &orig_pix);
657  if (orig_pix) {
658  Pix* gray_pix = pixConvertTo8(orig_pix, false);
659  pixDestroy(&orig_pix);
660  *pix = pixThresholdToBinary(gray_pix, threshold);
661  pixDestroy(&gray_pix);
662  } else {
663  *pix = orig_pix;
664  }
665  return offset;
666 }
667 
668 // Add word joiner (WJ) characters between adjacent non-space characters except
669 // immediately before a combiner.
670 /* static */
671 std::string StringRenderer::InsertWordJoiners(const std::string& text) {
672  std::string out_str;
673  const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
674  text.length());
675  for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
676  it != it_end; ++it) {
677  // Add the symbol to the output string.
678  out_str.append(it.utf8_data(), it.utf8_len());
679  // Check the next symbol.
680  UNICHAR::const_iterator next_it = it;
681  ++next_it;
682  bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
683  bool next_char_is_combiner = (next_it == it_end) ?
684  false : IsCombiner(*next_it);
685  if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
686  !next_char_is_combiner) {
687  out_str += kWordJoinerUTF8;
688  }
689  }
690  return out_str;
691 }
692 
693 // Convert halfwidth Basic Latin characters to their fullwidth forms.
694 std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string& str) {
695  std::string full_str;
696  const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
697  str.length());
698  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
699  it != it_end; ++it) {
700  // Convert printable and non-space 7-bit ASCII characters to
701  // their fullwidth forms.
702  if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
703  // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
704  char32 full_char = *it + 0xFEE0;
705  full_str.append(EncodeAsUTF8(full_char));
706  } else {
707  full_str.append(it.utf8_data(), it.utf8_len());
708  }
709  }
710  return full_str;
711 }
712 
713 // Convert fullwidth Latin characters to their halfwidth forms.
714 std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string& str) {
715  std::string half_str;
716  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
717  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
718  it != it_end; ++it) {
719  char32 half_char = FullwidthToHalfwidth(*it);
720  // Convert fullwidth Latin characters to their halfwidth forms
721  // only if halfwidth forms are printable and non-space 7-bit ASCII.
722  if (IsInterchangeValid7BitAscii(half_char) &&
723  isprint(half_char) && !isspace(half_char)) {
724  half_str.append(EncodeAsUTF8(half_char));
725  } else {
726  half_str.append(it.utf8_data(), it.utf8_len());
727  }
728  }
729  return half_str;
730 }
731 
732 // Returns offset to end of text substring rendered in this method.
733 int StringRenderer::RenderToImage(const char* text, int text_length,
734  Pix** pix) {
735  if (pix && *pix) pixDestroy(pix);
736  InitPangoCairo();
737 
738  const int page_offset = FindFirstPageBreakOffset(text, text_length);
739  if (!page_offset) {
740  return 0;
741  }
742  start_box_ = boxchars_.size();
743 
744  if (!vertical_text_) {
745  // Translate by the specified margin
746  cairo_translate(cr_, h_margin_, v_margin_);
747  } else {
748  // Vertical text rendering is achieved by a two-step process of first
749  // performing regular horizontal layout with character orientation set to
750  // EAST, and then translating and rotating the layout before rendering onto
751  // the desired image surface. The settings required for the former step are
752  // done within InitPangoCairo().
753  //
754  // Translate to the top-right margin of page
755  cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
756  // Rotate the layout
757  double rotation = - pango_gravity_to_rotation(
758  pango_context_get_base_gravity(pango_layout_get_context(layout_)));
759  tlog(2, "Rotating by %f radians\n", rotation);
760  cairo_rotate(cr_, rotation);
761  pango_cairo_update_layout(cr_, layout_);
762  }
763  std::string page_text(text, page_offset);
765  // Convert Basic Latin to their fullwidth forms.
766  page_text = ConvertBasicLatinToFullwidthLatin(page_text);
767  }
769  StripUnrenderableWords(&page_text);
770  }
771  if (drop_uncovered_chars_ &&
772  !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
773  int num_dropped = font_.DropUncoveredChars(&page_text);
774  if (num_dropped) {
775  tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
776  }
777  }
778  if (add_ligatures_) {
779  // Add ligatures wherever possible, including custom ligatures.
780  page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
781  }
782  if (underline_start_prob_ > 0) {
783  SetWordUnderlineAttributes(page_text);
784  }
785 
786  pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
787 
788  if (pix) {
789  // Set a white background for the target image surface.
790  cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
791  // Fill the surface with the active colour (if you don't do this, you will
792  // be given a surface with a transparent background to draw on)
793  cairo_paint(cr_);
794  // Set the ink color to black
795  cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
796  // If the target surface or transformation properties of the cairo instance
797  // have changed, update the pango layout to reflect this
798  pango_cairo_update_layout(cr_, layout_);
799  {
800  DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
801  // Draw the pango layout onto the cairo surface
802  pango_cairo_show_layout(cr_, layout_);
803  }
804  *pix = CairoARGB32ToPixFormat(surface_);
805  }
807  FreePangoCairo();
808  // Update internal state variables.
809  ++page_;
810  return page_offset;
811 }
812 
813 // Render a string to an image, returning it as an 8 bit pix. Behaves as
814 // RenderString, except that it ignores the font set at construction and works
815 // through all the fonts, returning 0 until they are exhausted, at which point
816 // it returns the value it should have returned all along, but no pix this time.
817 // Fonts that don't contain a given proportion of the characters in the string
818 // get skipped.
819 // Fonts that work each get rendered and the font name gets added
820 // to the image.
821 // NOTE that no boxes are produced by this function.
822 //
823 // Example usage: To render a null terminated char-array "txt"
824 //
825 // int offset = 0;
826 // do {
827 // Pix *pix;
828 // offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
829 // strlen(txt + offset), nullptr,
830 // &pix);
831 // ...
832 // } while (offset < strlen(text));
833 //
835  const char* text, int text_length,
836  std::string* font_used, Pix** image) {
837  *image = nullptr;
838  // Select a suitable font to render the title with.
839  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
840  std::string title_font;
841  if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
842  &title_font, nullptr)) {
843  tprintf("WARNING: Could not find a font to render image title with!\n");
844  title_font = "Arial";
845  }
846  title_font += " 8";
847  tlog(1, "Selected title font: %s\n", title_font.c_str());
848  if (font_used) font_used->clear();
849 
850  std::string orig_font = font_.DescriptionName();
851  if (char_map_.empty()) {
852  total_chars_ = 0;
853  // Fill the hash table and use that for computing which fonts to use.
854  for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
855  it != UNICHAR::end(text, text_length); ++it) {
856  ++total_chars_;
857  ++char_map_[*it];
858  }
859  tprintf("Total chars = %d\n", total_chars_);
860  }
861  const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts();
862 
863  for (size_t i = font_index_; i < all_fonts.size(); ++i) {
864  ++font_index_;
865  int raw_score = 0;
866  int ok_chars =
867  FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
868  if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
869  set_font(all_fonts[i]);
870  int offset = RenderToBinaryImage(text, text_length, 128, image);
871  ClearBoxes(); // Get rid of them as they are garbage.
872  const int kMaxTitleLength = 1024;
873  char title[kMaxTitleLength];
874  snprintf(title, kMaxTitleLength, kTitleTemplate,
875  all_fonts[i].c_str(), ok_chars,
876  100.0 * ok_chars / total_chars_, raw_score,
877  100.0 * raw_score / char_map_.size());
878  tprintf("%s\n", title);
879  // This is a good font! Store the offset to return once we've tried all
880  // the fonts.
881  if (offset) {
882  last_offset_ = offset;
883  if (font_used) *font_used = all_fonts[i];
884  }
885  // Add the font to the image.
886  set_font(title_font);
887  v_margin_ /= 8;
888  Pix* title_image = nullptr;
889  RenderToBinaryImage(title, strlen(title), 128, &title_image);
890  pixOr(*image, *image, title_image);
891  pixDestroy(&title_image);
892 
893  v_margin_ *= 8;
894  set_font(orig_font);
895  // We return the real offset only after cycling through the list of fonts.
896  return 0;
897  } else {
898  tprintf("Font %s failed with %d hits = %.2f%%\n",
899  all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
900  }
901  }
902  font_index_ = 0;
903  char_map_.clear();
904  return last_offset_ == 0 ? -1 : last_offset_;
905 }
906 
907 } // namespace tesseract
signed int char32
bool GetClusterStrings(std::vector< std::string > *cluster_text)
static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text)
int RenderAllFontsToImage(double min_coverage, const char *text, int text_length, std::string *font_used, Pix **pix)
int RenderToGrayscaleImage(const char *text, int text_length, Pix **pix)
PangoUnderline underline_style_
void set_underline_continuation_prob(const double frac)
static void PrepareToWrite(std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:97
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:62
const char * utf8_data() const
Definition: unichar.h:136
void set_resolution(const int resolution)
static std::string InsertWordJoiners(const std::string &text)
signed int char32
Definition: unichar.h:52
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:83
cairo_surface_t * surface_
const std::vector< BoxChar * > & GetBoxes() const
void RotatePageBoxes(float rotation)
std::string DescriptionName() const
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
std::unordered_map< char32, int64_t > char_map_
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
int FindFirstPageBreakOffset(const char *text, int text_length)
void set_resolution(const int resolution)
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
#define tlog(level,...)
Definition: tlog.h:33
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:46
void set_page(int page)
Definition: boxchar.h:52
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:327
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text)
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void WriteAllBoxes(const std::string &filename)
std::vector< BoxChar * > boxchars_
bool set_font(const std::string &desc)
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:320
static LigatureTable * Get()
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
void SetWordUnderlineAttributes(const std::string &page_text)
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
void set_underline_start_prob(const double frac)
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206
void set_features(const char *features)
void CorrectBoxPositionsToLayout(std::vector< BoxChar *> *boxchars)
bool ParseFontDescriptionName(const std::string &name)
StringRenderer(const std::string &font_desc, int page_width, int page_height)
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:243
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:299
int StripUnrenderableWords(std::string *utf8_text) const
int RenderToBinaryImage(const char *text, int text_length, int threshold, Pix **pix)
static const std::vector< std::string > & ListAvailableFonts()
int RenderToImage(const char *text, int text_length, Pix **pix)
int DropUncoveredChars(std::string *utf8_text) const
#define ASSERT_HOST(x)
Definition: errcode.h:84