20 #include "config_auto.h"
26 #include "allheaders.h"
// Width divisor shared by the text layer and the font objects: the font's
// default width (/DW) and its FontBBox width are written as
// 1000 / kCharWidth, and the horizontal stretch (Tz) of each word is
// multiplied by kCharWidth.
169 static const int kCharWidth = 2;
// Size in bytes of the scratch buffer that receives the hexadecimal
// UTF-16BE text of a single codepoint; also used as the snprintf bound in
// CodepointToUtf16be.
174 static const int kMaxBytesPerCodepoint = 20;
184 textonly_ = textonly;
// Accounts for one PDF object of objectsize bytes. Callers in this file
// invoke it after they have emitted the object's bytes themselves (e.g. via
// AppendData for compressed streams) and pass the total byte count.
// NOTE(review): the body is not visible here - presumably it records the
// offset of the next object and advances the object counter; confirm.
188 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
// Handles a PDF object supplied as a NUL-terminated string. The object's
// size is taken with strlen(data), so data must not contain embedded NUL
// bytes; binary payloads in this file use AppendData followed by
// AppendPDFObjectDIY instead.
193 void TessPDFRenderer::AppendPDFObject(
const char *data) {
// Register the object using its textual length as the byte count.
194 AppendPDFObjectDIY(strlen(data));
// Rounds x to three decimal places. Callers stream prec(...) values into
// the PDF content stream to keep the emitted numbers short.
201 static double prec(
double x) {
// Scale factor: 1000 keeps three digits after the decimal point.
202 double kPrecision = 1000.0;
// Scale up, round to the nearest integer, then scale back down.
203 double a = round(x * kPrecision) / kPrecision;
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
//
// The coordinate deltas are widened to long *before* they are multiplied.
// Evaluating the squares in int and converting afterwards overflows (which
// is undefined behaviour for signed integers) as soon as a delta exceeds
// about 46340, even though the return type could hold the result.
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - static_cast<long>(x1);
  const long dy = static_cast<long>(y2) - static_cast<long>(y1);
  return dx * dx + dy * dy;
}
// Computes where a word starts on its text line and how long the word is.
//
// Inputs are the word baseline (word_x1, word_y1)-(word_x2, word_y2) and the
// line baseline (line_x1, line_y1)-(line_x2, line_y2). A point is projected
// onto the line baseline, and the word length is derived from the distance
// between the word baseline endpoints; *length receives that length.
// NOTE(review): x0/y0 are presumably set from the projected x/y - the
// assignments are not visible here; confirm.
221 static void GetWordBaseline(
int writing_direction,
int ppi,
int height,
222 int word_x1,
int word_y1,
int word_x2,
int word_y2,
223 int line_x1,
int line_y1,
int line_x2,
int line_y2,
224 double *x0,
double *y0,
double *length) {
// Exchange the two word baseline endpoints.
// NOTE(review): presumably done only for some writing directions - the
// guarding condition is not visible here; confirm.
226 Swap(&word_x1, &word_x2);
227 Swap(&word_y1, &word_y2);
// Squared length of the line baseline; used as the projection denominator.
234 double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
// t is the position of (px, py) projected onto the line baseline, measured
// from (line_x2, line_y2) in units of the baseline vector.
239 double t = ((px - line_x2) * (line_x2 - line_x1) +
240 (py - line_y2) * (line_y2 - line_y1)) / l2;
// Projected point on the line baseline.
241 x = line_x2 + t * (line_x2 - line_x1);
242 y = line_y2 + t * (line_y2 - line_y1);
// Word length is the distance between the word baseline endpoints.
244 word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
// Rescale by 72 / ppi (presumably pixels to PDF points; confirm ppi units).
246 word_length = word_length * 72.0 / ppi;
// Rescale y the same way and measure it from `height` instead of from zero,
// i.e. flip the vertical axis.
248 y = height - (y * 72.0 / ppi);
252 *length = word_length;
// Derives the four matrix coefficients (*a, *b, *c, *d) for text placed
// along the line baseline (line_x1, line_y1)-(line_x2, line_y2).
// The caller both writes these coefficients to the content stream and uses
// them to transform position deltas.
263 static void AffineMatrix(
int writing_direction,
264 int line_x1,
int line_y1,
int line_x2,
int line_y2,
265 double *a,
double *b,
double *c,
double *d) {
// Angle of the baseline. The y difference is taken as y1 - y2 (reversed
// relative to x), presumably because image y grows downward - confirm.
266 double theta = atan2(static_cast<double>(line_y1 - line_y2),
267 static_cast<double>(line_x2 - line_x1));
// The coefficients depend on the writing direction (cases not visible here).
272 switch(writing_direction) {
// Produces the line baseline (*line_x1, *line_y1)-(*line_x2, *line_y2) from
// the raw baseline (x1, y1)-(x2, y2), flattening baselines that are almost
// horizontal so that both endpoints share the same y.
292 static void ClipBaseline(
int ppi,
int x1,
int y1,
int x2,
int y2,
293 int *line_x1,
int *line_y1,
294 int *line_x2,
int *line_y2) {
// Vertical and horizontal extents scaled by 72, so that comparing against
// 2 * ppi is equivalent to comparing (extent * 72 / ppi) against 2
// (presumably 2 PDF points, assuming ppi is pixels per inch - confirm).
299 int rise = abs(y2 - y1) * 72;
300 int run = abs(x2 - x1) * 72;
// Small rise but larger run: snap both endpoints to the mean y.
301 if (rise < 2 * ppi && 2 * ppi < run)
302 *line_y1 = *line_y2 = (y1 + y2) / 2;
// Writes the UTF-16BE encoding of `code` into utf16 as uppercase hexadecimal
// text (four hex digits per 16-bit unit), not as raw bytes.
// Callers test the result in an `if`, so it presumably returns true when
// utf16 was filled and false for rejected codepoints - the return
// statements are not visible here; confirm.
305 static bool CodepointToUtf16be(
int code,
char utf16[kMaxBytesPerCodepoint]) {
// Reject the surrogate range U+D800..U+DFFF and anything above U+10FFFF.
306 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
307 tprintf(
"Dropping invalid codepoint %d\n", code);
// Codepoints below 0x10000 fit in a single 16-bit unit.
310 if (code < 0x10000) {
311 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
// Otherwise build a surrogate pair: subtract 0x10000, put the upper ten
// bits in the high surrogate and the lower ten bits in the low surrogate.
313 int a = code - 0x010000;
314 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
315 int low_surrogate = (0x03FF & a) + 0xDC00;
316 snprintf(utf16, kMaxBytesPerCodepoint,
317 "%04X%04X", high_surrogate, low_surrogate);
// Builds the PDF content-stream text for one page from the recognition
// results in `api`, using a page of size width x height.
//
// Returns a NUL-terminated buffer allocated with new[]; the caller in this
// file takes ownership of it through a std::unique_ptr<char[]>.
322 char* TessPDFRenderer::GetPDFTextObjects(
TessBaseAPI* api,
323 double width,
double height) {
// Resolution used to rescale pixel coordinates by 72 / ppi.
324 double ppi = api->GetSourceYResolution();
// State carried from one word to the next so that only changes are emitted.
327 double old_x = 0.0, old_y = 0.0;
328 int old_fontsize = 0;
331 bool new_block =
true;
338 std::stringstream pdf_str;
// Classic ("C") locale: number formatting must not follow the global locale.
340 pdf_str.imbue(std::locale::classic());
342 pdf_str.precision(8);
// Scale by the page width and height (cm), then paint the XObject /Im1 (Do).
347 pdf_str <<
"q " << prec(width) <<
" 0 0 " << prec(height) <<
" 0 0 cm";
349 pdf_str <<
" /Im1 Do";
358 ResultIterator *res_it = api->GetIterator();
// A new block opens a text object (BT) and sets text render mode 3
// (neither fill nor stroke in the PDF specification, i.e. invisible text).
360 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
361 pdf_str <<
"BT\n3 Tr";
// Line baseline, with nearly horizontal baselines flattened.
369 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
383 res_it->Orientation(&orientation, &writing_direction,
384 &textline_order, &deskew_angle);
386 switch (res_it->WordDirection()) {
394 writing_direction = old_writing_direction;
// Start position and length of the current word along the line baseline.
400 double x, y, word_length;
402 int word_x1, word_y1, word_x2, word_y2;
403 res_it->Baseline(
RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
404 GetWordBaseline(writing_direction, ppi, height,
405 word_x1, word_y1, word_x2, word_y2,
406 line_x1, line_y1, line_x2, line_y2,
407 &x, &y, &word_length);
// On a new block or a change of writing direction, recompute the matrix
// coefficients and write them out.
410 if (writing_direction != old_writing_direction || new_block) {
411 AffineMatrix(writing_direction,
412 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
413 pdf_str <<
" " << prec(a)
// Otherwise emit the offset from the previous word, transformed by the
// current coefficients.
422 double dx = x - old_x;
423 double dy = y - old_y;
424 pdf_str <<
" " << prec(dx * a + dy * b)
425 <<
" " << prec(dx * c + dy * d)
430 old_writing_direction = writing_direction;
437 bool bold, italic, underlined, monospace, serif, smallcaps;
439 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
440 &serif, &smallcaps, &fontsize, &font_id);
// Fallback font size.
// NOTE(review): the condition guarding this assignment is not visible here.
441 const int kDefaultFontsize = 8;
443 fontsize = kDefaultFontsize;
// Select the font (Tf) only when the size differs from the previous word.
444 if (fontsize != old_fontsize) {
445 pdf_str <<
"/f-0-0 " << fontsize <<
" Tf ";
446 old_fontsize = fontsize;
453 int pdf_word_len = 0;
455 const std::unique_ptr<const char[]> grapheme(
// Skip missing or empty graphemes.
457 if (grapheme && grapheme[0] !=
'\0') {
// Convert each codepoint to hexadecimal UTF-16BE text.
459 char utf16[kMaxBytesPerCodepoint];
460 for (
char32 code : unicodes) {
461 if (CodepointToUtf16be(code, utf16)) {
469 if (res_it->IsAtBeginningOf(
RIL_WORD)) {
// Horizontal stretch (Tz) computed from word length, font size and the
// number of emitted characters, multiplied by kCharWidth.
473 if (word_length > 0 && pdf_word_len > 0) {
475 kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
476 pdf_str << h_stretch <<
" Tz"
477 <<
" [ <" << pdf_word
480 if (last_word_in_line) {
483 if (last_word_in_block) {
// Copy the accumulated text, including its terminating NUL, into a new[]
// buffer.
488 char* result =
new char[text.length() + 1];
489 strcpy(result, text.c_str());
495 AppendPDFObject(
"%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
498 AppendPDFObject(
"1 0 obj\n"
510 AppendPDFObject(
"3 0 obj\n"
512 " /BaseFont /GlyphLessFont\n"
513 " /DescendantFonts [ 4 0 R ]\n"
514 " /Encoding /Identity-H\n"
516 " /ToUnicode 6 0 R\n"
522 std::stringstream stream;
524 stream.imbue(std::locale::classic());
528 " /BaseFont /GlyphLessFont\n"
529 " /CIDToGIDMap 5 0 R\n"
532 " /Ordering (Identity)\n"
533 " /Registry (Adobe)\n"
536 " /FontDescriptor 7 0 R\n"
537 " /Subtype /CIDFontType2\n"
539 " /DW " << (1000 / kCharWidth) <<
"\n"
542 AppendPDFObject(stream.str().c_str());
545 const int kCIDToGIDMapSize = 2 * (1 << 16);
546 const std::unique_ptr<unsigned char[]> cidtogidmap(
547 new unsigned char[kCIDToGIDMapSize]);
548 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
549 cidtogidmap[i] = (i % 2) ? 1 : 0;
552 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
557 " /Length " << len <<
" /Filter /FlateDecode\n"
561 long objsize = stream.str().size();
562 AppendData(reinterpret_cast<char *>(comp), len);
565 const char *endstream_endobj =
569 objsize += strlen(endstream_endobj);
570 AppendPDFObjectDIY(objsize);
572 const char stream2[] =
573 "/CIDInit /ProcSet findresource begin\n"
578 " /Registry (Adobe)\n"
582 "/CMapName /Adobe-Identify-UCS def\n"
584 "1 begincodespacerange\n"
586 "endcodespacerange\n"
588 "<0000> <FFFF> <0000>\n"
591 "CMapName currentdict /CMap defineresource pop\n"
599 "<< /Length " << (
sizeof(stream2) - 1) <<
" >>\n"
600 "stream\n" << stream2 <<
603 AppendPDFObject(stream.str().c_str());
614 " /FontBBox [ 0 0 " << (1000 / kCharWidth) <<
" 1000 ]\n"
615 " /FontFile2 8 0 R\n"
616 " /FontName /GlyphLessFont\n"
619 " /Type /FontDescriptor\n"
622 AppendPDFObject(stream.str().c_str());
625 stream << datadir_.c_str() <<
"/pdf.ttf";
626 FILE *fp = fopen(stream.str().c_str(),
"rb");
628 tprintf(
"Cannot open file \"%s\"!\n", stream.str().c_str());
631 fseek(fp, 0, SEEK_END);
632 auto size = std::ftell(fp);
637 fseek(fp, 0, SEEK_SET);
638 const std::unique_ptr<char[]> buffer(
new char[size]);
649 " /Length " << size <<
"\n"
650 " /Length1 " << size <<
"\n"
654 objsize = stream.str().size();
658 objsize += strlen(endstream_endobj);
659 AppendPDFObjectDIY(objsize);
// Encodes an image (given as `pix` and/or `filename`) as the bytes of a PDF
// image object.
//
// On entry *pdf_object is reset to nullptr and *pdf_object_size to 0. When
// the object is built, *pdf_object points to a new[] buffer holding the
// object's header text, colour-space text, image dictionary text, the
// compressed image data and a trailer, and *pdf_object_size is the total
// byte count. The buffer is binary, not NUL-terminated text.
// NOTE(review): the return statements are not visible here - presumably
// false on any failure and true on success; confirm.
663 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
664 const char* filename,
667 long int* pdf_object_size,
668 const int jpg_quality) {
// Both output pointers are required.
669 if (!pdf_object_size || !pdf_object)
671 *pdf_object =
nullptr;
672 *pdf_object_size = 0;
// At least one image source is required.
673 if (!filename && !pix)
676 L_Compressed_Data *cid =
nullptr;
// Images whose input format is PNG are flate-encoded from the Pix; all
// others go through l_generateCIDataForPdf with the given JPEG quality.
679 if (pixGetInputFormat(pix) == IFF_PNG)
680 sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
682 sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
686 l_CIDataDestroy(&cid);
// PDF /Filter name for the compressed data (selection logic not visible).
690 const char *group4 =
"";
694 filter =
"/FlateDecode";
697 filter =
"/DCTDecode";
700 filter =
"/CCITTFaxDecode";
704 filter =
"/JPXDecode";
707 l_CIDataDestroy(&cid);
// Colour-space entry: an indexed palette when the image has a colormap,
// otherwise DeviceGray or DeviceRGB.
714 std::stringstream colorspace;
716 colorspace.imbue(std::locale::classic());
717 if (cid->ncolors > 0) {
719 <<
" /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1)
720 <<
" " << cid->cmapdatahex <<
" ]\n";
724 if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
725 colorspace.str(
" /ColorSpace /DeviceGray\n"
728 colorspace.str(
" /ColorSpace /DeviceGray\n");
732 colorspace.str(
" /ColorSpace /DeviceRGB\n");
735 l_CIDataDestroy(&cid);
// /Predictor value written to the dictionary: 14 when the compressed data
// reports a predictor, otherwise 1.
740 int predictor = (cid->predictor) ? 14 : 1;
// b1: first part of the object text, up to the colour-space entry.
743 std::stringstream b1;
745 b1.imbue(std::locale::classic());
749 " /Length " << cid->nbytescomp <<
"\n"
750 " /Subtype /Image\n";
// b2: image dictionary entries that follow the colour-space entry.
752 std::stringstream b2;
754 b2.imbue(std::locale::classic());
756 " /Width " << cid->w <<
"\n"
757 " /Height " << cid->h <<
"\n"
758 " /BitsPerComponent " << cid->bps <<
"\n"
759 " /Filter " << filter <<
"\n"
762 " /Predictor " << predictor <<
"\n"
763 " /Colors " << cid->spp <<
"\n" << group4 <<
764 " /Columns " << cid->w <<
"\n"
765 " /BitsPerComponent " << cid->bps <<
"\n"
// Total object size is the sum of the text pieces and the compressed data.
774 size_t b1_len = b1.str().size();
775 size_t b2_len = b2.str().size();
776 size_t b3_len = strlen(b3);
777 size_t colorspace_len = colorspace.str().size();
780 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
781 *pdf_object =
new char[*pdf_object_size];
// Concatenate the pieces with memcpy: the image data may contain NUL bytes.
783 char *p = *pdf_object;
784 memcpy(p, b1.str().c_str(), b1_len);
786 memcpy(p, colorspace.str().c_str(), colorspace_len);
788 memcpy(p, b2.str().c_str(), b2_len);
790 memcpy(p, cid->datacomp, cid->nbytescomp);
791 p += cid->nbytescomp;
792 memcpy(p, b3, b3_len);
// The compressed data has been copied out; release it.
793 l_CIDataDestroy(&cid);
801 if (!pix || ppi <= 0)
803 double width = pixGetWidth(pix) * 72.0 / ppi;
804 double height = pixGetHeight(pix) * 72.0 / ppi;
806 std::stringstream xobject;
808 xobject.imbue(std::locale::classic());
810 xobject <<
"/XObject << /Im1 " << (obj_ + 2) <<
" 0 R >>\n";
814 std::stringstream stream;
816 stream.imbue(std::locale::classic());
818 stream << std::fixed <<
823 " /MediaBox [0 0 " << width <<
" " << height <<
"]\n"
824 " /Contents " << (obj_ + 1) <<
" 0 R\n"
827 " " << xobject.str() <<
828 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
829 " /Font << /f-0-0 3 0 R >>\n"
834 AppendPDFObject(stream.str().c_str());
837 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
838 const size_t pdftext_len = strlen(pdftext.get());
840 unsigned char *comp_pdftext = zlibCompress(
841 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
842 long comp_pdftext_len = len;
847 " /Length " << comp_pdftext_len <<
" /Filter /FlateDecode\n"
851 long objsize = stream.str().size();
852 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
853 objsize += comp_pdftext_len;
854 lept_free(comp_pdftext);
859 objsize += strlen(b2);
860 AppendPDFObjectDIY(objsize);
863 char *pdf_object =
nullptr;
866 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
871 AppendPDFObjectDIY(objsize);
886 const long int kPagesObjectNumber = 2;
887 offsets_[kPagesObjectNumber] = offsets_.
back();
888 std::stringstream stream;
890 stream.imbue(std::locale::classic());
891 stream << kPagesObjectNumber <<
" 0 obj\n<<\n /Type /Pages\n /Kids [ ";
893 size_t pages_objsize = stream.str().size();
896 stream << pages_[i] <<
" 0 R ";
898 pages_objsize += stream.str().size();
901 stream <<
"]\n /Count " << pages_.
size() <<
"\n>>\nendobj\n";
903 pages_objsize += stream.str().size();
904 offsets_.
back() += pages_objsize;
907 STRING utf16_title =
"FEFF";
909 char utf16[kMaxBytesPerCodepoint];
910 for (
char32 code : unicodes) {
911 if (CodepointToUtf16be(code, utf16)) {
912 utf16_title += utf16;
916 char* datestr = l_getFormattedDate();
919 << obj_ <<
" 0 obj\n"
922 " /CreationDate (D:" << datestr <<
")\n"
923 " /Title <" << utf16_title.
c_str() <<
">\n"
927 AppendPDFObject(stream.str().c_str());
929 stream <<
"xref\n0 " << obj_ <<
"\n0000000000 65535 f \n";
931 for (
int i = 1; i < obj_; i++) {
935 stream << offsets_[i] <<
" 00000 n \n";
940 <<
"trailer\n<<\n /Size " << obj_ <<
"\n"
942 " /Info " << (obj_ - 1) <<
" 0 R\n"
943 ">>\nstartxref\n" << offsets_.
back() <<
"\n%%EOF\n";