20 #include "config_auto.h" 24 #include "allheaders.h" 169 static const int kBasicBufSize = 2048;
172 static const int kCharWidth = 2;
177 static const int kMaxBytesPerCodepoint = 20;
187 textonly_ = textonly;
191 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
196 void TessPDFRenderer::AppendPDFObject(
const char *data) {
197 AppendPDFObjectDIY(strlen(data));
204 static double prec(
double x) {
205 double kPrecision = 1000.0;
206 double a = round(x * kPrecision) / kPrecision;
212 static long dist2(
int x1,
int y1,
int x2,
int y2) {
213 return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
224 static void GetWordBaseline(
int writing_direction,
int ppi,
int height,
225 int word_x1,
int word_y1,
int word_x2,
int word_y2,
226 int line_x1,
int line_y1,
int line_x2,
int line_y2,
227 double *x0,
double *y0,
double *length) {
229 Swap(&word_x1, &word_x2);
230 Swap(&word_y1, &word_y2);
237 double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
242 double t = ((px - line_x2) * (line_x2 - line_x1) +
243 (py - line_y2) * (line_y2 - line_y1)) / l2;
244 x = line_x2 + t * (line_x2 - line_x1);
245 y = line_y2 + t * (line_y2 - line_y1);
247 word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
249 word_length = word_length * 72.0 / ppi;
251 y = height - (y * 72.0 / ppi);
255 *length = word_length;
266 static void AffineMatrix(
int writing_direction,
267 int line_x1,
int line_y1,
int line_x2,
int line_y2,
268 double *a,
double *b,
double *c,
double *d) {
269 double theta = atan2(static_cast<double>(line_y1 - line_y2),
270 static_cast<double>(line_x2 - line_x1));
275 switch(writing_direction) {
295 static void ClipBaseline(
int ppi,
int x1,
int y1,
int x2,
int y2,
296 int *line_x1,
int *line_y1,
297 int *line_x2,
int *line_y2) {
302 int rise = abs(y2 - y1) * 72;
303 int run = abs(x2 - x1) * 72;
304 if (rise < 2 * ppi && 2 * ppi < run)
305 *line_y1 = *line_y2 = (y1 + y2) / 2;
308 static bool CodepointToUtf16be(
int code,
char utf16[kMaxBytesPerCodepoint]) {
309 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
310 tprintf(
"Dropping invalid codepoint %d\n", code);
313 if (code < 0x10000) {
314 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
316 int a = code - 0x010000;
317 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
318 int low_surrogate = (0x03FF & a) + 0xDC00;
319 snprintf(utf16, kMaxBytesPerCodepoint,
320 "%04X%04X", high_surrogate, low_surrogate);
325 char* TessPDFRenderer::GetPDFTextObjects(
TessBaseAPI* api,
326 double width,
double height) {
328 double ppi = api->GetSourceYResolution();
331 double old_x = 0.0, old_y = 0.0;
332 int old_fontsize = 0;
335 bool new_block =
true;
346 pdf_str.add_str_double(
"", prec(width));
348 pdf_str.add_str_double(
"", prec(height));
349 pdf_str +=
" 0 0 cm";
351 pdf_str +=
" /Im1 Do";
360 ResultIterator *res_it = api->GetIterator();
362 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
363 pdf_str +=
"BT\n3 Tr";
371 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
385 res_it->Orientation(&orientation, &writing_direction,
386 &textline_order, &deskew_angle);
388 switch (res_it->WordDirection()) {
396 writing_direction = old_writing_direction;
402 double x, y, word_length;
404 int word_x1, word_y1, word_x2, word_y2;
405 res_it->Baseline(
RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
406 GetWordBaseline(writing_direction, ppi, height,
407 word_x1, word_y1, word_x2, word_y2,
408 line_x1, line_y1, line_x2, line_y2,
409 &x, &y, &word_length);
412 if (writing_direction != old_writing_direction || new_block) {
413 AffineMatrix(writing_direction,
414 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
415 pdf_str.add_str_double(
" ", prec(a));
416 pdf_str.add_str_double(
" ", prec(b));
417 pdf_str.add_str_double(
" ", prec(c));
418 pdf_str.add_str_double(
" ", prec(d));
419 pdf_str.add_str_double(
" ", prec(x));
420 pdf_str.add_str_double(
" ", prec(y));
424 double dx = x - old_x;
425 double dy = y - old_y;
426 pdf_str.add_str_double(
" ", prec(dx * a + dy * b));
427 pdf_str.add_str_double(
" ", prec(dx * c + dy * d));
432 old_writing_direction = writing_direction;
439 bool bold, italic, underlined, monospace, serif, smallcaps;
441 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
442 &serif, &smallcaps, &fontsize, &font_id);
443 const int kDefaultFontsize = 8;
445 fontsize = kDefaultFontsize;
446 if (fontsize != old_fontsize) {
448 snprintf(textfont,
sizeof(textfont),
"/f-0-0 %d Tf ", fontsize);
450 old_fontsize = fontsize;
457 int pdf_word_len = 0;
459 const std::unique_ptr<const char[]> grapheme(
461 if (grapheme && grapheme[0] !=
'\0') {
463 char utf16[kMaxBytesPerCodepoint];
464 for (
char32 code : unicodes) {
465 if (CodepointToUtf16be(code, utf16)) {
473 if (word_length > 0 && pdf_word_len > 0) {
475 kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
476 pdf_str.add_str_double(
"", h_stretch);
482 if (last_word_in_line) {
485 if (last_word_in_block) {
489 char *ret =
new char[pdf_str.length() + 1];
490 strcpy(ret, pdf_str.string());
496 char buf[kBasicBufSize];
499 n = snprintf(buf,
sizeof(buf),
502 0xDE, 0xAD, 0xBE, 0xEB);
503 if (n >=
sizeof(buf))
return false;
504 AppendPDFObject(buf);
507 n = snprintf(buf,
sizeof(buf),
515 if (n >=
sizeof(buf))
return false;
516 AppendPDFObject(buf);
524 n = snprintf(buf,
sizeof(buf),
527 " /BaseFont /GlyphLessFont\n" 528 " /DescendantFonts [ %ld 0 R ]\n" 529 " /Encoding /Identity-H\n" 531 " /ToUnicode %ld 0 R\n" 538 if (n >=
sizeof(buf))
return false;
539 AppendPDFObject(buf);
542 n = snprintf(buf,
sizeof(buf),
545 " /BaseFont /GlyphLessFont\n" 546 " /CIDToGIDMap %ld 0 R\n" 549 " /Ordering (Identity)\n" 550 " /Registry (Adobe)\n" 553 " /FontDescriptor %ld 0 R\n" 554 " /Subtype /CIDFontType2\n" 562 if (n >=
sizeof(buf))
return false;
563 AppendPDFObject(buf);
566 const int kCIDToGIDMapSize = 2 * (1 << 16);
567 const std::unique_ptr<unsigned char[]> cidtogidmap(
568 new unsigned char[kCIDToGIDMapSize]);
569 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
570 cidtogidmap[i] = (i % 2) ? 1 : 0;
573 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
574 n = snprintf(buf,
sizeof(buf),
577 " /Length %lu /Filter /FlateDecode\n" 581 if (n >=
sizeof(buf)) {
586 long objsize = strlen(buf);
587 AppendData(reinterpret_cast<char *>(comp), len);
590 const char *endstream_endobj =
594 objsize += strlen(endstream_endobj);
595 AppendPDFObjectDIY(objsize);
598 "/CIDInit /ProcSet findresource begin\n" 603 " /Registry (Adobe)\n" 607 "/CMapName /Adobe-Identify-UCS def\n" 609 "1 begincodespacerange\n" 611 "endcodespacerange\n" 613 "<0000> <FFFF> <0000>\n" 616 "CMapName currentdict /CMap defineresource pop\n" 621 n = snprintf(buf,
sizeof(buf),
623 "<< /Length %lu >>\n" 627 "endobj\n", (
unsigned long) strlen(stream), stream);
628 if (n >=
sizeof(buf))
return false;
629 AppendPDFObject(buf);
632 n = snprintf(buf,
sizeof(buf),
639 " /FontBBox [ 0 0 %d %d ]\n" 640 " /FontFile2 %ld 0 R\n" 641 " /FontName /GlyphLessFont\n" 644 " /Type /FontDescriptor\n" 653 if (n >=
sizeof(buf))
return false;
654 AppendPDFObject(buf);
656 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_.c_str());
657 if (n >=
sizeof(buf))
return false;
658 FILE *fp = fopen(buf,
"rb");
660 tprintf(
"Can not open file \"%s\"!\n", buf);
663 fseek(fp, 0, SEEK_END);
664 long int size = ftell(fp);
669 fseek(fp, 0, SEEK_SET);
670 const std::unique_ptr<char[]> buffer(
new char[size]);
677 n = snprintf(buf,
sizeof(buf),
683 "stream\n", size, size);
684 if (n >=
sizeof(buf)) {
688 objsize = strlen(buf);
692 objsize += strlen(endstream_endobj);
693 AppendPDFObjectDIY(objsize);
697 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
698 const char* filename,
701 long int* pdf_object_size,
702 const int jpg_quality) {
704 char b0[kBasicBufSize];
705 char b1[kBasicBufSize];
706 char b2[kBasicBufSize];
707 if (!pdf_object_size || !pdf_object)
709 *pdf_object =
nullptr;
710 *pdf_object_size = 0;
711 if (!filename && !pix)
714 L_Compressed_Data *cid =
nullptr;
717 if (pixGetInputFormat(pix) == IFF_PNG)
718 sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
720 sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
724 l_CIDataDestroy(&cid);
728 const char *group4 =
"";
732 filter =
"/FlateDecode";
735 filter =
"/DCTDecode";
738 filter =
"/CCITTFaxDecode";
742 filter =
"/JPXDecode";
745 l_CIDataDestroy(&cid);
752 const char *colorspace;
753 if (cid->ncolors > 0) {
754 n = snprintf(b0,
sizeof(b0),
755 " /ColorSpace [ /Indexed /DeviceRGB %d %s ]\n",
756 cid->ncolors - 1, cid->cmapdatahex);
757 if (n >=
sizeof(b0)) {
758 l_CIDataDestroy(&cid);
765 colorspace =
" /ColorSpace /DeviceGray\n";
768 colorspace =
" /ColorSpace /DeviceRGB\n";
771 l_CIDataDestroy(&cid);
776 int predictor = (cid->predictor) ? 14 : 1;
779 n = snprintf(b1,
sizeof(b1),
783 " /Subtype /Image\n",
784 objnum, (
unsigned long) cid->nbytescomp);
785 if (n >=
sizeof(b1)) {
786 l_CIDataDestroy(&cid);
790 n = snprintf(b2,
sizeof(b2),
793 " /BitsPerComponent %d\n" 801 " /BitsPerComponent %d\n" 805 cid->w, cid->h, cid->bps, filter, predictor, cid->spp,
806 group4, cid->w, cid->bps);
807 if (n >=
sizeof(b2)) {
808 l_CIDataDestroy(&cid);
816 size_t b1_len = strlen(b1);
817 size_t b2_len = strlen(b2);
818 size_t b3_len = strlen(b3);
819 size_t colorspace_len = strlen(colorspace);
822 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
823 *pdf_object =
new char[*pdf_object_size];
825 char *p = *pdf_object;
826 memcpy(p, b1, b1_len);
828 memcpy(p, colorspace, colorspace_len);
830 memcpy(p, b2, b2_len);
832 memcpy(p, cid->datacomp, cid->nbytescomp);
833 p += cid->nbytescomp;
834 memcpy(p, b3, b3_len);
835 l_CIDataDestroy(&cid);
841 char buf[kBasicBufSize];
842 char buf2[kBasicBufSize];
846 if (!pix || ppi <= 0)
848 double width = pixGetWidth(pix) * 72.0 / ppi;
849 double height = pixGetHeight(pix) * 72.0 / ppi;
851 snprintf(buf2,
sizeof(buf2),
"/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
852 const char *xobject = (textonly_) ?
"" : buf2;
855 n = snprintf(buf,
sizeof(buf),
860 " /MediaBox [0 0 %.2f %.2f]\n" 861 " /Contents %ld 0 R\n" 865 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 866 " /Font << /f-0-0 %ld 0 R >>\n" 876 if (n >=
sizeof(buf))
return false;
878 AppendPDFObject(buf);
881 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
882 const size_t pdftext_len = strlen(pdftext.get());
884 unsigned char *comp_pdftext = zlibCompress(
885 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
886 long comp_pdftext_len = len;
887 n = snprintf(buf,
sizeof(buf),
890 " /Length %ld /Filter /FlateDecode\n" 892 "stream\n", obj_, comp_pdftext_len);
893 if (n >=
sizeof(buf)) {
894 lept_free(comp_pdftext);
898 long objsize = strlen(buf);
899 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
900 objsize += comp_pdftext_len;
901 lept_free(comp_pdftext);
906 objsize += strlen(b2);
907 AppendPDFObjectDIY(objsize);
910 char *pdf_object =
nullptr;
913 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
918 AppendPDFObjectDIY(objsize);
927 char buf[kBasicBufSize];
936 const long int kPagesObjectNumber = 2;
937 offsets_[kPagesObjectNumber] = offsets_.
back();
938 n = snprintf(buf,
sizeof(buf),
942 " /Kids [ ", kPagesObjectNumber);
943 if (n >=
sizeof(buf))
return false;
945 size_t pages_objsize = strlen(buf);
947 n = snprintf(buf,
sizeof(buf),
948 "%ld 0 R ", pages_[i]);
949 if (n >=
sizeof(buf))
return false;
951 pages_objsize += strlen(buf);
953 n = snprintf(buf,
sizeof(buf),
957 "endobj\n", pages_.
size());
958 if (n >=
sizeof(buf))
return false;
960 pages_objsize += strlen(buf);
961 offsets_.
back() += pages_objsize;
964 STRING utf16_title =
"FEFF";
966 char utf16[kMaxBytesPerCodepoint];
967 for (
char32 code : unicodes) {
968 if (CodepointToUtf16be(code, utf16)) {
969 utf16_title += utf16;
973 char* datestr = l_getFormattedDate();
974 n = snprintf(buf,
sizeof(buf),
977 " /Producer (Tesseract %s)\n" 978 " /CreationDate (D:%s)\n" 983 datestr, utf16_title.
c_str());
985 if (n >=
sizeof(buf))
return false;
986 AppendPDFObject(buf);
987 n = snprintf(buf,
sizeof(buf),
990 "0000000000 65535 f \n", obj_);
991 if (n >=
sizeof(buf))
return false;
993 for (
int i = 1; i < obj_; i++) {
994 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
995 if (n >=
sizeof(buf))
return false;
998 n = snprintf(buf,
sizeof(buf),
1012 if (n >=
sizeof(buf))
return false;
virtual bool AddImageHandler(TessBaseAPI *api)
virtual bool BeginDocumentHandler()
struct TessBaseAPI TessBaseAPI
const char * c_str() const
void AppendData(const char *s, int len)
TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly=false)
virtual bool EndDocumentHandler()
size_t unsigned_size() const
bool GetIntVariable(const char *name, int *value) const
DLLSYM void tprintf(const char *format,...)
int GetSourceYResolution()
const char * title() const
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
void AppendString(const char *s)
static const char * Version()
const char * GetInputName()
bool DeSerialize(FILE *fp, char *data, size_t n)