tesseract  4.0.0-1-g2a2b
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy ()
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 189 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 182 of file pdfrenderer.cpp.

184  : TessResultRenderer(outputbase, "pdf"),
185  datadir_(datadir) {
186  obj_ = 0;
187  textonly_ = textonly;
188  offsets_.push_back(0);
189 }
int push_back(T object)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:33

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api)
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 839 of file pdfrenderer.cpp.

839  {
840  size_t n;
841  char buf[kBasicBufSize];
842  char buf2[kBasicBufSize];
843  Pix *pix = api->GetInputImage();
844  const char* filename = api->GetInputName();
845  int ppi = api->GetSourceYResolution();
846  if (!pix || ppi <= 0)
847  return false;
848  double width = pixGetWidth(pix) * 72.0 / ppi;
849  double height = pixGetHeight(pix) * 72.0 / ppi;
850 
851  snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
852  const char *xobject = (textonly_) ? "" : buf2;
853 
854  // PAGE
855  n = snprintf(buf, sizeof(buf),
856  "%ld 0 obj\n"
857  "<<\n"
858  " /Type /Page\n"
859  " /Parent %ld 0 R\n"
860  " /MediaBox [0 0 %.2f %.2f]\n"
861  " /Contents %ld 0 R\n"
862  " /Resources\n"
863  " <<\n"
864  " %s"
865  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
866  " /Font << /f-0-0 %ld 0 R >>\n"
867  " >>\n"
868  ">>\n"
869  "endobj\n",
870  obj_,
871  2L, // Pages object
872  width, height,
873  obj_ + 1, // Contents object
874  xobject, // Image object
875  3L); // Type0 Font
876  if (n >= sizeof(buf)) return false;
877  pages_.push_back(obj_);
878  AppendPDFObject(buf);
879 
880  // CONTENTS
881  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
882  const size_t pdftext_len = strlen(pdftext.get());
883  size_t len;
884  unsigned char *comp_pdftext = zlibCompress(
885  reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
886  long comp_pdftext_len = len;
887  n = snprintf(buf, sizeof(buf),
888  "%ld 0 obj\n"
889  "<<\n"
890  " /Length %ld /Filter /FlateDecode\n"
891  ">>\n"
892  "stream\n", obj_, comp_pdftext_len);
893  if (n >= sizeof(buf)) {
894  lept_free(comp_pdftext);
895  return false;
896  }
897  AppendString(buf);
898  long objsize = strlen(buf);
899  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
900  objsize += comp_pdftext_len;
901  lept_free(comp_pdftext);
902  const char *b2 =
903  "endstream\n"
904  "endobj\n";
905  AppendString(b2);
906  objsize += strlen(b2);
907  AppendPDFObjectDIY(objsize);
908 
909  if (!textonly_) {
910  char *pdf_object = nullptr;
911  int jpg_quality;
912  api->GetIntVariable("jpg_quality", &jpg_quality);
913  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
914  jpg_quality)) {
915  return false;
916  }
917  AppendData(pdf_object, objsize);
918  AppendPDFObjectDIY(objsize);
919  delete[] pdf_object;
920  }
921  return true;
922 }
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
int push_back(T object)
void AppendString(const char *s)
Definition: renderer.cpp:102

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 495 of file pdfrenderer.cpp.

495  {
496  char buf[kBasicBufSize];
497  size_t n;
498 
499  n = snprintf(buf, sizeof(buf),
500  "%%PDF-1.5\n"
501  "%%%c%c%c%c\n",
502  0xDE, 0xAD, 0xBE, 0xEB);
503  if (n >= sizeof(buf)) return false;
504  AppendPDFObject(buf);
505 
506  // CATALOG
507  n = snprintf(buf, sizeof(buf),
508  "1 0 obj\n"
509  "<<\n"
510  " /Type /Catalog\n"
511  " /Pages %ld 0 R\n"
512  ">>\n"
513  "endobj\n",
514  2L);
515  if (n >= sizeof(buf)) return false;
516  AppendPDFObject(buf);
517 
518  // We are reserving object #2 for the /Pages
519  // object, which I am going to create and write
520  // at the end of the PDF file.
521  AppendPDFObject("");
522 
523  // TYPE0 FONT
524  n = snprintf(buf, sizeof(buf),
525  "3 0 obj\n"
526  "<<\n"
527  " /BaseFont /GlyphLessFont\n"
528  " /DescendantFonts [ %ld 0 R ]\n"
529  " /Encoding /Identity-H\n"
530  " /Subtype /Type0\n"
531  " /ToUnicode %ld 0 R\n"
532  " /Type /Font\n"
533  ">>\n"
534  "endobj\n",
535  4L, // CIDFontType2 font
536  6L // ToUnicode
537  );
538  if (n >= sizeof(buf)) return false;
539  AppendPDFObject(buf);
540 
541  // CIDFONTTYPE2
542  n = snprintf(buf, sizeof(buf),
543  "4 0 obj\n"
544  "<<\n"
545  " /BaseFont /GlyphLessFont\n"
546  " /CIDToGIDMap %ld 0 R\n"
547  " /CIDSystemInfo\n"
548  " <<\n"
549  " /Ordering (Identity)\n"
550  " /Registry (Adobe)\n"
551  " /Supplement 0\n"
552  " >>\n"
553  " /FontDescriptor %ld 0 R\n"
554  " /Subtype /CIDFontType2\n"
555  " /Type /Font\n"
556  " /DW %d\n"
557  ">>\n"
558  "endobj\n",
559  5L, // CIDToGIDMap
560  7L, // Font descriptor
561  1000 / kCharWidth);
562  if (n >= sizeof(buf)) return false;
563  AppendPDFObject(buf);
564 
565  // CIDTOGIDMAP
566  const int kCIDToGIDMapSize = 2 * (1 << 16);
567  const std::unique_ptr<unsigned char[]> cidtogidmap(
568  new unsigned char[kCIDToGIDMapSize]);
569  for (int i = 0; i < kCIDToGIDMapSize; i++) {
570  cidtogidmap[i] = (i % 2) ? 1 : 0;
571  }
572  size_t len;
573  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
574  n = snprintf(buf, sizeof(buf),
575  "5 0 obj\n"
576  "<<\n"
577  " /Length %lu /Filter /FlateDecode\n"
578  ">>\n"
579  "stream\n",
580  (unsigned long)len);
581  if (n >= sizeof(buf)) {
582  lept_free(comp);
583  return false;
584  }
585  AppendString(buf);
586  long objsize = strlen(buf);
587  AppendData(reinterpret_cast<char *>(comp), len);
588  objsize += len;
589  lept_free(comp);
590  const char *endstream_endobj =
591  "endstream\n"
592  "endobj\n";
593  AppendString(endstream_endobj);
594  objsize += strlen(endstream_endobj);
595  AppendPDFObjectDIY(objsize);
596 
597  const char *stream =
598  "/CIDInit /ProcSet findresource begin\n"
599  "12 dict begin\n"
600  "begincmap\n"
601  "/CIDSystemInfo\n"
602  "<<\n"
603  " /Registry (Adobe)\n"
604  " /Ordering (UCS)\n"
605  " /Supplement 0\n"
606  ">> def\n"
607  "/CMapName /Adobe-Identify-UCS def\n"
608  "/CMapType 2 def\n"
609  "1 begincodespacerange\n"
610  "<0000> <FFFF>\n"
611  "endcodespacerange\n"
612  "1 beginbfrange\n"
613  "<0000> <FFFF> <0000>\n"
614  "endbfrange\n"
615  "endcmap\n"
616  "CMapName currentdict /CMap defineresource pop\n"
617  "end\n"
618  "end\n";
619 
620  // TOUNICODE
621  n = snprintf(buf, sizeof(buf),
622  "6 0 obj\n"
623  "<< /Length %lu >>\n"
624  "stream\n"
625  "%s"
626  "endstream\n"
627  "endobj\n", (unsigned long) strlen(stream), stream);
628  if (n >= sizeof(buf)) return false;
629  AppendPDFObject(buf);
630 
631  // FONT DESCRIPTOR
632  n = snprintf(buf, sizeof(buf),
633  "7 0 obj\n"
634  "<<\n"
635  " /Ascent %d\n"
636  " /CapHeight %d\n"
637  " /Descent -1\n" // Spec says must be negative
638  " /Flags 5\n" // FixedPitch + Symbolic
639  " /FontBBox [ 0 0 %d %d ]\n"
640  " /FontFile2 %ld 0 R\n"
641  " /FontName /GlyphLessFont\n"
642  " /ItalicAngle 0\n"
643  " /StemV 80\n"
644  " /Type /FontDescriptor\n"
645  ">>\n"
646  "endobj\n",
647  1000,
648  1000,
649  1000 / kCharWidth,
650  1000,
651  8L // Font data
652  );
653  if (n >= sizeof(buf)) return false;
654  AppendPDFObject(buf);
655 
656  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_.c_str());
657  if (n >= sizeof(buf)) return false;
658  FILE *fp = fopen(buf, "rb");
659  if (!fp) {
660  tprintf("Can not open file \"%s\"!\n", buf);
661  return false;
662  }
663  fseek(fp, 0, SEEK_END);
664  long int size = ftell(fp);
665  if (size < 0) {
666  fclose(fp);
667  return false;
668  }
669  fseek(fp, 0, SEEK_SET);
670  const std::unique_ptr<char[]> buffer(new char[size]);
671  if (!tesseract::DeSerialize(fp, buffer.get(), size)) {
672  fclose(fp);
673  return false;
674  }
675  fclose(fp);
676  // FONTFILE2
677  n = snprintf(buf, sizeof(buf),
678  "8 0 obj\n"
679  "<<\n"
680  " /Length %ld\n"
681  " /Length1 %ld\n"
682  ">>\n"
683  "stream\n", size, size);
684  if (n >= sizeof(buf)) {
685  return false;
686  }
687  AppendString(buf);
688  objsize = strlen(buf);
689  AppendData(buffer.get(), size);
690  objsize += size;
691  AppendString(endstream_endobj);
692  objsize += strlen(endstream_endobj);
693  AppendPDFObjectDIY(objsize);
694  return true;
695 }
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void AppendString(const char *s)
Definition: renderer.cpp:102
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 925 of file pdfrenderer.cpp.

925  {
926  size_t n;
927  char buf[kBasicBufSize];
928 
929  // We reserved the /Pages object number early, so that the /Page
930  // objects could refer to their parent. We finally have enough
931  // information to go fill it in. Using lower level calls to manipulate
932  // the offset record in two spots, because we are placing objects
933  // out of order in the file.
934 
935  // PAGES
936  const long int kPagesObjectNumber = 2;
937  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
938  n = snprintf(buf, sizeof(buf),
939  "%ld 0 obj\n"
940  "<<\n"
941  " /Type /Pages\n"
942  " /Kids [ ", kPagesObjectNumber);
943  if (n >= sizeof(buf)) return false;
944  AppendString(buf);
945  size_t pages_objsize = strlen(buf);
946  for (size_t i = 0; i < pages_.unsigned_size(); i++) {
947  n = snprintf(buf, sizeof(buf),
948  "%ld 0 R ", pages_[i]);
949  if (n >= sizeof(buf)) return false;
950  AppendString(buf);
951  pages_objsize += strlen(buf);
952  }
953  n = snprintf(buf, sizeof(buf),
954  "]\n"
955  " /Count %d\n"
956  ">>\n"
957  "endobj\n", pages_.size());
958  if (n >= sizeof(buf)) return false;
959  AppendString(buf);
960  pages_objsize += strlen(buf);
961  offsets_.back() += pages_objsize; // manipulation #2
962 
963  // INFO
964  STRING utf16_title = "FEFF"; // byte_order_marker
965  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
966  char utf16[kMaxBytesPerCodepoint];
967  for (char32 code : unicodes) {
968  if (CodepointToUtf16be(code, utf16)) {
969  utf16_title += utf16;
970  }
971  }
972 
973  char* datestr = l_getFormattedDate();
974  n = snprintf(buf, sizeof(buf),
975  "%ld 0 obj\n"
976  "<<\n"
977  " /Producer (Tesseract %s)\n"
978  " /CreationDate (D:%s)\n"
979  " /Title <%s>\n"
980  ">>\n"
981  "endobj\n",
982  obj_, tesseract::TessBaseAPI::Version(),
983  datestr, utf16_title.c_str());
984  lept_free(datestr);
985  if (n >= sizeof(buf)) return false;
986  AppendPDFObject(buf);
987  n = snprintf(buf, sizeof(buf),
988  "xref\n"
989  "0 %ld\n"
990  "0000000000 65535 f \n", obj_);
991  if (n >= sizeof(buf)) return false;
992  AppendString(buf);
993  for (int i = 1; i < obj_; i++) {
994  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
995  if (n >= sizeof(buf)) return false;
996  AppendString(buf);
997  }
998  n = snprintf(buf, sizeof(buf),
999  "trailer\n"
1000  "<<\n"
1001  " /Size %ld\n"
1002  " /Root %ld 0 R\n"
1003  " /Info %ld 0 R\n"
1004  ">>\n"
1005  "startxref\n"
1006  "%ld\n"
1007  "%%%%EOF\n",
1008  obj_,
1009  1L, // catalog
1010  obj_ - 1, // info
1011  offsets_.back());
1012  if (n >= sizeof(buf)) return false;
1013  AppendString(buf);
1014  return true;
1015 }
signed int char32
int size() const
Definition: genericvector.h:71
T & back() const
const char * c_str() const
Definition: strngs.cpp:207
size_t unsigned_size() const
Definition: genericvector.h:75
const char * title() const
Definition: renderer.h:81
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213
void AppendString(const char *s)
Definition: renderer.cpp:102
Definition: strngs.h:45
static const char * Version()
Definition: baseapi.cpp:223

The documentation for this class was generated from the following files: