tesseract  5.0.0-alpha-619-ge9db
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy ()
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 215 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 178 of file pdfrenderer.cpp.

181  : TessResultRenderer(outputbase, "pdf"),
182  datadir_(datadir) {
183  obj_ = 0;
184  textonly_ = textonly;
185  offsets_.push_back(0);

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 796 of file pdfrenderer.cpp.

797  {
798  Pix *pix = api->GetInputImage();
799  const char* filename = api->GetInputName();
800  int ppi = api->GetSourceYResolution();
801  if (!pix || ppi <= 0)
802  return false;
803  double width = pixGetWidth(pix) * 72.0 / ppi;
804  double height = pixGetHeight(pix) * 72.0 / ppi;
805 
806  std::stringstream xobject;
807  // Use "C" locale (needed for int values larger than 999).
808  xobject.imbue(std::locale::classic());
809  if (!textonly_) {
810  xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
811  }
812 
813  // PAGE
814  std::stringstream stream;
815  // Use "C" locale (needed for double values width and height).
816  stream.imbue(std::locale::classic());
817  stream.precision(2);
818  stream << std::fixed <<
819  obj_ << " 0 obj\n"
820  "<<\n"
821  " /Type /Page\n"
822  " /Parent 2 0 R\n" // Pages object
823  " /MediaBox [0 0 " << width << " " << height << "]\n"
824  " /Contents " << (obj_ + 1) << " 0 R\n" // Contents object
825  " /Resources\n"
826  " <<\n"
827  " " << xobject.str() << // Image object
828  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
829  " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
830  " >>\n"
831  ">>\n"
832  "endobj\n";
833  pages_.push_back(obj_);
834  AppendPDFObject(stream.str().c_str());
835 
836  // CONTENTS
837  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
838  const size_t pdftext_len = strlen(pdftext.get());
839  size_t len;
840  unsigned char *comp_pdftext = zlibCompress(
841  reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
842  long comp_pdftext_len = len;
843  stream.str("");
844  stream <<
845  obj_ << " 0 obj\n"
846  "<<\n"
847  " /Length " << comp_pdftext_len << " /Filter /FlateDecode\n"
848  ">>\n"
849  "stream\n";
850  AppendString(stream.str().c_str());
851  long objsize = stream.str().size();
852  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
853  objsize += comp_pdftext_len;
854  lept_free(comp_pdftext);
855  const char *b2 =
856  "endstream\n"
857  "endobj\n";
858  AppendString(b2);
859  objsize += strlen(b2);
860  AppendPDFObjectDIY(objsize);
861 
862  if (!textonly_) {
863  char *pdf_object = nullptr;
864  int jpg_quality;
865  api->GetIntVariable("jpg_quality", &jpg_quality);
866  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
867  jpg_quality)) {
868  return false;
869  }
870  AppendData(pdf_object, objsize);
871  AppendPDFObjectDIY(objsize);
872  delete[] pdf_object;
873  }
874  return true;

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 493 of file pdfrenderer.cpp.

494  {
495  AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
496 
497  // CATALOG
498  AppendPDFObject("1 0 obj\n"
499  "<<\n"
500  " /Type /Catalog\n"
501  " /Pages 2 0 R\n"
502  ">>\nendobj\n");
503 
504  // We are reserving object #2 for the /Pages
505  // object, which I am going to create and write
506  // at the end of the PDF file.
507  AppendPDFObject("");
508 
509  // TYPE0 FONT
510  AppendPDFObject("3 0 obj\n"
511  "<<\n"
512  " /BaseFont /GlyphLessFont\n"
513  " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
514  " /Encoding /Identity-H\n"
515  " /Subtype /Type0\n"
516  " /ToUnicode 6 0 R\n" // ToUnicode
517  " /Type /Font\n"
518  ">>\n"
519  "endobj\n");
520 
521  // CIDFONTTYPE2
522  std::stringstream stream;
523  // Use "C" locale (needed for int values larger than 999).
524  stream.imbue(std::locale::classic());
525  stream <<
526  "4 0 obj\n"
527  "<<\n"
528  " /BaseFont /GlyphLessFont\n"
529  " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
530  " /CIDSystemInfo\n"
531  " <<\n"
532  " /Ordering (Identity)\n"
533  " /Registry (Adobe)\n"
534  " /Supplement 0\n"
535  " >>\n"
536  " /FontDescriptor 7 0 R\n" // Font descriptor
537  " /Subtype /CIDFontType2\n"
538  " /Type /Font\n"
539  " /DW " << (1000 / kCharWidth) << "\n"
540  ">>\n"
541  "endobj\n";
542  AppendPDFObject(stream.str().c_str());
543 
544  // CIDTOGIDMAP
545  const int kCIDToGIDMapSize = 2 * (1 << 16);
546  const std::unique_ptr<unsigned char[]> cidtogidmap(
547  new unsigned char[kCIDToGIDMapSize]);
548  for (int i = 0; i < kCIDToGIDMapSize; i++) {
549  cidtogidmap[i] = (i % 2) ? 1 : 0;
550  }
551  size_t len;
552  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
553  stream.str("");
554  stream <<
555  "5 0 obj\n"
556  "<<\n"
557  " /Length " << len << " /Filter /FlateDecode\n"
558  ">>\n"
559  "stream\n";
560  AppendString(stream.str().c_str());
561  long objsize = stream.str().size();
562  AppendData(reinterpret_cast<char *>(comp), len);
563  objsize += len;
564  lept_free(comp);
565  const char *endstream_endobj =
566  "endstream\n"
567  "endobj\n";
568  AppendString(endstream_endobj);
569  objsize += strlen(endstream_endobj);
570  AppendPDFObjectDIY(objsize);
571 
572  const char stream2[] =
573  "/CIDInit /ProcSet findresource begin\n"
574  "12 dict begin\n"
575  "begincmap\n"
576  "/CIDSystemInfo\n"
577  "<<\n"
578  " /Registry (Adobe)\n"
579  " /Ordering (UCS)\n"
580  " /Supplement 0\n"
581  ">> def\n"
582  "/CMapName /Adobe-Identify-UCS def\n"
583  "/CMapType 2 def\n"
584  "1 begincodespacerange\n"
585  "<0000> <FFFF>\n"
586  "endcodespacerange\n"
587  "1 beginbfrange\n"
588  "<0000> <FFFF> <0000>\n"
589  "endbfrange\n"
590  "endcmap\n"
591  "CMapName currentdict /CMap defineresource pop\n"
592  "end\n"
593  "end\n";
594 
595  // TOUNICODE
596  stream.str("");
597  stream <<
598  "6 0 obj\n"
599  "<< /Length " << (sizeof(stream2) - 1) << " >>\n"
600  "stream\n" << stream2 <<
601  "endstream\n"
602  "endobj\n";
603  AppendPDFObject(stream.str().c_str());
604 
605  // FONT DESCRIPTOR
606  stream.str("");
607  stream <<
608  "7 0 obj\n"
609  "<<\n"
610  " /Ascent 1000\n"
611  " /CapHeight 1000\n"
612  " /Descent -1\n" // Spec says must be negative
613  " /Flags 5\n" // FixedPitch + Symbolic
614  " /FontBBox [ 0 0 " << (1000 / kCharWidth) << " 1000 ]\n"
615  " /FontFile2 8 0 R\n"
616  " /FontName /GlyphLessFont\n"
617  " /ItalicAngle 0\n"
618  " /StemV 80\n"
619  " /Type /FontDescriptor\n"
620  ">>\n"
621  "endobj\n";
622  AppendPDFObject(stream.str().c_str());
623 
624  stream.str("");
625  stream << datadir_.c_str() << "/pdf.ttf";
626  FILE *fp = fopen(stream.str().c_str(), "rb");
627  if (!fp) {
628  tprintf("Cannot open file \"%s\"!\n", stream.str().c_str());
629  return false;
630  }
631  fseek(fp, 0, SEEK_END);
632  auto size = std::ftell(fp);
633  if (size < 0) {
634  fclose(fp);
635  return false;
636  }
637  fseek(fp, 0, SEEK_SET);
638  const std::unique_ptr<char[]> buffer(new char[size]);
639  if (!tesseract::DeSerialize(fp, buffer.get(), size)) {
640  fclose(fp);
641  return false;
642  }
643  fclose(fp);
644  // FONTFILE2
645  stream.str("");
646  stream <<
647  "8 0 obj\n"
648  "<<\n"
649  " /Length " << size << "\n"
650  " /Length1 " << size << "\n"
651  ">>\n"
652  "stream\n";
653  AppendString(stream.str().c_str());
654  objsize = stream.str().size();
655  AppendData(buffer.get(), size);
656  objsize += size;
657  AppendString(endstream_endobj);
658  objsize += strlen(endstream_endobj);
659  AppendPDFObjectDIY(objsize);
660  return true;

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 877 of file pdfrenderer.cpp.

878  {
879  // We reserved the /Pages object number early, so that the /Page
880  // objects could refer to their parent. We finally have enough
881  // information to go fill it in. Using lower level calls to manipulate
882  // the offset record in two spots, because we are placing objects
883  // out of order in the file.
884 
885  // PAGES
886  const long int kPagesObjectNumber = 2;
887  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
888  std::stringstream stream;
889  // Use "C" locale (needed for int values larger than 999).
890  stream.imbue(std::locale::classic());
891  stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
892  AppendString(stream.str().c_str());
893  size_t pages_objsize = stream.str().size();
894  for (size_t i = 0; i < pages_.unsigned_size(); i++) {
895  stream.str("");
896  stream << pages_[i] << " 0 R ";
897  AppendString(stream.str().c_str());
898  pages_objsize += stream.str().size();
899  }
900  stream.str("");
901  stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
902  AppendString(stream.str().c_str());
903  pages_objsize += stream.str().size();
904  offsets_.back() += pages_objsize; // manipulation #2
905 
906  // INFO
907  STRING utf16_title = "FEFF"; // byte_order_marker
908  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
909  char utf16[kMaxBytesPerCodepoint];
910  for (char32 code : unicodes) {
911  if (CodepointToUtf16be(code, utf16)) {
912  utf16_title += utf16;
913  }
914  }
915 
916  char* datestr = l_getFormattedDate();
917  stream.str("");
918  stream
919  << obj_ << " 0 obj\n"
920  "<<\n"
921  " /Producer (Tesseract " << tesseract::TessBaseAPI::Version() << ")\n"
922  " /CreationDate (D:" << datestr << ")\n"
923  " /Title <" << utf16_title.c_str() << ">\n"
924  ">>\n"
925  "endobj\n";
926  lept_free(datestr);
927  AppendPDFObject(stream.str().c_str());
928  stream.str("");
929  stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
930  AppendString(stream.str().c_str());
931  for (int i = 1; i < obj_; i++) {
932  stream.str("");
933  stream.width(10);
934  stream.fill('0');
935  stream << offsets_[i] << " 00000 n \n";
936  AppendString(stream.str().c_str());
937  }
938  stream.str("");
939  stream
940  << "trailer\n<<\n /Size " << obj_ << "\n"
941  " /Root 1 0 R\n" // catalog
942  " /Info " << (obj_ - 1) << " 0 R\n" // info
943  ">>\nstartxref\n" << offsets_.back() << "\n%%EOF\n";
944  AppendString(stream.str().c_str());
945  return true;

The documentation for this class was generated from the following files:
- renderer.h
- pdfrenderer.cpp
tesseract::UNICHAR::UTF8ToUTF32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
GenericVector::unsigned_size
size_t unsigned_size() const
Definition: genericvector.h:75
STRING
Definition: strngs.h:45
tesseract::TessResultRenderer::AppendString
void AppendString(const char *s)
Definition: renderer.cpp:101
GenericVector::back
T & back() const
Definition: genericvector.h:728
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TessResultRenderer::TessResultRenderer
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:32
tesseract::TessBaseAPI::Version
static const char * Version()
Definition: baseapi.cpp:233
tesseract::TessResultRenderer::title
const char * title() const
Definition: renderer.h:89
char32
signed int char32
Definition: pango_font_info.h:33
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::DeSerialize
bool DeSerialize(FILE *fp, char *data, size_t n=1)
Definition: serialis.cpp:41
tesseract::TessResultRenderer::AppendData
void AppendData(const char *s, int len)
Definition: renderer.cpp:105