X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;ds=inline;f=pdf%2Fxpdf%2Fpdftotext.cc;h=cb8c8961a494ecefa5f02340a62b2d69e907e8f0;hb=a2f683fe644fded868c536909907282555b1b777;hp=189d4898fc89c333d115f409e5dde1c26fd87036;hpb=50e9d31c05e9ca11ad43cc570556094782c1b956;p=evince.git diff --git a/pdf/xpdf/pdftotext.cc b/pdf/xpdf/pdftotext.cc index 189d4898..cb8c8961 100644 --- a/pdf/xpdf/pdftotext.cc +++ b/pdf/xpdf/pdftotext.cc @@ -2,10 +2,11 @@ // // pdftotext.cc // -// Copyright 1997 Derek B. Noonburg +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== +#include #include #include #include @@ -13,6 +14,7 @@ #include "parseargs.h" #include "GString.h" #include "gmem.h" +#include "GlobalParams.h" #include "Object.h" #include "Stream.h" #include "Array.h" @@ -22,38 +24,64 @@ #include "Page.h" #include "PDFDoc.h" #include "TextOutputDev.h" -#include "Params.h" +#include "CharTypes.h" +#include "UnicodeMap.h" #include "Error.h" -#include "config.h" +#include "xpdfconfig.h" + +static void printInfoString(FILE *f, Dict *infoDict, char *key, + char *text1, char *text2, UnicodeMap *uMap); +static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt); static int firstPage = 1; static int lastPage = 0; -static GBool useASCII7 = gFalse; -#if JAPANESE_SUPPORT -static GBool useEUCJP = gFalse; -#endif +static GBool physLayout = gFalse; static GBool rawOrder = gFalse; -GBool printCommands = gFalse; +static GBool htmlMeta = gFalse; +static char textEncName[128] = ""; +static char textEOL[16] = ""; +static GBool noPageBreaks = gFalse; +static char ownerPassword[33] = "\001"; +static char userPassword[33] = "\001"; +static GBool quiet = gFalse; +static char cfgFileName[256] = ""; +static GBool printVersion = gFalse; static GBool printHelp = gFalse; static ArgDesc argDesc[] = { - {"-f", argInt, &firstPage, 0, + {"-f", argInt, &firstPage, 0, "first page to convert"}, - {"-l", argInt, &lastPage, 0, + {"-l", argInt, &lastPage, 0, "last page to convert"}, - {"-ascii7", argFlag, &useASCII7, 0, - "convert to 7-bit ASCII (default is 8-bit ISO Latin-1)"}, -#if JAPANESE_SUPPORT - {"-eucjp", argFlag, &useEUCJP, 0, - "convert Japanese text to EUC-JP"}, -#endif - {"-raw", argFlag, &rawOrder, 0, + {"-layout", argFlag, &physLayout, 0, + "maintain original physical layout"}, + {"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"}, - {"-q", argFlag, &errQuiet, 0, + {"-htmlmeta", argFlag, &htmlMeta, 0, + "generate a simple HTML file, including the meta information"}, + {"-enc", argString, textEncName, sizeof(textEncName), + "output text encoding name"}, + {"-eol", argString, textEOL, sizeof(textEOL), + "output end-of-line convention (unix, dos, or mac)"}, + {"-nopgbrk", argFlag, &noPageBreaks, 0, + "don't insert page breaks between pages"}, + {"-opw", argString, ownerPassword, sizeof(ownerPassword), + "owner password (for encrypted files)"}, + {"-upw", argString, userPassword, sizeof(userPassword), + "user password (for encrypted files)"}, + {"-q", argFlag, &quiet, 0, "don't print any messages or errors"}, - {"-h", argFlag, &printHelp, 0, + {"-cfg", argString, cfgFileName, sizeof(cfgFileName), + "configuration file to use in place of .xpdfrc"}, + {"-v", argFlag, &printVersion, 0, + "print copyright and version info"}, + {"-h", argFlag, &printHelp, 0, + "print usage information"}, + {"-help", argFlag, &printHelp, 0, + "print usage information"}, + {"--help", argFlag, &printHelp, 0, "print usage information"}, - {"-help", argFlag, &printHelp, 0, + {"-?", argFlag, &printHelp, 0, "print usage information"}, {NULL} }; @@ -62,36 +90,80 @@ int main(int argc, char *argv[]) { PDFDoc *doc; GString *fileName; GString *textFileName; + GString *ownerPW, *userPW; TextOutputDev *textOut; + FILE *f; + UnicodeMap *uMap; + Object info; GBool ok; char *p; + int exitCode; + + exitCode = 99; // parse args ok = parseArgs(argDesc, &argc, argv); - if (!ok || argc < 2 || argc > 3 || printHelp) { + if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", xpdfVersion); fprintf(stderr, "%s\n", xpdfCopyright); - printUsage("pdftotext", " []", argDesc); - exit(1); + if (!printVersion) { + printUsage("pdftotext", " []", argDesc); + } + goto err0; } fileName = new GString(argv[1]); - // init error file - errorInit(); - // read config file - initParams(xpdfConfigFile); + globalParams = new GlobalParams(cfgFileName); + if (textEncName[0]) { + globalParams->setTextEncoding(textEncName); + } + if (textEOL[0]) { + if (!globalParams->setTextEOL(textEOL)) { + fprintf(stderr, "Bad '-eol' value on command line\n"); + } + } + if (noPageBreaks) { + globalParams->setTextPageBreaks(gFalse); + } + if (quiet) { + globalParams->setErrQuiet(quiet); + } + + // get mapping to output encoding + if (!(uMap = globalParams->getTextEncoding())) { + error(-1, "Couldn't get text encoding"); + delete fileName; + goto err1; + } // open PDF file - xref = NULL; - doc = new PDFDoc(fileName); + if (ownerPassword[0] != '\001') { + ownerPW = new GString(ownerPassword); + } else { + ownerPW = NULL; + } + if (userPassword[0] != '\001') { + userPW = new GString(userPassword); + } else { + userPW = NULL; + } + doc = new PDFDoc(fileName, ownerPW, userPW); + if (userPW) { + delete userPW; + } + if (ownerPW) { + delete ownerPW; + } if (!doc->isOk()) { - goto err1; + exitCode = 1; + goto err2; } // check for copy permission if (!doc->okToCopy()) { error(-1, "Copying of text from this document is not allowed."); + exitCode = 3; goto err2; } @@ -100,39 +172,161 @@ int main(int argc, char *argv[]) { textFileName = new GString(argv[2]); } else { p = fileName->getCString() + fileName->getLength() - 4; - if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) + if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) { textFileName = new GString(fileName->getCString(), fileName->getLength() - 4); - else + } else { textFileName = fileName->copy(); - textFileName->append(".txt"); + } + textFileName->append(htmlMeta ? ".html" : ".txt"); } // get page range - if (firstPage < 1) + if (firstPage < 1) { firstPage = 1; - if (lastPage < 1 || lastPage > doc->getNumPages()) + } + if (lastPage < 1 || lastPage > doc->getNumPages()) { lastPage = doc->getNumPages(); + } + + // write HTML header + if (htmlMeta) { + if (!textFileName->cmp("-")) { + f = stdout; + } else { + if (!(f = fopen(textFileName->getCString(), "wb"))) { + error(-1, "Couldn't open text file '%s'", textFileName->getCString()); + exitCode = 2; + goto err3; + } + } + fputs("\n", f); + fputs("\n", f); + doc->getDocInfo(&info); + if (info.isDict()) { + printInfoString(f, info.getDict(), "Title", "", "\n", + uMap); + printInfoString(f, info.getDict(), "Subject", + "\n", uMap); + printInfoString(f, info.getDict(), "Keywords", + "\n", uMap); + printInfoString(f, info.getDict(), "Author", + "\n", uMap); + printInfoString(f, info.getDict(), "Creator", + "\n", uMap); + printInfoString(f, info.getDict(), "Producer", + "\n", uMap); + printInfoDate(f, info.getDict(), "CreationDate", + "\n"); + printInfoDate(f, info.getDict(), "LastModifiedDate", + "\n"); + } + info.free(); + fputs("\n", f); + fputs("\n", f); + fputs("
\n", f);
+    if (f != stdout) {
+      fclose(f);
+    }
+  }
 
   // write text file
-#if JAPANESE_SUPPORT
-  useASCII7 |= useEUCJP;
-#endif
-  textOut = new TextOutputDev(textFileName->getCString(), useASCII7, rawOrder);
-  if (textOut->isOk())
-    doc->displayPages(textOut, firstPage, lastPage, 72, 0);
+  textOut = new TextOutputDev(textFileName->getCString(),
+			      physLayout, rawOrder, htmlMeta);
+  if (textOut->isOk()) {
+    doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gTrue, gFalse);
+  } else {
+    delete textOut;
+    exitCode = 2;
+    goto err3;
+  }
   delete textOut;
 
+  // write end of HTML file
+  if (htmlMeta) {
+    if (!textFileName->cmp("-")) {
+      f = stdout;
+    } else {
+      if (!(f = fopen(textFileName->getCString(), "ab"))) {
+	error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+	exitCode = 2;
+	goto err3;
+      }
+    }
+    fputs("
\n", f); + fputs("\n", f); + fputs("\n", f); + if (f != stdout) { + fclose(f); + } + } + + exitCode = 0; + // clean up + err3: delete textFileName; err2: delete doc; + uMap->decRefCnt(); err1: - freeParams(); + delete globalParams; + err0: // check for memory leaks Object::memCheck(stderr); gMemReport(stderr); - return 0; + return exitCode; +} + +static void printInfoString(FILE *f, Dict *infoDict, char *key, + char *text1, char *text2, UnicodeMap *uMap) { + Object obj; + GString *s1; + GBool isUnicode; + Unicode u; + char buf[8]; + int i, n; + + if (infoDict->lookup(key, &obj)->isString()) { + fputs(text1, f); + s1 = obj.getString(); + if ((s1->getChar(0) & 0xff) == 0xfe && + (s1->getChar(1) & 0xff) == 0xff) { + isUnicode = gTrue; + i = 2; + } else { + isUnicode = gFalse; + i = 0; + } + while (i < obj.getString()->getLength()) { + if (isUnicode) { + u = ((s1->getChar(i) & 0xff) << 8) | + (s1->getChar(i+1) & 0xff); + i += 2; + } else { + u = s1->getChar(i) & 0xff; + ++i; + } + n = uMap->mapUnicode(u, buf, sizeof(buf)); + fwrite(buf, 1, n, f); + } + fputs(text2, f); + } + obj.free(); +} + +static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) { + Object obj; + char *s; + + if (infoDict->lookup(key, &obj)->isString()) { + s = obj.getString()->getCString(); + if (s[0] == 'D' && s[1] == ':') { + s += 2; + } + fprintf(f, fmt, s); + } + obj.free(); }