//
// pdftotext.cc
//
-// Copyright 1997 Derek B. Noonburg
+// Copyright 1997-2003 Glyph & Cog, LLC
//
//========================================================================
+#include <aconf.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include "parseargs.h"
#include "GString.h"
#include "gmem.h"
+#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Page.h"
#include "PDFDoc.h"
#include "TextOutputDev.h"
-#include "Params.h"
+#include "CharTypes.h"
+#include "UnicodeMap.h"
#include "Error.h"
#include "config.h"
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+ char *text1, char *text2, UnicodeMap *uMap);
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
+
static int firstPage = 1;
static int lastPage = 0;
-static GBool useASCII7 = gFalse;
-GBool printCommands = gFalse;
+static GBool physLayout = gFalse; // -layout: maintain original physical layout
+static GBool rawOrder = gFalse; // -raw: keep strings in content stream order
+static GBool htmlMeta = gFalse; // -htmlmeta: emit a simple HTML file with meta info; also selects the ".html" default suffix
+static char textEncName[128] = ""; // -enc: output text encoding name; empty means the option was not given
+static char textEOL[16] = ""; // -eol: end-of-line convention (unix, dos, or mac); empty means the option was not given
+static GBool noPageBreaks = gFalse; // -nopgbrk: don't insert page breaks between pages
+static char ownerPassword[33] = "\001"; // -opw: a leading '\001' marks "not given", so an empty password stays distinguishable
+static char userPassword[33] = "\001"; // -upw: same '\001' "not given" marker as ownerPassword
+static GBool quiet = gFalse; // -q: forwarded to globalParams->setErrQuiet()
+static char cfgFileName[256] = ""; // -cfg: configuration file name handed to the GlobalParams constructor
+static GBool printVersion = gFalse; // -v: print version/copyright only (usage text is skipped)
static GBool printHelp = gFalse;
static ArgDesc argDesc[] = {
- {"-f", argInt, &firstPage, 0,
+ {"-f", argInt, &firstPage, 0, // each entry: option name, kind, destination, size, help text
"first page to convert"},
- {"-l", argInt, &lastPage, 0,
+ {"-l", argInt, &lastPage, 0,
"last page to convert"},
- {"-ascii7", argFlag, &useASCII7, 0,
- "convert to 7-bit ASCII (default is 8-bit ISO Latin-1)"},
- {"-h", argFlag, &printHelp, 0,
+ {"-layout", argFlag, &physLayout, 0,
+ "maintain original physical layout"},
+ {"-raw", argFlag, &rawOrder, 0,
+ "keep strings in content stream order"},
+ {"-htmlmeta", argFlag, &htmlMeta, 0,
+ "generate a simple HTML file, including the meta information"},
+ {"-enc", argString, textEncName, sizeof(textEncName), // string options pass the destination buffer and its capacity
+ "output text encoding name"},
+ {"-eol", argString, textEOL, sizeof(textEOL),
+ "output end-of-line convention (unix, dos, or mac)"},
+ {"-nopgbrk", argFlag, &noPageBreaks, 0,
+ "don't insert page breaks between pages"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0, // -h, -help, --help and -? all set the same printHelp flag
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
"print usage information"},
- {"-help", argFlag, &printHelp, 0,
+ {"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
PDFDoc *doc;
GString *fileName;
GString *textFileName;
+ GString *ownerPW, *userPW;
TextOutputDev *textOut;
+ FILE *f;
+ UnicodeMap *uMap;
+ Object info;
GBool ok;
char *p;
+ int exitCode;
+
+ exitCode = 99;
// parse args
ok = parseArgs(argDesc, &argc, argv);
- if (!ok || argc < 2 || argc > 3 || printHelp) {
+ if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
fprintf(stderr, "%s\n", xpdfCopyright);
- printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
- exit(1);
+ if (!printVersion) {
+ printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
+ }
+ goto err0;
}
fileName = new GString(argv[1]);
- // init error file
- errorInit();
-
// read config file
- initParams(xpdfConfigFile);
+ globalParams = new GlobalParams(cfgFileName);
+ if (textEncName[0]) {
+ globalParams->setTextEncoding(textEncName);
+ }
+ if (textEOL[0]) {
+ if (!globalParams->setTextEOL(textEOL)) {
+ fprintf(stderr, "Bad '-eol' value on command line\n");
+ }
+ }
+ if (noPageBreaks) {
+ globalParams->setTextPageBreaks(gFalse);
+ }
+ if (quiet) {
+ globalParams->setErrQuiet(quiet);
+ }
+
+ // get mapping to output encoding
+ if (!(uMap = globalParams->getTextEncoding())) {
+ error(-1, "Couldn't get text encoding");
+ delete fileName;
+ goto err1;
+ }
// open PDF file
- xref = NULL;
- doc = new PDFDoc(fileName);
- if (!doc->isOk())
- exit(1);
+ if (ownerPassword[0] != '\001') {
+ ownerPW = new GString(ownerPassword);
+ } else {
+ ownerPW = NULL;
+ }
+ if (userPassword[0] != '\001') {
+ userPW = new GString(userPassword);
+ } else {
+ userPW = NULL;
+ }
+ doc = new PDFDoc(fileName, ownerPW, userPW);
+ if (userPW) {
+ delete userPW;
+ }
+ if (ownerPW) {
+ delete ownerPW;
+ }
+ if (!doc->isOk()) {
+ exitCode = 1;
+ goto err2;
+ }
+
+ // check for copy permission
+ if (!doc->okToCopy()) {
+ error(-1, "Copying of text from this document is not allowed.");
+ exitCode = 3;
+ goto err2;
+ }
// construct text file name
if (argc == 3) {
textFileName = new GString(argv[2]);
} else {
p = fileName->getCString() + fileName->getLength() - 4;
- if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))
+ if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
textFileName = new GString(fileName->getCString(),
fileName->getLength() - 4);
- else
+ } else {
textFileName = fileName->copy();
- textFileName->append(".txt");
+ }
+ textFileName->append(htmlMeta ? ".html" : ".txt");
}
// get page range
- if (firstPage < 1)
+ if (firstPage < 1) {
firstPage = 1;
- if (lastPage < 1 || lastPage > doc->getNumPages())
+ }
+ if (lastPage < 1 || lastPage > doc->getNumPages()) {
lastPage = doc->getNumPages();
+ }
+
+  // write HTML header
+  //
+  // The header is written through a plain stdio stream which is closed
+  // again before the page text is generated; "-" selects stdout, which is
+  // left open.
+  if (htmlMeta) {
+    if (!textFileName->cmp("-")) {
+      f = stdout;
+    } else {
+      if (!(f = fopen(textFileName->getCString(), "wb"))) {
+        error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+        exitCode = 2;
+        goto err3;
+      }
+    }
+    fputs("<html>\n", f);
+    fputs("<head>\n", f);
+    doc->getDocInfo(&info);
+    if (info.isDict()) {
+      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
+                      uMap);
+      printInfoString(f, info.getDict(), "Subject",
+                      "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Keywords",
+                      "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Author",
+                      "<meta name=\"Author\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Creator",
+                      "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Producer",
+                      "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+      // printInfoDate() passes its format to fprintf() with the date string
+      // as the only argument, so the format needs a %s where the date goes;
+      // without it the content attribute is always empty.
+      printInfoDate(f, info.getDict(), "CreationDate",
+                    "<meta name=\"CreationDate\" content=\"%s\">\n");
+      // The modification date lives under the "ModDate" key of the PDF
+      // document information dictionary.
+      printInfoDate(f, info.getDict(), "ModDate",
+                    "<meta name=\"ModDate\" content=\"%s\">\n");
+    }
+    info.free();
+    fputs("</head>\n", f);
+    fputs("<body>\n", f);
+    fputs("<pre>\n", f);
+    if (f != stdout) {
+      fclose(f);
+    }
+  }
// write text file
- textOut = new TextOutputDev(textFileName->getCString(), useASCII7);
- if (textOut->isOk())
- doc->displayPages(textOut, firstPage, lastPage, 72, 0);
+ textOut = new TextOutputDev(textFileName->getCString(),
+ physLayout, rawOrder, htmlMeta);
+ if (textOut->isOk()) {
+ doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gTrue, gFalse);
+ } else {
+ delete textOut;
+ exitCode = 2;
+ goto err3;
+ }
delete textOut;
+ // write end of HTML file
+ if (htmlMeta) {
+ if (!textFileName->cmp("-")) {
+ f = stdout;
+ } else {
+ if (!(f = fopen(textFileName->getCString(), "ab"))) {
+ error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+ exitCode = 2;
+ goto err3;
+ }
+ }
+ fputs("</pre>\n", f);
+ fputs("</body>\n", f);
+ fputs("</html>\n", f);
+ if (f != stdout) {
+ fclose(f);
+ }
+ }
+
+ exitCode = 0;
+
// clean up
+ err3:
delete textFileName;
+ err2:
delete doc;
- freeParams();
+ uMap->decRefCnt();
+ err1:
+ delete globalParams;
+ err0:
// check for memory leaks
- Object::memCheck(errFile);
- gMemReport(errFile);
+ Object::memCheck(stderr);
+ gMemReport(stderr);
+
+ return exitCode;
+}
- return 0;
+// Write the string entry <key> of <infoDict> to <f>, preceded by <text1>
+// and followed by <text2>.  Nothing is written if the entry is missing
+// or is not a string.
+//
+// A string starting with the bytes 0xFE 0xFF is decoded as big-endian
+// 16-bit units; any other string is taken one byte per character.  Each
+// character is converted to the output encoding with <uMap>.
+//
+// The value comes from the PDF file and ends up inside HTML element or
+// attribute content, so '&', '<', '>' and '"' are written as character
+// entity references rather than verbatim.
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+                            char *text1, char *text2, UnicodeMap *uMap) {
+  Object obj;
+  GString *s1;
+  GBool isUnicode;
+  Unicode u;
+  const char *entity;
+  char buf[8];
+  int i, n, len;
+
+  if (infoDict->lookup(key, &obj)->isString()) {
+    fputs(text1, f);
+    s1 = obj.getString();
+    len = s1->getLength();
+    // only look for the byte-order mark if both of its bytes are present
+    if (len >= 2 &&
+        (s1->getChar(0) & 0xff) == 0xfe &&
+        (s1->getChar(1) & 0xff) == 0xff) {
+      isUnicode = gTrue;
+      i = 2;
+    } else {
+      isUnicode = gFalse;
+      i = 0;
+    }
+    while (i < len) {
+      if (isUnicode) {
+        u = (s1->getChar(i) & 0xff) << 8;
+        // a truncated final unit has no low byte; treat it as zero
+        // instead of reading past the string length
+        if (i + 1 < len) {
+          u |= s1->getChar(i + 1) & 0xff;
+        }
+        i += 2;
+      } else {
+        u = s1->getChar(i) & 0xff;
+        ++i;
+      }
+      switch (u) {
+      case '&': entity = "&amp;";  break;
+      case '<': entity = "&lt;";   break;
+      case '>': entity = "&gt;";   break;
+      case '"': entity = "&quot;"; break;
+      default:  entity = NULL;     break;
+      }
+      if (entity) {
+        // written raw, like the surrounding <text1>/<text2> markup
+        fputs(entity, f);
+      } else {
+        n = uMap->mapUnicode(u, buf, sizeof(buf));
+        fwrite(buf, 1, n, f);
+      }
+    }
+    fputs(text2, f);
+  }
+  obj.free();
+}
+
+// Write the string entry <key> of <infoDict> to <f> using <fmt>.  Nothing
+// is written if the entry is missing or is not a string.
+//
+// <fmt> is used as an fprintf() format whose single argument is the date
+// text, so it should contain exactly one %s conversion.  A leading "D:"
+// prefix is dropped from the date.
+//
+// The value comes from the PDF file and is inserted into HTML, so '&',
+// '<', '>' and '"' are replaced by character entity references before
+// formatting.
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
+  Object obj;
+  GString *escaped;
+  char *s;
+
+  if (infoDict->lookup(key, &obj)->isString()) {
+    s = obj.getString()->getCString();
+    if (s[0] == 'D' && s[1] == ':') {
+      s += 2;
+    }
+    escaped = new GString();
+    for (; *s; ++s) {
+      switch (*s) {
+      case '&': escaped->append("&amp;");  break;
+      case '<': escaped->append("&lt;");   break;
+      case '>': escaped->append("&gt;");   break;
+      case '"': escaped->append("&quot;"); break;
+      default:  escaped->append(*s);       break;
+      }
+    }
+    fprintf(f, fmt, escaped->getCString());
+    delete escaped;
+  }
+  obj.free();
}