From: Davide Capodaglio
Date: Wed, 20 May 2009 16:02:03 +0000 (+0200)
Subject: [pdf] Parse xml metadata to detect PDF/A documents
X-Git-Tag: EVINCE_2_27_3~71
X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=commitdiff_plain;h=f73ef450f00bd6e8d89d4a102d7cb8447c67c8f1;p=evince.git

[pdf] Parse xml metadata to detect PDF/A documents
---

diff --git a/backend/pdf/ev-poppler.cc b/backend/pdf/ev-poppler.cc
index ec2bbc08..24d002e0 100644
--- a/backend/pdf/ev-poppler.cc
+++ b/backend/pdf/ev-poppler.cc
@@ -52,6 +52,11 @@
 #include "ev-attachment.h"
 #include "ev-image.h"
 
+#include <libxml/tree.h>
+#include <libxml/parser.h>
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+
 #if (defined (HAVE_POPPLER_PAGE_RENDER)) && (defined (HAVE_CAIRO_PDF) || defined (HAVE_CAIRO_PS))
 #define HAVE_CAIRO_PRINT
 #endif
@@ -573,6 +578,94 @@ pdf_document_set_password (EvDocumentSecurity *document_security,
 	document->password = g_strdup (password);
 }
 
+
+/* reference:
+http://www.pdfa.org/lib/exe/fetch.php?id=pdfa%3Aen%3Atechdoc&cache=cache&media=pdfa:techdoc:tn0001_pdfa-1_and_namespaces_2008-03-18.pdf */
+static char *
+pdf_document_get_format_from_metadata (const char *metadata)
+{
+	xmlDocPtr doc;
+	xmlXPathContextPtr xpathCtx;
+	xmlXPathObjectPtr xpathObj;
+	xmlChar *part = NULL;
+	xmlChar *conf = NULL;
+	char *result = NULL;
+	int i;
+
+	doc = xmlParseMemory (metadata, strlen (metadata));
+	if (doc == NULL)
+		return NULL;      /* invalid xml metadata */
+
+	xpathCtx = xmlXPathNewContext (doc);
+	if (xpathCtx == NULL) {
+		xmlFreeDoc (doc);
+		return NULL;      /* invalid xpath context */
+	}
+
+	/* add pdf/a namespaces */
+	xmlXPathRegisterNs (xpathCtx, BAD_CAST "x", BAD_CAST "adobe:ns:meta/");
+	xmlXPathRegisterNs (xpathCtx, BAD_CAST "rdf", BAD_CAST "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
+	xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdfaid", BAD_CAST "http://www.aiim.org/pdfa/ns/id/");
+
+	/* reads pdf/a part */
+	/* first syntax: child node */
+	xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:part", xpathCtx);
+	if (xpathObj != NULL) {
+		if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
+			part = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+
+		xmlXPathFreeObject (xpathObj);
+	}
+	if (part == NULL) {
+		/* second syntax: attribute */
+		xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:part", xpathCtx);
+		if (xpathObj != NULL) {
+			if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
+				part = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+
+			xmlXPathFreeObject (xpathObj);
+		}
+	}
+
+	/* reads pdf/a conformance */
+	/* first syntax: child node */
+	xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:conformance", xpathCtx);
+	if (xpathObj != NULL) {
+		if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
+			conf = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+
+		xmlXPathFreeObject (xpathObj);
+	}
+	if (conf == NULL) {
+		/* second syntax: attribute */
+		xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:conformance", xpathCtx);
+		if (xpathObj != NULL) {
+			if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
+				conf = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+
+			xmlXPathFreeObject (xpathObj);
+		}
+	}
+
+	if (part != NULL && conf != NULL) {
+		/* makes conf lowercase */
+		for (i = 0; conf[i]; i++)
+			conf[i] = g_ascii_tolower (conf[i]);
+
+		/* return buffer */
+		result = g_strdup_printf ("PDF/A - %s%s", part, conf);
+	}
+
+	/* Cleanup */
+	xmlFree (part);
+	xmlFree (conf);
+	xmlXPathFreeContext (xpathCtx);
+	xmlFreeDoc (doc);
+
+	return result;
+}
+
+
 static EvDocumentInfo *
 pdf_document_get_info (EvDocument *document)
 {
@@ -582,6 +675,8 @@ pdf_document_get_info (EvDocument *document)
 	PopplerViewerPreferences view_prefs;
 	PopplerPermissions permissions;
 	EvPage *page;
+	char *metadata;
+	char *fmt;
 
 	info = g_new0 (EvDocumentInfo, 1);
 
@@ -618,8 +713,18 @@ pdf_document_get_info (EvDocument *document)
 		      "creation-date", &(info->creation_date),
 		      "mod-date", &(info->modified_date),
 		      "linearized", &(info->linearized),
+		      "metadata", &metadata,
 		      NULL);
 
+	if (metadata != NULL) {
+		fmt = pdf_document_get_format_from_metadata(metadata);
+		if (fmt != NULL) {
+			g_free (info->format);
+			info->format = fmt;
+		}
+		g_free (metadata);
+	}
+
 	info->n_pages = ev_document_get_n_pages (document);
 
 	if (info->n_pages > 0) {
diff --git a/configure.ac b/configure.ac
index e84d30b8..0c9e38a5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -357,7 +357,7 @@ AC_ARG_ENABLE(pdf,
 
 if test "x$enable_pdf" = "xyes"; then
     POPPLER_REQUIRED=0.11.0
-    PKG_CHECK_MODULES(POPPLER, poppler-glib >= $POPPLER_REQUIRED,enable_pdf=yes,enable_pdf=no)
+    PKG_CHECK_MODULES(POPPLER, poppler-glib >= $POPPLER_REQUIRED libxml-2.0 >= $LIBXML_REQUIRED,enable_pdf=yes,enable_pdf=no)
 
     if test "x$enable_pdf" = "xyes"; then
 	    AC_DEFINE([ENABLE_PDF], [1], [Enable pdf support.])