/*-- Copyright 2001-2003 Elliotte Rusty Harold. All rights reserved. This file is part of XIncluder, a Java class library for integrating XInclude processing with SAX, DOM, and JDOM. XIncluder is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. XIncluder is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with XIncluder; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY OTHER CONTRIBUTORS TO THIS PACKAGE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Need to test xml:base???? // Comments from DTD being reported in output // as if they were children of the document node. This // is a bug in Xerces. Crimson // doesn't have it. It should be fixed in 2.0.2 package com.elharo.xml.xinclude; import java.net.URL; import java.net.URLConnection; import java.net.MalformedURLException; import java.util.Stack; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.OutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.io.InputStreamReader; import java.io.BufferedInputStream; import java.io.InputStream; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.InputSource; import org.w3c.dom.Element; import org.w3c.dom.Document; import org.w3c.dom.Comment; import org.w3c.dom.ProcessingInstruction; import org.w3c.dom.DocumentType; import org.w3c.dom.Text; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Attr; import org.w3c.dom.DocumentFragment; import org.w3c.dom.DOMImplementation; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; /** *

* DOMXIncluder provides methods to * resolve DOM elements and documents to produce * a new Document or Element * with all XInclude references resolved. *

* *

* This program works with Xerces-2. * It does not yet work with GNU JAXP. * It was working with Crimson but stopped recently. I'm not sure why. *

* *

* XPointers are not yet supported. *

* *

* The xinclude:fallback element is not yet supported. *

* *

* It does not yet handle the merging of unparsed entity * and notation information items from the included infosets. * Furthermore it does not include the source document's doctype * declaration if that contains an internal DTD subset. * This may be the result of a Xerces bug. *

* * @author Elliotte Rusty Harold * @version 1.0d11, March 9, 2003 */ public class DOMXIncluder { public final static String XINCLUDE_NAMESPACE = "http://www.w3.org/2001/XInclude"; public final static String XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"; private DocumentBuilderFactory factory; private DocumentBuilder parser; // Only private instances allowed. Each instance has its own // factory and parser private DOMXIncluder() throws ParserConfigurationException { this.factory = DocumentBuilderFactory.newInstance(); this.factory.setNamespaceAware(true); // Merge CDATA sections with text nodes because infoset doesn't // distinguish these this.factory.setCoalescing(true); this.parser = factory.newDocumentBuilder(); } // There's a potential thread safety problem since the parser // is shared between multiple threads. I'm going to fix this by // creating a new private XIncluder object for each call to the // static include method and use instance methods internally. // If this proves to be a performance issue it could be // optimized with a pool, but I doubt this is necessary. /** *

* This method resolves a DOM Document * and merges in all XInclude references. * The Document object returned is a new document. * The original Document object is not changed. *

* *

* This method depends on the ability to clone a DOM Document * which not all DOM parsers may be able to do. * It definitely exercises a bug in Xerces-J 1.3.1. * This bug is fixed in Xerces-J 1.4.0 through 1.4.3 * but reappears in Xerces-J 1.4.4 * Xerces-J 2.0.x can do this. *

* * @param original Document that will be processed * @param base String form of the base URI against which * relative URLs will be resolved. This can be null if the * document includes an xml:base attribute or * does not use relative URLs. * @return Document new Document object in which all * XInclude elements have been replaced. * @throws XIncludeException if this document, though namespace well-formed, * violates one of the rules of XInclude. * @throws NullPointerException if the original argument is null. */ public static Document merge(Document original, String base) throws XIncludeException, NullPointerException, ParserConfigurationException { // Do I need to coalesce nodes that I receive which // I do not parse myself? It's probably not that important. // This only affects text nodes. DOMXIncluder includer = new DOMXIncluder(); return includer.resolve(original, base); } private Document resolve(Document original, String base) throws XIncludeException, NullPointerException { if (original == null) { throw new NullPointerException("Document must not be null"); } Element originalRoot = original.getDocumentElement(); // Check for xml:base String baseAttribute = originalRoot.getAttributeNS(XML_NAMESPACE, "base"); if (baseAttribute != null && !baseAttribute.equals("")) { base = baseAttribute; } Document resultDocument; if (!isIncludeElement(originalRoot)) { resultDocument = (Document) original.cloneNode(true); // This clone doesn't seem to include the DOCTYPE // if there's an internal DTD subset. // Is this the correct behavior? No, a bug in Xerces 1.4.3 // and Crimson Element resultRoot = resultDocument.getDocumentElement(); Node next = resultRoot.getFirstChild(); while (next != null) { Node current = next; next = next.getNextSibling(); if (current.getNodeType() == Node.ELEMENT_NODE) { Element currentElement = (Element) current; DocumentFragment resolved = resolve(currentElement, base); resultRoot.replaceChild(resolved, currentElement); } } } else { // Root element is XInclude element // Make a document fragment from the original prolog // Make a document fragment from the original epilog // resolve the root element into a document // really need to refactor some of this code verifyXIncludeElement(originalRoot); String href = originalRoot.getAttribute("href"); String remote; if (base != null) { try { URL context = new URL(base); URL u = new URL(context, href); remote = u.toExternalForm(); } catch (MalformedURLException ex) { XIncludeException xex = new UnavailableResourceException( "Unresolvable URL " + base + "/" + href); xex.setRootCause(ex); throw xex; } } else { // base == null remote = href; } if (originalRoot.hasAttribute("parse")) { String parseAttribute = originalRoot.getAttribute("parse"); if (parseAttribute.equals("text")) { throw new BadParseAttributeException( "Cannot include unparsed text for the root element"); } } // end if resultDocument = downloadXMLDocument(remote); // insert prolog DocumentFragment prolog = extractProlog(original); resultDocument.insertBefore(resultDocument.importNode(prolog, true), resultDocument.getFirstChild()); // insert epilog DocumentFragment epilog = extractEpilog(original); resultDocument.appendChild(resultDocument.importNode(epilog, true)); // call this method recursively; don't have enough // circularity checks here yet???? resultDocument = resolve(resultDocument, remote); } return resultDocument; } private static DocumentFragment extractProlog(Document doc) { DocumentFragment result = doc.createDocumentFragment(); NodeList children = doc.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { Node node = children.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) break; if (node.getNodeType() == Node.DOCUMENT_TYPE_NODE) continue; result.appendChild(node.cloneNode(true)); } return result; } private static DocumentFragment extractEpilog(Document doc) { DocumentFragment result = doc.createDocumentFragment(); NodeList children = doc.getChildNodes(); boolean inEpilog = false; for (int i = 0; i < children.getLength(); i++) { Node node = children.item(i); if (inEpilog) result.appendChild(node.cloneNode(true)); if (node.getNodeType() == Node.ELEMENT_NODE) inEpilog = true; } return result; } private static void verifyXIncludeElement(Element includeElement) throws BadParseAttributeException, MissingHrefException { if (!includeElement.hasAttribute("href")) { throw new MissingHrefException("Missing href attribute"); } // check for parse attribute; default is true boolean parse = true; if (includeElement.hasAttribute("parse")) { String parseAttribute = includeElement.getAttribute("parse"); if (!parseAttribute.equals("text") && !parseAttribute.equals("xml")) { throw new BadParseAttributeException( parseAttribute + " is not a legal value for the parse attribute"); } } // end if } // Circularity testing is too strong. It's picking up siblings // as well as ancestors.???? Maybe I'm not popping base off stack // when I should???? /** *

* This method resolves a DOM Element * and merges in all XInclude references. This process is recursive. * The DocumentFragment returned contains no XInclude elements. * If a referenced document cannot be found, an exception is thrown. * The DocumentFragment object returned is a new element. * The original Element is not changed. *

* * @param original Element that will be processed * @param base String form of the base URI against which * relative URLs will be resolved. This can be null if the * element includes an xml:base attribute. * @param resolved Document into which the resolved element will be placed. * @return DocumentFragment the infoset that this element resolves to * @throws MissingHrefException if the href attribute is missing from an include element. * @throws MalformedResourceException if an included document is not namespace well-formed * @throws BadParseAttributeException if an include element has a parse attribute with any value other than text or parse * @throws UnavailableResourceException if the URL in the include element's href attribute cannot be loaded. * @throws CircularIncludeException if this Element contains an XInclude element * that attempts to include a document in which * this element is directly or indirectly included. * @throws NullPointerException if the original argument is null. */ public static DocumentFragment merge(Element original, String base) throws MalformedResourceException, XIncludeException, NullPointerException, CircularIncludeException, BadParseAttributeException, UnavailableResourceException, MissingHrefException, ParserConfigurationException { // Do I need to coalesce nodes // e.g. CDATA sections, that I receive which // I do not parse myself???? DOMXIncluder includer = new DOMXIncluder(); return includer.resolve(original, base); } private DocumentFragment resolve(Element original, String base) throws MalformedResourceException, XIncludeException, NullPointerException, CircularIncludeException, BadParseAttributeException, UnavailableResourceException, MissingHrefException { if (original == null) { throw new NullPointerException( "You can't XInclude a null element." ); } Stack bases = new Stack(); if (base != null) bases.push(base); DocumentFragment result = resolve(original, bases); return result; } private static boolean isIncludeElement(Element element) { if ("include".equals(element.getLocalName()) && XINCLUDE_NAMESPACE.equals(element.getNamespaceURI())) { return true; } return false; } /** *

* This method resolves a DOM Element into an infoset * and merges in all XInclude references. This process is recursive. * The returned infoset contains no XInclude elements. * If a referenced document cannot be found it is replaced with * an error message. The DocumentFragment object returned is new. * The original Element is not changed. *

* * @param original Element that will be processed * @param bases Stack containing the string forms of * all the URIs of documents which contain this element * through XIncludes. This used to detect if a circular * reference is being used. * @param resolved Document into which the resolved element will be placed. * @return DocumentFragment the infoset into which this element resolves. This is just a copy of the element if the element is not an XInclude element and does not contain any XInclude elements. * @throws CircularIncludeException if this Element contains an XInclude element * that attempts to include a document in which * this element is directly or indirectly included. * @throws MissingHrefException if the href attribute is missing from an include element. * @throws MalformedResourceException if an included document is not namespace well-formed * @throws BadParseAttributeException if an include element has a parse attribute with any value other than text or parse * @throws UnavailableResourceException if the URL in the include element's href attribute cannot be loaded. * @throws XIncludeException if this document, though namespace well-formed, * violates one of the rules of XInclude. */ private DocumentFragment resolve(Element original, Stack bases) throws CircularIncludeException, MissingHrefException, MalformedResourceException, BadParseAttributeException, UnavailableResourceException, XIncludeException { Document resolved = original.getOwnerDocument(); DocumentFragment result = resolved.createDocumentFragment(); String base = null; // Check for a base attribute boolean needsPopping = false; String baseAttribute = original.getAttributeNS(XML_NAMESPACE, "base"); if (baseAttribute != null && !baseAttribute.equals("")) { base = baseAttribute; bases.push(base); needsPopping = true; } if (bases.size() != 0) base = (String) bases.peek(); if (isIncludeElement(original)) { // Verify that there is an href attribute if (!original.hasAttribute("href")) { throw new MissingHrefException("Missing href attribute"); } String href = original.getAttribute("href"); String remote; if (base != null) { try { URL context = new URL(base); URL u = new URL(context, href); remote = u.toExternalForm(); } catch (MalformedURLException ex) { XIncludeException xex = new UnavailableResourceException( "Unresolvable URL " + base + "/" + href); xex.setRootCause(ex); throw xex; } } else { // base == null remote = href; } // check for parse attribute; default is true boolean parse = true; if (original.hasAttribute("parse")) { String parseAttribute = original.getAttribute("parse"); if (parseAttribute.equals("text")) { parse = false; } else if (!parseAttribute.equals("xml")) { throw new BadParseAttributeException( parseAttribute + "is not a legal value for the parse attribute"); } } // end if if (parse) { // checks for equality if (bases.contains(remote)) { // need to figure out how to get file and number where // bad include occurs???? throw new CircularIncludeException( "Circular XInclude Reference to " + remote + " in " ); } try { Document doc = parser.parse(remote); bases.push(remote); // Need to remove DocumentType node if any NodeList docChildren = doc.getChildNodes(); for (int i = 0; i < docChildren.getLength(); i++) { Node child = docChildren.item(i); if (child.getNodeType() != Node.DOCUMENT_TYPE_NODE) { Node node = resolved.importNode(child, true); if (node.getNodeType() == Node.ELEMENT_NODE) { Element root = (Element) node; // add xml:base attribute if necessary if (!root.hasAttributeNS("http://www.w3.org/XML/1998/namespace", "base")) { root.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:base", remote); } result.appendChild(resolve(root, bases)); } else result.appendChild(node); } } bases.pop(); } catch (SAXParseException e) { int line = e.getLineNumber(); if (line <= 0) { XIncludeException ex = new UnavailableResourceException("Document " + remote + " was not found."); ex.setRootCause(e); throw ex; } else { int column = e.getColumnNumber(); XIncludeException ex = new MalformedResourceException("Document " + remote + " is not well-formed at line " + line + ", column " + column); ex.setRootCause(e); throw ex; } } // end catch catch (SAXException e) { XIncludeException ex = new MalformedResourceException("Document " + remote + " is not well-formed."); ex.setRootCause(e); throw ex; } catch (IOException e) { XIncludeException ex = new UnavailableResourceException("Document not found: " + remote); ex.setRootCause(e); throw ex; } } // end if parse else { // insert text String encoding = original.getAttribute("encoding"); String s = downloadTextDocument(remote, encoding); result.appendChild(resolved.createTextNode(s)); } } // end if; not an include element else { // recursively process children // Could I optimize by searching subtree for XInclude elements???? // Am I cloning too often???? N! Node clone = original.cloneNode(true); Node current = clone.getFirstChild(); while (current != null) { Node next = current.getNextSibling(); if (current.getNodeType() == Node.ELEMENT_NODE) { Element e = (Element) current; DocumentFragment newChildren = resolve(e, bases); clone.replaceChild(newChildren, e); } current = next; } result.appendChild(clone); } if (needsPopping) bases.pop(); return result; } // This parses and returns an XML document at an absolute URL. // If necessary, the document type declaration is deleted. private Document downloadXMLDocument(String url) throws XIncludeException { try { Document doc = parser.parse(url); // Attach an xml:base attribute if the root element // doesn't already have one Element root = doc.getDocumentElement(); if (!root.hasAttributeNS("http://www.w3.org/XML/1998/namespace", "base")) { root.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:base", url); } // Need to remove DocumentType node if any DocumentType doctype = doc.getDoctype(); if (doctype == null) return doc; // There is a doctype so we need to make a copy of the document // without the doctype and import all the top-level nodes. Document newDoc = doc.getImplementation().createDocument( root.getNamespaceURI(), root.getNodeName(), null); Element newRoot = newDoc.getDocumentElement(); NodeList docChildren = doc.getChildNodes(); boolean inProlog = true; for (int i = 0; i < docChildren.getLength(); i++) { Node child = docChildren.item(i); if (child.getNodeType() == Node.ELEMENT_NODE) { inProlog = false; continue; } if (child.getNodeType() != Node.DOCUMENT_TYPE_NODE) { if (inProlog) { newDoc.insertBefore(newDoc.importNode(child, true), newRoot); } else { newDoc.appendChild(newDoc.importNode(child, true)); } } } // Now import top-level nodes NodeList rootChildren = root.getChildNodes(); for (int i = 0; i < rootChildren.getLength(); i++) { Node child = rootChildren.item(i); newRoot.appendChild(newDoc.importNode(child, true)); } // Now import root attributes NamedNodeMap attributes = root.getAttributes(); for (int i = 0; i < attributes.getLength(); i++) { Node att = newDoc.importNode(attributes.item(i), true); newRoot.setAttributeNodeNS((Attr) att); } return newDoc; } // end try catch (SAXParseException e) { int line = e.getLineNumber(); if (line <= 0) { XIncludeException ex = new UnavailableResourceException("Document " + url + " was not found."); ex.setRootCause(e); throw ex; } else { int column = e.getColumnNumber(); XIncludeException ex = new MalformedResourceException("Document " + url + " is not well-formed at line " + line + ", column " + column); ex.setRootCause(e); throw ex; } } // end catch catch (SAXException e) { XIncludeException ex = new MalformedResourceException("Document " + url + " is not well-formed."); ex.setRootCause(e); throw ex; } catch (IOException e) { XIncludeException ex = new UnavailableResourceException("Document not found: " + url); ex.setRootCause(e); throw ex; } } /** *

* This utility method reads a document at a specified URL * and returns the contents of that document as a String. * It's used to include files with parse="text" *

* * @param url URL of the document that will be stored in * String. * @param encoding Encoding of the document; e.g. UTF-8, * ISO-8859-1, etc. If this is null or the empty string * then UTF-8 is guessed. * @return String The document retrieved from the source URL * @throws UnavailableResourceException if the requested document cannot be downloaded from the specified URL. */ private static String downloadTextDocument(String url, String encoding) throws UnavailableResourceException { if (encoding == null || encoding.equals("")) { encoding = "UTF-8"; } URL source; try { source = new URL(url); } catch (MalformedURLException e) { UnavailableResourceException ex = new UnavailableResourceException("Unresolvable URL " + url); ex.setRootCause(e); throw ex; } StringBuffer s = new StringBuffer(); try { URLConnection uc = source.openConnection(); InputStream in = new BufferedInputStream(uc.getInputStream()); String encodingFromHeader = uc.getContentEncoding(); String contentType = uc.getContentType(); if (encodingFromHeader != null) encoding = encodingFromHeader; else { // What if file does not have a MIME type but name ends in .xml???? // MIME types are case-insensitive // Java may be picking this up from file URL if (contentType != null) { contentType = contentType.toLowerCase(); if (contentType.equals("text/xml") || contentType.equals("application/xml") || (contentType.startsWith("text/") && contentType.endsWith("+xml") ) || (contentType.startsWith("application/") && contentType.endsWith("+xml"))) { encoding = EncodingHeuristics.readEncodingFromStream(in); } } } InputStreamReader reader = new InputStreamReader(in, encoding); int c; while ((c = in.read()) != -1) { s.append((char) c); } return s.toString(); } catch (UnsupportedEncodingException e) { UnavailableResourceException ex = new UnavailableResourceException( "Encoding not recognized for document " + source.toExternalForm()); ex.setRootCause(e); throw ex; } catch (IOException e) { UnavailableResourceException ex = new UnavailableResourceException( "Document not found: " + source.toExternalForm()); ex.setRootCause(e); throw ex; } } /* C:\XMLJava>java -Djavax.xml.parsers.DocumentBuilderFactory=org.apache.crimson.ja xp.DocumentBuilderFactoryImpl com.elharo.xml.xinclude.DOMXIncluder masterbook.xml finishe d_book.xml */ /** *

* The driver method for the XIncluder program. * I'll probably move this to a separate class soon. *

* * @param args contains the URL and/or filenames of the * input document and the output filename. */ public static void main(String[] args) { if (args.length <= 0) { System.out.println("Usage: java DOMXIncluder URL output_file"); return; } String masterDocumentURL = args[0]; try { // Use JAXP to find a parser DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); // Turn on namespace support factory.setNamespaceAware(true); // Merge CDATA sections with text nodes factory.setCoalescing(true); DocumentBuilder parser = factory.newDocumentBuilder(); try { InputSource input = new InputSource(); if (masterDocumentURL.indexOf(':') < 0) { File f = new File(masterDocumentURL); String base = f.toURL().toExternalForm(); input.setSystemId(base); input.setByteStream(new FileInputStream(f)); } else { input.setSystemId(masterDocumentURL); } Document top = parser.parse(input); Document output = merge(top, input.getSystemId()); // need to set encoding on this to Latin-1 and check what // happens to UTF-8 curly quotes OutputFormat format = new OutputFormat("XML", "UTF-8", false); format.setPreserveSpace(true); OutputStream out = System.out; if (args.length >= 2) { String outputFileName = args[1]; out = new FileOutputStream(outputFileName); } XMLSerializer serializer = new XMLSerializer(out, format); serializer.serialize(output); } catch (Exception e) { System.err.println(e); e.printStackTrace(); } } catch (ParserConfigurationException e) { System.out.println("Could not locate a JAXP parser"); } } // end main }