/*-- Copyright 2001-2003 Elliotte Rusty Harold. All rights reserved. This file is part of XIncluder, a Java class library for integrating XInclude processing with SAX, DOM, and JDOM. XIncluder is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. XIncluder is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with XIncluder; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY OTHER CONTRIBUTORS TO THIS PACKAGE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ package com.elharo.xml.xinclude; import java.net.URL; import java.net.URLConnection; import java.net.MalformedURLException; import java.util.Stack; import java.util.Iterator; import java.util.List; import java.util.LinkedList; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.io.InputStreamReader; import java.io.BufferedInputStream; import java.io.InputStream; import org.jdom.Namespace; import org.jdom.Comment; import org.jdom.CDATA; import org.jdom.Text; import org.jdom.JDOMException; import org.jdom.Attribute; import org.jdom.Element; import org.jdom.ProcessingInstruction; import org.jdom.Document; import org.jdom.DocType; import org.jdom.EntityRef; import org.jdom.input.SAXBuilder; import org.jdom.input.DOMBuilder; import org.jdom.output.XMLOutputter; /** *

* <p><code>JDOMXIncluder</code> provides methods to
* resolve JDOM elements and documents to produce
* a new <code>Document</code>, <code>Element</code>,
* or <code>List</code> of nodes with all
* XInclude references resolved.</p>

* *

* <p>Known bugs include:</p>
* <ul>
*   <li>XPointer fragment identifiers are not handled</li>
*   <li>Notations and unparsed entities from the included infosets
*       are not merged into the final infoset</li>
*   <li>The xinclude:fallback element is not yet supported.</li>
* </ul>

*
* @author Elliotte Rusty Harold
* @version 1.0d11, March 9, 2003
*/
public class JDOMXIncluder {

    /** The XInclude namespace URI bound to the <code>xi</code> prefix. */
    public final static Namespace XINCLUDE_NAMESPACE
      = Namespace.getNamespace("xi", "http://www.w3.org/2001/XInclude");

    // No instances allowed
    private JDOMXIncluder() {}

    // Parser used to load every document included with parse="xml".
    private static final SAXBuilder builder = new SAXBuilder();

    /**
     * <p>
     * Resolves a JDOM <code>Document</code>, merging in all XInclude
     * references. The <code>Document</code> returned is a new object;
     * the original <code>Document</code> is cloned and left unchanged.
     * </p>
     *
     * @param original <code>Document</code> that will be processed
     * @param base     <code>String</code> form of the base URI against which
     *                 relative URLs will be resolved. This can be null if the
     *                 include elements carry absolute hrefs or an
     *                 xml:base attribute.
     * @return a new <code>Document</code> in which all
     *         XInclude elements have been replaced.
     * @throws NullPointerException if <code>original</code> is null
     * @throws MissingHrefException if an xinclude:include element does not
     *         have an href attribute.
     * @throws UnavailableResourceException if an included document cannot
     *         be located or cannot be read.
     * @throws MalformedResourceException if an included document is not
     *         namespace well-formed
     * @throws CircularIncludeException if this document possesses a cycle of
     *         XIncludes.
     * @throws XIncludeException if any of the rules of XInclude are violated,
     *         including a result that has no root element, more than one
     *         root element, or text / entity references outside the root
     */
    public static Document resolve(Document original, String base)
      throws XIncludeException {

        if (original == null) {
            throw new NullPointerException("Document must not be null");
        }

        Document result = (Document) original.clone();
        Element root = result.getRootElement();
        List resolved = resolve(root, base);

        // First pass: validate that the replacement nodes form a legal
        // document body (exactly one element, otherwise only comments
        // and processing instructions).
        Element newRoot = null;
        for (Iterator check = resolved.iterator(); check.hasNext();) {
            Object o = check.next();
            if (o instanceof Element) {
                if (newRoot != null) {
                    throw new XIncludeException("Tried to include multiple roots");
                }
                newRoot = (Element) o;
            }
            else if (o instanceof Comment || o instanceof ProcessingInstruction) {
                // legal outside the root element
            }
            else if (o instanceof Text || o instanceof String) {
                throw new XIncludeException(
                  "Tried to include text node outside of root element"
                );
            }
            else if (o instanceof EntityRef) {
                throw new XIncludeException(
                  "Tried to include a general entity reference outside of root element"
                );
            }
            else {
                throw new XIncludeException(
                  "Unexpected type " + o.getClass()
                );
            }
        }

        if (newRoot == null) {
            throw new XIncludeException("No root element");
        }

        // Second pass: splice the nodes into the cloned document around the
        // position of the old root. After validation every non-Element node
        // is known to be a Comment or ProcessingInstruction.
        List newContent = result.getContent();
        int position = newContent.indexOf(root);
        Iterator nodes = resolved.iterator();

        // nodes that precede the root element
        while (nodes.hasNext()) {
            Object o = nodes.next();
            if (o instanceof Element) {
                break;
            }
            newContent.add(position, o);
            position++;
        }

        // the root element itself
        result.setRootElement(newRoot);
        position++;

        // nodes that follow the root element
        while (nodes.hasNext()) {
            newContent.add(position, nodes.next());
            position++;
        }

        return result;
    }

    /**
     * <p>
     * Resolves a JDOM <code>Element</code>, merging in all XInclude
     * references. This process is recursive. The nodes returned contain
     * no XInclude elements and are new objects; the original
     * <code>Element</code> is not changed. If a referenced document
     * cannot be found an exception is thrown.
     * </p>
     *
     * @param original <code>Element</code> that will be processed
     * @param base     <code>String</code> form of the base URI against which
     *                 relative URLs will be resolved. This can be null if the
     *                 include elements carry absolute hrefs or an
     *                 xml:base attribute.
     * @return a <code>List</code> containing all nodes that replace this
     *         element. If this element is not an
     *         <code>xinclude:include</code> this list is guaranteed to
     *         contain a single <code>Element</code> object.
     * @throws MissingHrefException if an xinclude:include element does not
     *         have an href attribute.
     * @throws NullPointerException if original element is null.
     * @throws UnavailableResourceException if an included document cannot
     *         be located or cannot be read.
     * @throws MalformedResourceException if an included document is not
     *         namespace well-formed
     * @throws CircularIncludeException if this Element contains an XInclude
     *         element that attempts to include a document in which
     *         this element is directly or indirectly included.
     */
    public static List resolve(Element original, String base)
      throws CircularIncludeException, XIncludeException,
             NullPointerException {

        if (original == null) {
            throw new NullPointerException("You can't XInclude a null element.");
        }
        // The stack is local to this call, so it needs no cleanup. Popping
        // it unconditionally would fail when no base was pushed.
        Stack bases = new Stack();
        if (base != null) {
            bases.push(base);
        }
        return resolve(original, bases);
    }

    /**
     * Tests whether an element is <code>include</code> in the
     * XInclude namespace.
     */
    private static boolean isIncludeElement(Element element) {
        return element.getName().equals("include")
          && element.getNamespace().equals(XINCLUDE_NAMESPACE);
    }

    /**
     * <p>
     * Resolves a JDOM <code>Element</code>, merging in all XInclude
     * references. This process is recursive. The list returned contains
     * no XInclude elements. The nodes in the list returned are new objects.
     * The original <code>Element</code> is not changed.
     * </p>
     *
     * @param original <code>Element</code> that will be processed
     * @param bases    <code>Stack</code> containing the string forms of
     *                 all the URIs of documents which contain this element
     *                 through XIncludes. This is used to detect if any
     *                 circular references occur.
     * @return a <code>List</code> containing all nodes that replace this
     *         element. If this element is not an
     *         <code>xinclude:include</code> this list is guaranteed to
     *         contain a single <code>Element</code> object.
     * @throws MissingHrefException if an xinclude:include element does not
     *         have an href attribute.
     * @throws UnavailableResourceException if an included document cannot
     *         be located or cannot be read.
     * @throws BadParseAttributeException if an include element has a parse
     *         attribute with any value other than <code>text</code>
     *         or <code>xml</code>
     * @throws MalformedResourceException if an included document is not
     *         namespace well-formed
     * @throws CircularIncludeException if this Element contains an XInclude
     *         element that attempts to include a document in which
     *         this element is directly or indirectly included.
     */
    protected static List resolve(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException,
             UnavailableResourceException, BadParseAttributeException,
             XIncludeException {

        if (isIncludeElement(original)) {
            return resolveXIncludeElement(original, bases);
        }
        List resultList = new LinkedList();
        resultList.add(resolveNonXIncludeElement(original, bases));
        return resultList;
    }

    /**
     * Replaces a single <code>xinclude:include</code> element with the
     * nodes it refers to. For parse="xml" (the default) the referenced
     * document is parsed and its top-level comments, processing
     * instructions and recursively resolved root are returned; for
     * parse="text" a single <code>Text</code> node is returned.
     *
     * @param original the include element
     * @param bases    stack of the URLs of the documents currently being
     *                 included; its top, if any, is the current base URL
     * @return the parentless nodes that replace the include element
     */
    private static List resolveXIncludeElement(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException,
             UnavailableResourceException, XIncludeException {

        // These lines are probably unnecessary
        if (!isIncludeElement(original)) {
            throw new RuntimeException("Bad private Call");
        }

        Attribute href = original.getAttribute("href");
        if (href == null) {
            throw new MissingHrefException("Missing href attribute");
        }

        // No base at all is represented by null (not ""), so that an
        // absolute href can still be resolved on its own.
        String base = bases.isEmpty() ? null : (String) bases.peek();
        Attribute baseAttribute
          = original.getAttribute("base", Namespace.XML_NAMESPACE);
        if (baseAttribute != null) {
            base = baseAttribute.getValue();
        }

        URL remote = resolveHref(base, href.getValue());

        boolean parse = true;
        Attribute parseAttribute = original.getAttribute("parse");
        if (parseAttribute != null) {
            String parseValue = parseAttribute.getValue();
            if (parseValue.equals("text")) {
                parse = false;
            }
            else if (!parseValue.equals("xml")) {
                throw new BadParseAttributeException(
                  parseValue + " is not a legal value for the parse attribute"
                );
            }
        }

        if (!parse) {
            // unparsed, insert text
            String encoding = original.getAttributeValue("encoding");
            Text text = downloadTextDocument(remote, encoding);
            List textList = new LinkedList();
            textList.add(text);
            return textList;
        }

        String remoteForm = remote.toExternalForm();
        // Stack.contains compares with equals(), so equal URL strings match.
        if (bases.contains(remoteForm)) {
            throw new CircularIncludeException(
              "Circular XInclude Reference to "
              + remoteForm + " in " + bases.peek()
            );
        }

        try {
            // this Document object never leaves this method
            Document doc = builder.build(remote);
            bases.push(remoteForm);
            try {
                Element root = doc.getRootElement();
                // Add an xml:base attribute if necessary
                if (root.getAttribute("base", Namespace.XML_NAMESPACE) == null) {
                    root.setAttribute("base", remoteForm, Namespace.XML_NAMESPACE);
                }

                // XPointer fragment identifiers are not handled; the whole
                // document is always included.

                // resolve() returns fresh, parentless nodes for the root.
                List rootList = resolve(root, bases);

                // Snapshot the top-level nodes: detaching a node modifies
                // the document's live content list.
                Object[] topLevelNodes = doc.getContent().toArray();
                List resultList = new LinkedList();
                for (int i = 0; i < topLevelNodes.length; i++) {
                    Object o = topLevelNodes[i];
                    if (o == root) {
                        resultList.addAll(rootList);
                    }
                    else if (o instanceof Comment) {
                        Comment comment = (Comment) o;
                        comment.detach();
                        resultList.add(comment);
                    }
                    else if (o instanceof ProcessingInstruction) {
                        ProcessingInstruction pi = (ProcessingInstruction) o;
                        pi.detach();
                        resultList.add(pi);
                    }
                    // Anything else at document level (e.g. a document type
                    // declaration) is not part of the inclusion result.
                }
                return resultList;
            }
            finally {
                // keep the include stack balanced even if resolution fails
                bases.pop();
            }
        }
        // should this be a MalformedResourceException????
        // probably; maybe check on why JDOMException was thrown
        catch (JDOMException e) {
            XIncludeException xex = new UnavailableResourceException(
              "Unresolvable URL " + href.getValue());
            xex.setRootCause(e);
            throw xex;
        }
        catch (IOException e) {
            XIncludeException xex = new UnavailableResourceException(
              "Unresolvable URL " + href.getValue());
            xex.setRootCause(e);
            throw xex;
        }
    }

    /**
     * Turns an href into a URL, resolving it against <code>base</code>
     * when a non-empty base is available.
     *
     * @param base base URL string, or null / empty if there is none
     * @param href value of the include element's href attribute
     * @throws UnavailableResourceException if either string is not a
     *         well-formed URL
     */
    private static URL resolveHref(String base, String href)
      throws UnavailableResourceException {

        boolean hasBase = base != null && base.length() > 0;
        try {
            if (hasBase) {
                return new URL(new URL(base), href);
            }
            return new URL(href);
        }
        catch (MalformedURLException ex) {
            String message = hasBase
              ? "Unresolvable URL " + base + "/" + href
              : "Unresolvable URL " + href;
            UnavailableResourceException xex
              = new UnavailableResourceException(message);
            xex.setRootCause(ex);
            throw xex;
        }
    }

    /**
     * Builds a copy of a non-include element in which every descendant
     * include element has been replaced by the nodes it refers to.
     *
     * @param original the element to copy; it is not modified
     * @param bases    stack of the URLs of the documents currently being
     *                 included
     * @return a new, parentless element
     */
    private static Element resolveNonXIncludeElement(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException,
             UnavailableResourceException, XIncludeException {

        Element result = new Element(original.getName(), original.getNamespace());

        // Carry over namespace declarations that are not implied by the
        // element's or attributes' own names, so prefixes used in content
        // stay bound in the copy.
        Iterator namespaces = original.getAdditionalNamespaces().iterator();
        while (namespaces.hasNext()) {
            result.addNamespaceDeclaration((Namespace) namespaces.next());
        }

        Iterator attributes = original.getAttributes().iterator();
        while (attributes.hasNext()) {
            Attribute a = (Attribute) attributes.next();
            result.setAttribute((Attribute) a.clone());
        }

        // recursively process children
        List newChildren = result.getContent();  // live list
        Iterator originalChildren = original.getContent().iterator();
        while (originalChildren.hasNext()) {
            Object o = originalChildren.next();
            if (o instanceof Element) {
                Element element = (Element) o;
                if (isIncludeElement(element)) {
                    newChildren.addAll(resolveXIncludeElement(element, bases));
                }
                else {
                    newChildren.add(resolveNonXIncludeElement(element, bases));
                }
            }
            else if (o instanceof Text) {
                newChildren.add(((Text) o).clone());
            }
            else if (o instanceof Comment) {
                newChildren.add(((Comment) o).clone());
            }
            else if (o instanceof EntityRef) {
                newChildren.add(((EntityRef) o).clone());
            }
            else if (o instanceof ProcessingInstruction) {
                newChildren.add(((ProcessingInstruction) o).clone());
            }
            else {
                throw new XIncludeException("Unexpected Type " + o.getClass());
            }
        } // end while

        return result;
    }

    /**
     * <p>
     * Reads the document at a specified URL and returns its contents
     * as a <code>Text</code> node. It's used to include files with
     * <code>parse="text"</code>. The characters are stored unchanged;
     * no markup escaping is applied here.
     * </p>
     *
     * <p>
     * The character encoding is chosen in this order: the
     * <code>charset</code> parameter of the Content-Type reported for the
     * resource; for XML media types, the encoding detected from the
     * stream itself; the <code>encoding</code> argument; UTF-8.
     * </p>
     *
     * @param source   <code>URL</code> of the document that will be read
     * @param encoding Encoding of the document; e.g. UTF-8,
     *                 ISO-8859-1, etc. May be null or empty.
     * @return the document retrieved from the source <code>URL</code>
     * @throws UnavailableResourceException if the source document cannot be
     *         located, cannot be read, or uses an unsupported encoding.
     */
    public static Text downloadTextDocument(URL source, String encoding)
      throws UnavailableResourceException {

        if (encoding == null || encoding.equals("")) {
            encoding = "UTF-8";
        }

        InputStream in = null;
        try {
            URLConnection uc = source.openConnection();
            String contentType = uc.getContentType();
            in = new BufferedInputStream(uc.getInputStream());

            // Content-Encoding names a transfer coding such as gzip, not a
            // character set, so the charset is taken from Content-Type.
            String charset = charsetFromContentType(contentType);
            if (charset != null) {
                encoding = charset;
            }
            else if (isXmlMediaType(contentType)) {
                // What if file does not have a MIME type but name ends in .xml????
                String detected = EncodingHeuristics.readEncodingFromStream(in);
                // NOTE(review): assumes the helper leaves the stream at its
                // start and may return null when nothing is detected - confirm.
                if (detected != null) {
                    encoding = detected;
                }
            }

            // Decode through the reader so the chosen encoding is applied.
            InputStreamReader reader = new InputStreamReader(in, encoding);
            StringBuffer s = new StringBuffer();
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                s.append(buffer, 0, count);
            }
            return new Text(s.toString());
        }
        catch (UnsupportedEncodingException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Encoding " + encoding + " not recognized for included document: "
              + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        catch (IOException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Document not found: " + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // the content has already been read or the read failed;
                    // a failure to close adds nothing useful
                }
            }
        }
    }

    /**
     * Extracts the value of the <code>charset</code> parameter from a
     * Content-Type header value, or returns null if there is none.
     */
    private static String charsetFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        String key = "charset=";
        int last = contentType.length() - key.length();
        for (int i = 0; i <= last; i++) {
            // parameter names are case-insensitive
            if (contentType.regionMatches(true, i, key, 0, key.length())) {
                int start = i + key.length();
                int end = contentType.indexOf(';', start);
                if (end < 0) {
                    end = contentType.length();
                }
                String value = contentType.substring(start, end).trim();
                if (value.length() >= 2
                  && value.startsWith("\"") && value.endsWith("\"")) {
                    value = value.substring(1, value.length() - 1);
                }
                return value.length() == 0 ? null : value;
            }
        }
        return null;
    }

    /**
     * Tests whether a Content-Type header value names an XML media type:
     * text/xml, application/xml, or any text/ or application/ type
     * ending in +xml. Parameters after the media type are ignored.
     */
    private static boolean isXmlMediaType(String contentType) {
        if (contentType == null) {
            return false;
        }
        int semicolon = contentType.indexOf(';');
        String mediaType = semicolon < 0
          ? contentType : contentType.substring(0, semicolon);
        // MIME types are case-insensitive; fold case independently of
        // the default locale
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);
        if (mediaType.equals("text/xml") || mediaType.equals("application/xml")) {
            return true;
        }
        return (mediaType.startsWith("text/") || mediaType.startsWith("application/"))
          && mediaType.endsWith("+xml");
    }

    /**
     * <p>
     * The driver method for the XIncluder program.
     * I'll probably move this to a separate class soon.
     * </p>
     *
     * @param args <code>args[0]</code> contains the URL or file name
     *             of the first document to be processed;
     *             <code>args[1]</code> contains the URL or file name
     *             of the second document to be processed, etc.
     */
    public static void main(String[] args) {

        SAXBuilder parser = new SAXBuilder();
        XMLOutputter outputter = new XMLOutputter();
        for (int i = 0; i < args.length; i++) {
            try {
                Document input = parser.build(args[i]);

                // absolutize URL: anything that is not already a
                // well-formed URL (including paths that merely contain
                // a colon) is treated as a file name
                String base = args[i];
                try {
                    new URL(base);
                }
                catch (MalformedURLException notAUrl) {
                    File f = new File(base);
                    base = f.toURL().toExternalForm();
                }

                Document output = resolve(input, base);
                // need to set encoding on this to Latin-1 and check what
                // happens to UTF-8 curly quotes
                outputter.output(output, System.out);
            }
            catch (Exception e) {
                System.err.println(e);
                e.printStackTrace();
            }
        }
    }

}