/*-- Copyright 2001, 2002 Elliotte Rusty Harold. All rights reserved. This file is part of XIncluder, a Java class library for integrating XInclude processing with SAX, DOM, and JDOM. XIncluder is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. XIncluder is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with XIncluder; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY OTHER CONTRIBUTORS TO THIS PACKAGE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.elharo.xml.xinclude; import java.io.IOException; import java.io.InputStreamReader; import java.io.InputStream; /** *

* EncodingHeuristics reads from a stream * (which should be buffered) and attempts to guess * what the encoding of the text in the stream is. * Byte order marks are stripped from the stream. * If it fails to determine the type of the encoding, * it returns the default UTF-8. *

/**
 * <p>
 * Static utility that inspects the first bytes of a stream and guesses
 * the character encoding of the text it contains. Byte order marks are
 * consumed; when no encoding can be determined the answer is UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d9, July 4, 2002
 */
public class EncodingHeuristics {

    /** Number of leading bytes examined; also used as the mark read limit. */
    private static final int LOOKAHEAD = 1024;

    /** Encoding reported when nothing better can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    // No instances allowed
    private EncodingHeuristics() {}

    /**
     * <p>
     * Guesses the encoding of the text in a stream. The method looks, in
     * order, for a byte order mark, for the byte pattern of a leading
     * less-than sign in a multi-byte encoding, and finally for the
     * <code>encoding</code> pseudo-attribute of an XML or text declaration.
     * </p>
     *
     * <p>
     * When a byte order mark is recognized the stream is left positioned
     * immediately after it, so the mark is not seen by later readers.
     * In every other case the stream is reset to the position it had
     * when this method was invoked.
     * </p>
     *
     * @param in InputStream to read from. It must support
     *     <code>mark</code> and <code>reset</code>.
     * @return String The name of the encoding; UTF-8 if none was detected
     *     or the declaration is malformed.
     * @throws IOException if the stream cannot be reset back to where it was when
     *     the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in) throws IOException {

        // At most LOOKAHEAD bytes are read before any reset, so an encoding
        // declaration that ends beyond that point will not be found.
        in.mark(LOOKAHEAD);

        try {
            int byte1 = in.read();
            int byte2 = in.read();

            // UTF-16 byte order marks; no reset, so the mark is skipped.
            if (byte1 == 0xFE && byte2 == 0xFF) {
                return "UnicodeBig";
            }
            else if (byte1 == 0xFF && byte2 == 0xFE) {
                return "UnicodeLittle";
            }

            int byte3 = in.read();

            // UTF-8 byte order mark; no reset, so the mark is skipped.
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                return "UTF-8";
            }

            int byte4 = in.read();

            // Four-byte byte order marks; no reset, so the mark is skipped.
            // NOTE(review): the XML 1.0 encoding autodetection appendix lists
            // FF FE 00 00 as the little-endian UCS-4 signature and
            // 00 00 FF FE as an unusual (2143) octet order. FF FE 00 00 is
            // already claimed by the UTF-16 check above. Left unchanged
            // because the returned name carries no byte order either way.
            if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {
                return "UCS-4";
            }
            else if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFF && byte4 == 0xFE) {
                return "UCS-4";
            }

            // No byte order mark present. Look at how a leading less-than
            // sign would be laid out in the candidate encodings.
            if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0x00 && byte4 == '<') {
                in.reset();
                return "UCS-4";
            }
            else if (byte1 == '<' && byte2 == 0x00 && byte3 == 0x00 && byte4 == 0x00) {
                in.reset();
                return "UCS-4";
            }
            else if (byte1 == 0x00 && byte2 == '<' && byte3 == 0x00 && byte4 == '?') {
                in.reset();
                return "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00 && byte3 == '?' && byte4 == 0x00) {
                in.reset();
                return "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?' && byte3 == 'x' && byte4 == 'm') {
                // ASCII compatible, must read the encoding declaration
                String declaration = readDeclaration(in, byte1, byte2, byte3, byte4);
                String encodingName = parseEncodingName(declaration);
                in.reset();
                return encodingName != null ? encodingName : DEFAULT_ENCODING;
            }
            else if (byte1 == 0x4C && byte2 == 0x6F && byte3 == 0xA7 && byte4 == 0x94) {
                // EBCDIC compatible. Reading the declaration is not
                // implemented, so this falls through to the default below.
            }

        }
        catch (Exception ignored) {
            // Deliberate best effort: whatever goes wrong while sniffing,
            // fall back on the default rather than failing the caller.
            in.reset();
            return DEFAULT_ENCODING;
        }

        // no XML or text declaration present
        in.reset();
        return DEFAULT_ENCODING;

    }

    /**
     * Reads the text of a declaration whose first four bytes have already
     * been consumed. Reading continues until the closing <code>?&gt;</code>
     * has been seen, the stream ends, or LOOKAHEAD bytes have been read in
     * total, so a stream that delivers data in small pieces is handled.
     *
     * @return the bytes before <code>?&gt;</code> decoded as ISO-8859-1, or
     *     everything that was read if no <code>?&gt;</code> was found
     */
    private static String readDeclaration(InputStream in,
      int byte1, int byte2, int byte3, int byte4) throws IOException {

        byte[] data = new byte[LOOKAHEAD];
        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;

        int length = 4;
        int end = -1;
        // Index 1 is the '?' of "<?", so the terminator cannot start before 2.
        int scanned = 2;
        while (end < 0 && length < data.length) {
            int count = in.read(data, length, data.length - length);
            if (count <= 0) {
                break;
            }
            length += count;
            for (; scanned + 1 < length; scanned++) {
                if (data[scanned] == '?' && data[scanned + 1] == '>') {
                    end = scanned;
                    break;
                }
            }
        }

        // ISO-8859-1 is ASCII compatible and every byte sequence is legal
        // in it, so decoding cannot fail whatever the real encoding is.
        return new String(data, 0, end >= 0 ? end : length, "ISO-8859-1");

    }

    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute
     * from the text of an XML or text declaration.
     *
     * @return the encoding name, or null if the text does not start with
     *     <code>&lt;?xml</code> and white space, has no encoding
     *     pseudo-attribute, or the pseudo-attribute is malformed
     */
    private static String parseEncodingName(String declaration) {

        // "<?xm" alone also matches targets such as "<?xml-stylesheet"
        if (declaration.length() < 6 || declaration.charAt(4) != 'l'
          || !isSpace(declaration.charAt(5))) {
            return null;
        }

        int keyword = declaration.indexOf("encoding");
        if (keyword < 0) {
            return null;
        }

        int position = skipSpace(declaration, keyword + 8);
        if (position >= declaration.length() || declaration.charAt(position) != '=') {
            return null;
        }

        position = skipSpace(declaration, position + 1);
        if (position >= declaration.length()) {
            return null;
        }
        char delimiter = declaration.charAt(position);
        if (delimiter != '\'' && delimiter != '"') {
            return null;
        }

        int start = position + 1;
        int close = declaration.indexOf(delimiter, start);
        if (close <= start) {
            // unterminated or empty value
            return null;
        }
        return declaration.substring(start, close);

    }

    /** Returns the index of the first non-white-space character at or after position. */
    private static int skipSpace(String text, int position) {
        while (position < text.length() && isSpace(text.charAt(position))) {
            position++;
        }
        return position;
    }

    /** Tests for the four white space characters XML allows. */
    private static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

}