// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: EncodingDetector.java,v 1.6 2006/01/08 05:00:10 kyank Exp $
//

package com.salas.bb.utils.xml;

import java.io.InputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.ByteArrayInputStream;

/**
 * Encoding detector. Inspects the first bytes of a stream (byte order marks, well-known
 * byte patterns and the XML declaration) to choose the name of the character encoding.
 */
public final class EncodingDetector
{
    /** Size of the push-back buffer and of the look-ahead used to parse the XML declaration. */
    private static final int BUFFER_SIZE = 256;

    /** Number of leading bytes examined as a signature. */
    private static final int SIGNATURE_SIZE = 4;

    /**
     * Hidden utility class constructor.
     */
    private EncodingDetector()
    {
    }

    /**
     * Detect encoding of the stream. The detector reads data from the stream, so
     * the stream will not be at the same position at which it was before the call. You need
     * to use the stream returned in the result structure: it replays every byte that was
     * inspected, except for a recognized UTF-8 or UTF-16 byte order mark, which is dropped.
     *
     * @param is input stream.
     *
     * @return result structure with encoding and stream to use. The encoding is "UTF-8"
     *         when nothing more specific is recognized.
     *
     * @throws IOException in case of errors.
     */
    public static DetectionResult detectEncoding(InputStream is)
        throws IOException
    {
        String encoding = "UTF-8";
        byte[] buf = new byte[SIGNATURE_SIZE];
        PushbackInputStream pb = new PushbackInputStream(is, BUFFER_SIZE);

        // A single read() may legally deliver fewer bytes than requested even though more
        // are coming, so keep reading until the signature is complete or the stream ends.
        int len = fillBuffer(pb, buf);

        // Number of bytes from the tail of buf to give back to the stream.
        int unread = len;

        if (len == SIGNATURE_SIZE)
        {
            // Convert first four bytes of the buffer into the 32-bit integer
            // and compare to all known signatures. The comments within the if-blocks
            // show what the signature means.
            int nmb = (buf[0] & 0xff) << 24;
            nmb |= (buf[1] & 0xff) << 16;
            nmb |= (buf[2] & 0xff) << 8;
            nmb |= (buf[3] & 0xff);

            if (nmb == 0x3c3f786d)
            {
                // "<?xm" -- the start of <?xml version...?> declaration.
                // Give the bytes back first, then look for the encoding attribute.
                unread = 0;
                pb.unread(buf, 0, SIGNATURE_SIZE);
                encoding = detectEncodingByDeclaration(pb, "UTF-8");
            } else if ((nmb & 0xff00ff00) == 0)
            {
                // Bytes 0 and 2 are zero: "UTF-16 BE" text without byte order mark.
                encoding = "UnicodeBig";
            } else if ((nmb & 0x00ff00ff) == 0)
            {
                // Bytes 1 and 3 are zero: "UTF-16 LE" text without byte order mark.
                encoding = "UnicodeLittle";
            } else if (nmb == 0x4c6fa794)
            {
                // "<?xm" in EBCDIC. These bytes are document content, not a mark, so all
                // four of them are pushed back below. The declaration parser compares
                // against ASCII characters and cannot read EBCDIC text, hence the fixed name.
                encoding = "CP037";
            } else if ((nmb & 0xffffff00) == 0xefbbbf00)
            {
                // "UTF-8" byte order mark: drop the three mark bytes, keep the fourth byte.
                unread = 1;
            } else
            {
                int bom = nmb & 0xffff0000;
                if (bom == 0xfeff0000)
                {
                    // Big-endian "UTF-16" byte order mark: drop the mark. "UTF-16" decoding
                    // without a mark defaults to big-endian, so the name stays correct.
                    encoding = "UTF-16";
                    unread = 2;
                } else if (bom == 0xfffe0000)
                {
                    // Little-endian "UTF-16" byte order mark. The mark is dropped, so the
                    // byte order has to be carried by the encoding name; plain "UTF-16"
                    // would be decoded as big-endian.
                    encoding = "UnicodeLittle";
                    unread = 2;
                }
            }
        }

        // Unread only these from tail
        if (unread > 0)
        {
            pb.unread(buf, len - unread, unread);
        }

        return new DetectionResult(encoding, pb);
    }

    /**
     * Detects encoding from XML declaration header. Up to {@link #BUFFER_SIZE} bytes are
     * read and pushed back again, so the stream position is unchanged on return.
     *
     * @param pb        pushback input stream; must have room for BUFFER_SIZE pushed-back bytes.
     * @param encoding  default encoding to return.
     *
     * @return detected encoding, or the default when no non-empty encoding attribute is found.
     *
     * @throws IOException in case of errors.
     */
    static String detectEncodingByDeclaration(PushbackInputStream pb, String encoding)
        throws IOException
    {
        byte[] buffer = new byte[BUFFER_SIZE];
        int read = fillBuffer(pb, buffer);
        pb.unread(buffer, 0, read);

        if (read > 0)
        {
            String detected = readEncodingDeclaration(new ByteArrayInputStream(buffer, 0, read));
            if (detected != null)
            {
                detected = detected.trim();

                // An empty value (encoding="", an unquoted value, or a declaration cut off
                // by the look-ahead limit) is not a usable encoding name; keep the default.
                if (detected.length() > 0)
                {
                    encoding = detected;
                }
            }
        }

        return encoding;
    }

    /**
     * Makes attempt to fill the buffer from the stream. Reads repeatedly until the buffer
     * is full or the end of the stream is reached.
     *
     * @param is    stream.
     * @param buf   buffer.
     *
     * @return number of bytes read (0 if the stream was already at its end).
     *
     * @throws IOException I/O error.
     */
    private static int fillBuffer(InputStream is, byte[] buf)
        throws IOException
    {
        int pos = 0;
        while (pos < buf.length)
        {
            int read = is.read(buf, pos, buf.length - pos);
            if (read == -1)
            {
                break;
            }
            pos += read;
        }

        return pos;
    }

    /**
     * Reads encoding from declaration.
     *
     * @param is input stream.
     *
     * @return encoding or NULL if no encoding attribute found or not a valid XML
     *         declaration header found in the start of the stream.
     *
     * @throws IOException in case of errors.
     */
    static String readEncodingDeclaration(InputStream is)
        throws IOException
    {
        return !readDeclarationHeader(is) ? null : readAttributeValue(is, "encoding");
    }

    /**
     * Reads the start of the header "&lt;?xml " (the five characters followed by one
     * whitespace character).
     *
     * @param is input stream.
     *
     * @return TRUE if the header was found.
     *
     * @throws IOException in case of errors.
     */
    static boolean readDeclarationHeader(InputStream is)
        throws IOException
    {
        if (is == null)
        {
            return false;
        }

        // End of stream (-1) becomes '\uffff' after the cast, which is not whitespace.
        return is.read() == '<' && is.read() == '?'
            && is.read() == 'x' && is.read() == 'm' && is.read() == 'l'
            && Character.isWhitespace((char)is.read());
    }

    /**
     * Reads the value of a given attribute. Attributes are scanned in order; values of
     * attributes with other names are skipped.
     *
     * @param is    input stream.
     * @param name  name of attribute to read.
     *
     * @return the value or NULL if not found (end of stream or '?' reached first, or
     *         an argument is NULL). The value is empty when the matching attribute has
     *         no properly quoted value.
     *
     * @throws IOException in case of errors.
     */
    static String readAttributeValue(InputStream is, String name)
        throws IOException
    {
        if (name == null || is == null)
        {
            return null;
        }

        String value = null;
        int ch = 0;
        int length = name.length();

        while (value == null)
        {
            boolean match = false;
            boolean attrNameRead = false;

            while (!attrNameRead)
            {
                // Zero means that no character has been read yet.
                if (ch == 0)
                {
                    ch = skipWhitepace(is);
                }

                // reading attribute name
                if (ch == -1)
                {
                    return null;
                }

                int pos = 0;
                match = true;
                while (ch != -1 && ch != '=' && ch != '?' && !Character.isWhitespace((char)ch))
                {
                    match = match && pos < length && ch == name.charAt(pos++);
                    ch = is.read();
                }

                if (ch == -1 || ch == '?')
                {
                    return null;
                }

                // Whitespace is allowed between the name and '='.
                if (ch != '=')
                {
                    ch = skipWhitepace(is);
                }

                // The whole name has to be consumed, not just a prefix of it.
                match = match && pos == length;
                attrNameRead = (ch == '=');
            }

            if (match)
            {
                StringBuffer buf = new StringBuffer(10);
                ch = readAttributeValue(is, buf);
                value = buf.toString();
            } else
            {
                // Skip the value of an attribute we are not interested in.
                ch = readAttributeValue(is, (StringBuffer)null);
            }
        }

        return value;
    }

    /**
     * Reads value of attribute starting from quotes (after optional spaces).
     *
     * @param is    input stream.
     * @param buf   buffer to put value in.
     *
     * @return next char after the value quote, or -1 if stream ended, or unexpected char.
     *
     * @throws IOException in case of I/O error.
     */
    static int readAttributeValue(InputStream is, StringBuffer buf)
        throws IOException
    {
        return is == null ? -1 : readAttributeValueNoSpace(is, skipWhitepace(is), buf);
    }

    /**
     * Reads value of attribute starting right away.
     *
     * @param is    input stream.
     * @param ch    first character of value.
     * @param buf   buffer to fill with valid value or NULL for skipping.
     *
     * @return the next character after the closing quote; -1 if the stream ended before
     *         the closing quote (nothing is left appended to the buffer in that case);
     *         or <code>ch</code> itself when it is not a quote character.
     *
     * @throws IOException in case of I/O error.
     */
    static int readAttributeValueNoSpace(InputStream is, int ch, StringBuffer buf)
        throws IOException
    {
        if (ch != '\'' && ch != '"')
        {
            return ch;
        }

        int start = buf == null ? -1 : buf.length();
        int quotes = ch;

        ch = is.read();
        while (ch != -1 && ch != quotes)
        {
            if (buf != null)
            {
                buf.append((char)ch);
            }
            ch = is.read();
        }

        if (ch == quotes)
        {
            // Properly terminated value: report the character following the closing quote.
            ch = is.read();
        } else if (buf != null)
        {
            // Stream ended inside the value: discard the partial value.
            buf.delete(start, buf.length());
        }

        return ch;
    }

    /**
     * Skips all whitespace from current position.
     *
     * @param is input stream.
     *
     * @return first non-whitespace char or -1 if stream end found.
     *
     * @throws IOException in case of I/O error.
     */
    static int skipWhitepace(InputStream is)
        throws IOException
    {
        if (is == null)
        {
            return -1;
        }

        int ch;
        do
        {
            ch = is.read();
        } while (ch != -1 && Character.isWhitespace((char)ch));

        return ch;
    }

    /**
     * Result of encoding detection. Holds encoding name and stream to use for further
     * I/O operations.
     */
    public static class DetectionResult
    {
        private final String encoding;
        private final InputStream stream;

        /**
         * Creates holder.
         *
         * @param encoding  name of encoding.
         * @param stream    stream to use for further input operations.
         */
        public DetectionResult(String encoding, InputStream stream)
        {
            this.encoding = encoding;
            this.stream = stream;
        }

        /**
         * Returns detected encoding.
         *
         * @return encoding.
         */
        public String getEncoding()
        {
            return encoding;
        }

        /**
         * Returns stream to use for further I/O operations.
         *
         * @return stream to use for further I/O operations.
         */
        public InputStream getStream()
        {
            return stream;
        }
    }
}