/*
 * 09/23/2004
 *
 * UnicodeReader.java - A reader for Unicode input streams that is capable of
 * discerning which particular encoding is being used via the BOM.
 * Copyright (C) 2004 Robert Futrell
 * robert_futrell at users.sourceforge.net
 * http://fifesoft.com/rsyntaxtextarea
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package org.fife.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.Reader;

/**
 * A reader capable of identifying Unicode streams by their BOMs. This class will recognize the following encodings:
 * <ul>
 * <li>UTF-8
 * <li>UTF-16LE
 * <li>UTF-16BE
 * <li>UTF-32LE
 * <li>UTF-32BE
 * </ul>
 * If the stream is not found to be any of the above, then a default encoding is used for reading. The user can specify
 * this default encoding, or a system default will be used.
 * <p>
 *
 * For optimum performance, it is recommended that you wrap all instances of <code>UnicodeReader</code> with a
 * <code>java.io.BufferedReader</code>.
 * <p>
 *
 * This class is mostly ripped off from the workaround in the description of Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
public class UnicodeReader extends Reader {

    /**
     * The reader that does the actual decoding. It wraps the caller's stream (behind a pushback stream) once the BOM,
     * if any, has been consumed.
     */
    private InputStreamReader internalIn = null;

    /**
     * The encoding being used. We keep our own instead of using the string returned by
     * <code>java.io.InputStreamReader</code> since that class does not return user-friendly names.
     */
    private String encoding;

    /**
     * The size of the longest BOM we recognize (UTF-32), in bytes.
     */
    private static final int BOM_SIZE = 4;

    /**
     * This utility constructor is here because you will usually use a <code>UnicodeReader</code> on files.
     * <p>
     * Creates a reader using the encoding specified by the BOM in the file; if there is no recognized BOM, then a
     * system default encoding is used.
     *
     * @param file
     *            The file from which you want to read.
     * @throws IOException
     *             If an error occurs when checking for/reading the BOM.
     * @throws FileNotFoundException
     *             If the file does not exist, is a directory, or cannot be opened for reading.
     * @throws SecurityException
     *             If a security manager exists and its checkRead method denies read access to the file.
     */
    public UnicodeReader(String file) throws IOException, FileNotFoundException, SecurityException {
        this(new File(file));
    }

    /**
     * This utility constructor is here because you will usually use a <code>UnicodeReader</code> on files.
     * <p>
     * Creates a reader using the encoding specified by the BOM in the file; if there is no recognized BOM, then a
     * system default encoding is used.
     *
     * @param file
     *            The file from which you want to read.
     * @throws IOException
     *             If an error occurs when checking for/reading the BOM.
     * @throws FileNotFoundException
     *             If the file does not exist, is a directory, or cannot be opened for reading.
     * @throws SecurityException
     *             If a security manager exists and its checkRead method denies read access to the file.
     */
    public UnicodeReader(File file) throws IOException, FileNotFoundException, SecurityException {
        this(file, null);
    }

    /**
     * This utility constructor is here because you will usually use a <code>UnicodeReader</code> on files.
     * <p>
     * Creates a reader using the encoding specified by the BOM in the file; if there is no recognized BOM, then a
     * specified default encoding is used.
     * <p>
     * The file stream is opened by this constructor, so if initialization fails the stream is closed before the
     * exception propagates; otherwise the caller would have no way to release it.
     *
     * @param file
     *            The file from which you want to read.
     * @param defaultEncoding
     *            The encoding to use if no BOM is found. If this value is <code>null</code>, a system default is used.
     * @throws IOException
     *             If an error occurs when checking for/reading the BOM.
     * @throws FileNotFoundException
     *             If the file does not exist, is a directory, or cannot be opened for reading.
     * @throws SecurityException
     *             If a security manager exists and its checkRead method denies read access to the file.
     */
    public UnicodeReader(File file, String defaultEncoding)
            throws IOException, FileNotFoundException, SecurityException {
        FileInputStream fileIn = new FileInputStream(file);
        boolean initialized = false;
        try {
            init(fileIn, defaultEncoding);
            initialized = true;
        } finally {
            if (!initialized) {
                try {
                    fileIn.close();
                } catch (IOException ignored) {
                    // Deliberately ignored: the exception thrown by init() is the one the caller needs to see.
                }
            }
        }
    }

    /**
     * Creates a reader using the encoding specified by the BOM in the stream; if there is no recognized BOM, then a
     * system default encoding is used.
     *
     * @param in
     *            The input stream from which to read.
     * @throws IOException
     *             If an error occurs when checking for/reading the BOM.
     */
    public UnicodeReader(InputStream in) throws IOException {
        this(in, null);
    }

    /**
     * Creates a reader using the encoding specified by the BOM in the stream; if there is no recognized BOM, then
     * <code>defaultEncoding</code> is used.
     *
     * @param in
     *            The input stream from which to read.
     * @param defaultEncoding
     *            The encoding to use if no recognized BOM is found. If this value is <code>null</code>, a system
     *            default is used.
     * @throws IOException
     *             If an error occurs when checking for/reading the BOM.
     */
    public UnicodeReader(InputStream in, String defaultEncoding) throws IOException {
        init(in, defaultEncoding);
    }

    /**
     * Closes this reader, which in turn closes the underlying input stream.
     *
     * @throws IOException
     *             If an I/O error occurs while closing.
     */
    public void close() throws IOException {
        internalIn.close();
    }

    /**
     * Returns the encoding being used to read this input stream (i.e., the encoding of the file). If a BOM was
     * recognized, then the specific Unicode type is returned; otherwise, either the default encoding passed into the
     * constructor or the system default is returned.
     *
     * @return The encoding of the stream.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that belong to a recognized BOM are skipped; all
     * other read-ahead bytes are pushed back so that they are decoded as content.
     * <p>
     * Because a single <code>InputStream.read</code> call may return fewer bytes than requested, this method keeps
     * reading until it has four bytes or reaches the end of the stream. On a stream that delivers data slowly this may
     * block until four bytes are available.
     *
     * @param in
     *            The input stream from which to read.
     * @param defaultEncoding
     *            The encoding to use if no BOM was recognized. If this value is <code>null</code>, then a system
     *            default is used.
     * @throws IOException
     *             If an error occurs when trying to read a BOM, or if the chosen encoding is not supported.
     */
    protected void init(InputStream in, String defaultEncoding) throws IOException {
        PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);
        byte[] bom = new byte[BOM_SIZE];
        int n = readFully(tempIn, bom);

        // Order matters: the UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE), so the four-byte
        // marks must be tested before the two-byte ones.
        int bomLength;
        if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
            // Requires all four bytes to have been read: with fewer, FF FE is a (possibly empty) UTF-16LE stream.
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes.
            encoding = defaultEncoding;
            bomLength = 0;
        }

        // Push back everything that was read beyond the BOM itself.
        if (n > bomLength) {
            tempIn.unread(bom, bomLength, n - bomLength);
        }

        if (encoding == null) {
            // No BOM and no caller-supplied default: fall back to the platform default and record its name.
            internalIn = new InputStreamReader(tempIn);
            encoding = internalIn.getEncoding();
        } else {
            internalIn = new InputStreamReader(tempIn, encoding);
        }
    }

    /**
     * Fills <code>buf</code> from <code>in</code>, stopping early only at the end of the stream.
     *
     * @param in
     *            The stream to read from.
     * @param buf
     *            The buffer to fill.
     * @return The number of bytes actually stored in <code>buf</code>; <code>0</code> if the stream was already at
     *         its end.
     * @throws IOException
     *             If an I/O error occurs.
     */
    private static int readFully(InputStream in, byte[] buf) throws IOException {
        int total = 0;
        while (total < buf.length) {
            int count = in.read(buf, total, buf.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Read characters into a portion of an array. This method will block until some input is available, an I/O error
     * occurs, or the end of the stream is reached.
     *
     * @param cbuf
     *            The buffer into which to read.
     * @param off
     *            The offset at which to start storing characters.
     * @param len
     *            The maximum number of characters to read.
     *
     * @return The number of characters read, or <code>-1</code> if the end of the stream has been reached.
     * @throws IOException
     *             If an I/O error occurs.
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        return internalIn.read(cbuf, off, len);
    }

}