package info.opencards.pptintegration.conversion; import java.io.*; import java.nio.charset.Charset; /** * <p><code>com.glaforge.i18n.io.SmartEncodingInputStream</code> extends an <code>InputStream</code> with a special * constructor and a special method for dealing with text files encoded within different charsets.</p> <p/> <p>It * surrounds a normal <code>InputStream</code> whatever it may be (<code>FileInputStream</code>...). It reads a buffer * of a defined length. Then with this byte buffer, it uses the class <code>com.glaforge.i18n.io.CharsetToolkit</code> * to parse this buffer and guess what the encoding is. All this steps are done within the constructor. At this time, * you can call the method <code>getReader()</code> to retrieve a <code>Reader</code> created with the good charset, as * guessed while parsing the first bytes of the file. This <code>Reader</code> reads inside the * <code>com.glaforge.i18n.io.SmartEncodingInputStream</code>. It reads first in the internal buffer, then when we reach * the end of the buffer, the underlying InputStream is read with the default read method.</p> <p/> <p>Usage:</p> * <p/> * <pre> * FileInputStream fis = new FileInputStream("utf-8.txt"); * com.glaforge.i18n.io.SmartEncodingInputStream smartIS = new com.glaforge.i18n.io.SmartEncodingInputStream(fis); * Reader reader = smartIS.getReader(); * BufferedReader bufReader = new BufferedReader(reader); * * String line; * while ((line = bufReader.readLine()) != null) * { * System.out.println(line); * } * </pre> * <p/> * Date: 23 juil. 2002 * * @author Guillaume Laforge */ public class SmartEncodingInputStream extends InputStream { private final InputStream is; private int bufferLength; private final boolean enforce8Bit; private Charset defaultCharset; private final byte[] buffer; private int counter; private Charset charset; public static final int BUFFER_LENGTH_2KB = 2048; private static final int BUFFER_LENGTH_4KB = 4096; public static final int BUFFER_LENGTH_8KB = 8192; /** * <p>Constructor of the <code>com.glaforge.i18n.io.SmartEncodingInputStream</code> class. The wider the buffer is, * the most sure you are to have guessed the encoding of the <code>InputStream</code> you wished to get a * <code>Reader</code> from.</p> <p/> <p>It is possible to defined</p> * * @param is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the * encoding guessed from the first buffer of the file. * @param bufferLength the length of the buffer that is used to guess the encoding. * @param defaultCharset specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is * guessed. This parameter may be null, in this case the default system charset is used as * definied in the system property "file.encoding" read by the method * <code>getDefaultSystemCharset()</code> from the class <code>com.glaforge.i18n.io.CharsetToolkit</code>. * @param enforce8Bit enforce the use of the specified default <code>Charset</code> in case the encoding US-ASCII * is recognized. * @throws IOException */ private SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset, boolean enforce8Bit) throws IOException { this.is = is; this.bufferLength = bufferLength; this.enforce8Bit = enforce8Bit; this.buffer = new byte[bufferLength]; this.counter = 0; this.bufferLength = is.read(buffer); this.defaultCharset = defaultCharset; CharsetToolkit charsetToolkit = new CharsetToolkit(buffer, defaultCharset); charsetToolkit.setEnforce8Bit(enforce8Bit); this.charset = charsetToolkit.guessEncoding(); } /** * Constructor of the <code>com.glaforge.i18n.io.SmartEncodingInputStream</code>. With this constructor, the default * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system * charset will be used instead. * * @param is is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the * encoding guessed from the first buffer of the file. * @param bufferLength the length of the buffer that is used to guess the encoding. * @param defaultCharset specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is * guessed. This parameter may be null, in this case the default system charset is used as * definied in the system property "file.encoding" read by the method * <code>getDefaultSystemCharset()</code> from the class <code>com.glaforge.i18n.io.CharsetToolkit</code>. * @throws IOException */ public SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset) throws IOException { this(is, bufferLength, defaultCharset, true); } /** * Constructor of the <code>com.glaforge.i18n.io.SmartEncodingInputStream</code>. With this constructor, the default * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system * charset will be used instead. * * @param is is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the * encoding guessed from the first buffer of the file. * @param bufferLength the length of the buffer that is used to guess the encoding. * @throws IOException */ public SmartEncodingInputStream(InputStream is, int bufferLength) throws IOException { this(is, bufferLength, null, true); } /** * Constructor of the <code>com.glaforge.i18n.io.SmartEncodingInputStream</code>. With this constructor, the default * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system * charset will be used instead. The buffer length does not need to be specified either. A default buffer length of * 4 KB is used. * * @param is is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding * guessed from the first buffer of the file. * @throws IOException */ public SmartEncodingInputStream(InputStream is) throws IOException { this(is, SmartEncodingInputStream.BUFFER_LENGTH_4KB, null, true); } /** * Implements the method <code>read()</code> as defined in the <code>InputStream</code> interface. As a certain * number of bytes has already been read from the underlying <code>InputStream</code>, we first read the bytes of * this buffer, otherwise, we directly read the rest of the stream from the underlying <code>InputStream</code>. * * @return the total number of bytes read into the buffer, or <code>-1</code> is there is no more data because the * end of the stream has been reached. * @throws IOException */ public int read() throws IOException { if (counter < bufferLength) return buffer[counter++]; else return is.read(); } /** * Gets a <code>Reader</code> with the right <code>Charset</code> as guessed by reading the beginning of the * underlying <code>InputStream</code>. * * @return a <code>Reader</code> defined with the right encoding. */ Reader getReader() { return new InputStreamReader(this, this.charset); } /** * Retrieves the <code>Charset</code> as guessed from the underlying <code>InputStream</code>. * * @return the <code>Charset</code> guessed. */ public Charset getEncoding() { return this.charset; } public static void main(String[] args) throws IOException { // FileInputStream fis = new FileInputStream("windows-1252.txt"); // FileInputStream fis = new FileInputStream("utf-8.txt"); FileInputStream fis = new FileInputStream("us-ascii.txt"); SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis); System.err.println("The charset of this input stream is: " + smartIS.getEncoding().displayName()); Reader reader = smartIS.getReader(); BufferedReader bufReader = new BufferedReader(reader); String line; while ((line = bufReader.readLine()) != null) { System.out.println(line); } } }