package org.openanzo.rdf.utils;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.openanzo.exceptions.AnzoException;
import org.openanzo.exceptions.ExceptionConstants;
import org.openanzo.rdf.Constants;

/*
 * Adapted from http://glaforge.free.fr/wiki/index.php?wiki=GuessEncoding
 * (originally com.glaforge.i18n.io.SmartEncodingInputStream).
 */

/**
 * <p>
 * An <code>InputStream</code> wrapper for text whose character encoding is not known in advance.
 * </p>
 *
 * <p>
 * The constructor reads a first chunk of the wrapped stream into an internal buffer and hands that buffer to {@link CharsetToolkit} to guess the encoding.
 * Afterwards {@link #getReader()} returns a <code>Reader</code> that decodes this stream with the guessed charset. Reading from this stream first replays the
 * internal buffer and, once it is exhausted, continues with the wrapped stream.
 * </p>
 *
 * <p>
 * Note: this class does not override <code>close()</code>, so closing it (or a reader obtained from it) does not close the wrapped stream; the caller remains
 * responsible for closing the stream it passed in.
 * </p>
 *
 * <p>
 * Usage:
 * </p>
 *
 * <pre>
 * FileInputStream fis = new FileInputStream("utf-8.txt");
 * SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis);
 * Reader reader = smartIS.getReader();
 * BufferedReader bufReader = new BufferedReader(reader);
 *
 * String line;
 * while ((line = bufReader.readLine()) != null) {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * Date: 23 July 2002
 *
 * @author Guillaume Laforge
 */
@SuppressWarnings("all")
public class SmartEncodingInputStream extends InputStream {

    /** Predefined detection-buffer size: 2 KB. */
    public static final int BUFFER_LENGTH_2KB = 2048;

    /** Predefined detection-buffer size: 4 KB. */
    public static final int BUFFER_LENGTH_4KB = 4096;

    /** Predefined detection-buffer size: 8 KB. */
    public static final int BUFFER_LENGTH_8KB = 8192;

    /** Detection-buffer size used by the convenience factory methods (value kept from the original code). */
    private static final int FACTORY_BUFFER_LENGTH = 4048;

    /** The wrapped stream; read once the internal buffer has been replayed. */
    private final InputStream is;

    /** Number of valid bytes in {@link #buffer}, i.e. the result of the initial read (may be <code>-1</code> for an empty stream). */
    private final int bufferLength;

    private final boolean enforce8Bit;

    private final Charset defaultCharset;

    /** The bytes read ahead from the wrapped stream for encoding detection. */
    private final byte[] buffer;

    /** Index of the next byte of {@link #buffer} to hand out from {@link #read()}. */
    private int counter;

    /** The charset guessed from the read-ahead bytes. */
    private final Charset charset;

    /**
     * Number of leading characters {@link #getReader()} skips, as reported by <code>CharsetToolkit.getStripChars()</code> (presumably a byte-order mark).
     * This is per-instance state: it used to be a static field, so creating a second stream overwrote the value used by the first.
     */
    private final long skippedChars;

    /**
     * Creates a reader over the given stream, using {@link Constants#byteEncoding} as the default charset.
     *
     * @param is
     *            the stream to decode.
     * @return a reader using the guessed charset.
     * @throws AnzoException
     *             if reading the first bytes of the stream fails.
     */
    public static Reader createSmartReader(InputStream is) throws AnzoException {
        return createSmartStream(is, FACTORY_BUFFER_LENGTH, Charset.forName(Constants.byteEncoding), true).getReader();
    }

    /**
     * Creates a reader over the given stream, using the named charset as the default charset.
     *
     * @param is
     *            the stream to decode.
     * @param charsetName
     *            name of the default charset, resolved with <code>Charset.forName</code>.
     * @return a reader using the guessed charset.
     * @throws AnzoException
     *             if reading the first bytes of the stream fails.
     */
    public static Reader createSmartReader(InputStream is, String charsetName) throws AnzoException {
        return createSmartStream(is, FACTORY_BUFFER_LENGTH, Charset.forName(charsetName), true).getReader();
    }

    /**
     * Wraps the given stream, using {@link Constants#byteEncoding} as the default charset.
     *
     * @param is
     *            the stream to wrap.
     * @return the encoding-detecting stream.
     * @throws AnzoException
     *             if reading the first bytes of the stream fails.
     */
    public static SmartEncodingInputStream createSmartStream(InputStream is) throws AnzoException {
        return createSmartStream(is, FACTORY_BUFFER_LENGTH, Charset.forName(Constants.byteEncoding), true);
    }

    /**
     * Wraps the given stream, using the named charset as the default charset.
     *
     * @param is
     *            the stream to wrap.
     * @param charsetName
     *            name of the default charset, resolved with <code>Charset.forName</code>.
     * @return the encoding-detecting stream.
     * @throws AnzoException
     *             if reading the first bytes of the stream fails.
     */
    public static SmartEncodingInputStream createSmartStream(InputStream is, String charsetName) throws AnzoException {
        return createSmartStream(is, FACTORY_BUFFER_LENGTH, Charset.forName(charsetName), true);
    }

    /**
     * Wraps the given stream. If <code>is</code> already is a <code>SmartEncodingInputStream</code> it is returned unchanged and the remaining arguments are
     * ignored.
     *
     * @param is
     *            the stream to wrap.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @param charset
     *            the default charset passed on to the constructor.
     * @param enforce8Bit
     *            passed on to the constructor.
     * @return the encoding-detecting stream.
     * @throws AnzoException
     *             with code <code>ExceptionConstants.IO.ENCODING_ERROR</code> if reading the first bytes of the stream fails.
     */
    public static SmartEncodingInputStream createSmartStream(InputStream is, int bufferLength, Charset charset, boolean enforce8Bit) throws AnzoException {
        if (is instanceof SmartEncodingInputStream) {
            return (SmartEncodingInputStream) is;
        }
        try {
            return new SmartEncodingInputStream(is, bufferLength, charset, enforce8Bit);
        } catch (IOException ioe) {
            throw new AnzoException(ExceptionConstants.IO.ENCODING_ERROR, ioe);
        }
    }

    /**
     * <p>
     * Creates the stream and guesses the encoding. The larger the buffer, the more reliable the guess is likely to be.
     * </p>
     *
     * <p>
     * A single <code>read</code> call on the wrapped stream fills the buffer, so fewer than <code>bufferLength</code> bytes may actually be inspected.
     * </p>
     *
     * @param is
     *            the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed from the first buffer of the file.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @param defaultCharset
     *            specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is guessed. This parameter may be null, in this case the
     *            default system charset is used as defined in the system property "file.encoding" read by the method
     *            <code>getDefaultSystemCharset()</code> of <code>CharsetToolkit</code>.
     * @param enforce8Bit
     *            enforce the use of the specified default <code>Charset</code> in case the encoding US-ASCII is recognized.
     *
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    public SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset, boolean enforce8Bit) throws IOException {
        this.is = is;
        this.enforce8Bit = enforce8Bit;
        this.defaultCharset = defaultCharset;
        this.buffer = new byte[bufferLength];
        this.counter = 0;
        // From here on "bufferLength" means the number of bytes actually read ahead, not the requested capacity.
        this.bufferLength = is.read(buffer);

        CharsetToolkit charsetToolkit = new CharsetToolkit(buffer, defaultCharset);
        charsetToolkit.setEnforce8Bit(enforce8Bit);
        this.charset = charsetToolkit.guessEncoding();
        this.skippedChars = charsetToolkit.getStripChars();
    }

    /**
     * Creates the stream and guesses the encoding, with <code>enforce8Bit</code> set to <code>true</code>.
     *
     * @param is
     *            is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed from the first buffer of the file.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @param defaultCharset
     *            specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is guessed. This parameter may be null, in this case the
     *            default system charset is used as defined in the system property "file.encoding" read by the method
     *            <code>getDefaultSystemCharset()</code> of <code>CharsetToolkit</code>.
     *
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    public SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset) throws IOException {
        this(is, bufferLength, defaultCharset, true);
    }

    /**
     * Creates the stream and guesses the encoding. With this constructor, the default <code>Charset</code> used when an 8-bit encoding is guessed does not
     * need to be specified. The default system charset will be used instead.
     *
     * @param is
     *            is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed from the first buffer of the file.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     *
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    public SmartEncodingInputStream(InputStream is, int bufferLength) throws IOException {
        this(is, bufferLength, null, true);
    }

    /**
     * Creates the stream and guesses the encoding. With this constructor, the default <code>Charset</code> used when an 8-bit encoding is guessed does not
     * need to be specified. The default system charset will be used instead. The buffer length does not need to be specified either. A default buffer length
     * of 4 KB is used.
     *
     * @param is
     *            is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed from the first buffer of the file.
     *
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    public SmartEncodingInputStream(InputStream is) throws IOException {
        this(is, SmartEncodingInputStream.BUFFER_LENGTH_4KB, null, true);
    }

    /**
     * Reads the next byte. Bytes that were read ahead for encoding detection are returned first; after that the call is forwarded to the wrapped stream.
     *
     * @return the next byte as an <code>int</code> in the range 0 to 255, or <code>-1</code> if the end of the stream has been reached.
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    @Override
    public int read() throws IOException {
        if (counter < bufferLength) {
            // Mask to an unsigned value: a raw (signed) byte would turn 0xFF into -1, which callers
            // interpret as end of stream, and would yield negative values for 0x80-0xFE.
            return buffer[counter++] & 0xFF;
        }
        return is.read();
    }

    /**
     * Gets a <code>Reader</code> with the <code>Charset</code> guessed by reading the beginning of the underlying <code>InputStream</code>. The leading
     * characters reported by the charset detection are skipped before the reader is returned.
     *
     * <p>
     * Every call creates a new reader over this same stream and performs the skip again, so callers should normally obtain the reader only once.
     * </p>
     *
     * @return a <code>Reader</code> defined with the guessed encoding.
     * @throws RuntimeException
     *             wrapping the <code>IOException</code> if skipping the leading characters fails.
     */
    public Reader getReader() {
        Reader reader = new InputStreamReader(this, this.charset);
        if (this.skippedChars > 0) {
            try {
                reader.skip(this.skippedChars);
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
        return reader;
    }

    /**
     * Retrieves the <code>Charset</code> as guessed from the underlying <code>InputStream</code>.
     *
     * @return the <code>Charset</code> guessed.
     */
    public Charset getEncoding() {
        return this.charset;
    }

    /**
     * Small manual demo: prints the guessed charset of a sample file to standard error and its lines to standard output.
     *
     * @param args
     *            ignored.
     * @throws IOException
     *             if the sample file cannot be opened or read.
     */
    public static void main(String[] args) throws IOException {
        // Other sample inputs to try: "windows-1252.txt", "utf-8.txt".
        FileInputStream fis = new FileInputStream("us-ascii.txt");
        try {
            SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis);
            System.err.println("The charset of this input stream is: " + smartIS.getEncoding().displayName());

            Reader reader = smartIS.getReader();
            BufferedReader bufReader = new BufferedReader(reader);

            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            // The wrapper does not propagate close(), so the file stream is closed directly.
            fis.close();
        }
    }
}