package info.opencards.pptintegration.conversion;
import java.io.*;
import java.nio.charset.Charset;
/**
* <p><code>com.glaforge.i18n.io.SmartEncodingInputStream</code> extends an <code>InputStream</code> with a special
* constructor and a special method for dealing with text files encoded within different charsets.</p> <p/> <p>It
* surrounds a normal <code>InputStream</code> whatever it may be (<code>FileInputStream</code>...). It reads a buffer
* of a defined length. Then with this byte buffer, it uses the class <code>com.glaforge.i18n.io.CharsetToolkit</code>
* to parse this buffer and guess what the encoding is. All these steps are done within the constructor. Afterwards,
* you can call the method <code>getReader()</code> to retrieve a <code>Reader</code> created with the charset that was
* guessed while parsing the first bytes of the file. This <code>Reader</code> reads from the
* <code>com.glaforge.i18n.io.SmartEncodingInputStream</code>: bytes are served from the internal buffer first, and once
* the end of the buffer is reached, the underlying InputStream is read directly.</p> <p/> <p>Usage:</p>
* <p/>
* <pre>
* FileInputStream fis = new FileInputStream("utf-8.txt");
* com.glaforge.i18n.io.SmartEncodingInputStream smartIS = new com.glaforge.i18n.io.SmartEncodingInputStream(fis);
* Reader reader = smartIS.getReader();
* BufferedReader bufReader = new BufferedReader(reader);
*
* String line;
* while ((line = bufReader.readLine()) != null)
* {
* System.out.println(line);
* }
* </pre>
* <p/>
* Date: 23 juil. 2002
*
* @author Guillaume Laforge
*/
public class SmartEncodingInputStream extends InputStream {

    /** Detection-buffer size of 2 KB, for callers that pass an explicit buffer length. */
    public static final int BUFFER_LENGTH_2KB = 2048;

    /** Detection-buffer size of 4 KB; used by {@link #SmartEncodingInputStream(InputStream)}. */
    private static final int BUFFER_LENGTH_4KB = 4096;

    /** Detection-buffer size of 8 KB, for callers that pass an explicit buffer length. */
    public static final int BUFFER_LENGTH_8KB = 8192;

    /** The wrapped stream; read directly once the look-ahead buffer has been consumed. */
    private final InputStream is;

    /** Number of valid bytes held in {@link #buffer} (0 when the wrapped stream was already at its end). */
    private final int bufferLength;

    /** Value forwarded to <code>CharsetToolkit.setEnforce8Bit</code> during construction. */
    private final boolean enforce8Bit;

    /** Default charset forwarded to <code>CharsetToolkit</code> during construction; may be <code>null</code>. */
    private final Charset defaultCharset;

    /** Look-ahead bytes read from the wrapped stream in the constructor and used for the charset guess. */
    private final byte[] buffer;

    /** Index of the next byte of {@link #buffer} to hand out. */
    private int counter;

    /** The charset returned by <code>CharsetToolkit.guessEncoding()</code>. */
    private final Charset charset;

    /**
     * Wraps <code>is</code>, reads a first chunk of at most <code>bufferLength</code> bytes from it and asks
     * <code>CharsetToolkit</code> to guess the encoding from that chunk. The wider the buffer, the more data the
     * guess can be based on.
     *
     * <p>The look-ahead is filled with a single <code>read</code> call, so it may hold fewer bytes than requested
     * even when the stream is not exhausted. The bytes are replayed by the <code>read</code> methods, so no data
     * is lost.</p>
     *
     * @param is             the <code>InputStream</code> whose encoding is to be guessed.
     * @param bufferLength   the size, in bytes, of the look-ahead buffer used to guess the encoding.
     * @param defaultCharset the default <code>Charset</code> handed to <code>CharsetToolkit</code>; may be
     *                       <code>null</code>, in which case <code>CharsetToolkit</code> picks its own default.
     * @param enforce8Bit    forwarded to <code>CharsetToolkit.setEnforce8Bit</code>; intended to enforce the use of
     *                       the default <code>Charset</code> when US-ASCII is recognized.
     * @throws IOException if the initial read from <code>is</code> fails.
     */
    private SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset, boolean enforce8Bit) throws IOException {
        this.is = is;
        this.enforce8Bit = enforce8Bit;
        this.defaultCharset = defaultCharset;
        this.buffer = new byte[bufferLength];
        this.counter = 0;

        // read() reports -1 for an already exhausted stream; store that as "no buffered bytes".
        int bytesRead = is.read(buffer);
        this.bufferLength = Math.max(bytesRead, 0);

        CharsetToolkit charsetToolkit = new CharsetToolkit(buffer, defaultCharset);
        charsetToolkit.setEnforce8Bit(enforce8Bit);
        this.charset = charsetToolkit.guessEncoding();
    }

    /**
     * Wraps <code>is</code> using an explicit look-ahead size and an explicit default charset. 8-bit enforcement is
     * switched on.
     *
     * @param is             the <code>InputStream</code> whose encoding is to be guessed.
     * @param bufferLength   the size, in bytes, of the look-ahead buffer used to guess the encoding.
     * @param defaultCharset the default <code>Charset</code> handed to <code>CharsetToolkit</code>; may be
     *                       <code>null</code>, in which case <code>CharsetToolkit</code> picks its own default.
     * @throws IOException if the initial read from <code>is</code> fails.
     */
    public SmartEncodingInputStream(InputStream is, int bufferLength, Charset defaultCharset) throws IOException {
        this(is, bufferLength, defaultCharset, true);
    }

    /**
     * Wraps <code>is</code> using an explicit look-ahead size. No default charset is specified (<code>null</code> is
     * passed on) and 8-bit enforcement is switched on.
     *
     * @param is           the <code>InputStream</code> whose encoding is to be guessed.
     * @param bufferLength the size, in bytes, of the look-ahead buffer used to guess the encoding.
     * @throws IOException if the initial read from <code>is</code> fails.
     */
    public SmartEncodingInputStream(InputStream is, int bufferLength) throws IOException {
        this(is, bufferLength, null, true);
    }

    /**
     * Wraps <code>is</code> with a 4 KB look-ahead buffer. No default charset is specified (<code>null</code> is
     * passed on) and 8-bit enforcement is switched on.
     *
     * @param is the <code>InputStream</code> whose encoding is to be guessed.
     * @throws IOException if the initial read from <code>is</code> fails.
     */
    public SmartEncodingInputStream(InputStream is) throws IOException {
        this(is, SmartEncodingInputStream.BUFFER_LENGTH_4KB, null, true);
    }

    /**
     * Reads the next byte. Bytes captured in the look-ahead buffer are replayed first; after that the wrapped
     * stream is read directly.
     *
     * @return the next byte as a value in the range 0-255, or <code>-1</code> if the end of the stream has been
     *         reached.
     * @throws IOException if reading from the wrapped stream fails.
     */
    @Override
    public int read() throws IOException {
        if (counter < bufferLength) {
            // Mask to an unsigned value: a raw (signed) byte >= 0x80 would be negative, and 0xFF would be
            // indistinguishable from the -1 end-of-stream marker.
            return buffer[counter++] & 0xFF;
        }
        return is.read();
    }

    /**
     * Reads up to <code>len</code> bytes into <code>b</code>. While look-ahead bytes remain, they are copied out
     * (possibly fewer than <code>len</code>); afterwards the call is delegated to the wrapped stream.
     *
     * @param b   the destination array.
     * @param off the offset in <code>b</code> at which to start storing bytes.
     * @param len the maximum number of bytes to read.
     * @return the number of bytes stored, or <code>-1</code> if the end of the stream has been reached.
     * @throws IOException               if reading from the wrapped stream fails.
     * @throws NullPointerException      if <code>b</code> is <code>null</code>.
     * @throws IndexOutOfBoundsException if <code>off</code> or <code>len</code> do not describe a valid range of
     *                                   <code>b</code>.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException("b");
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + ", len=" + len + ", b.length=" + b.length);
        }
        if (len == 0) {
            return 0;
        }
        int buffered = bufferLength - counter;
        if (buffered > 0) {
            int n = Math.min(len, buffered);
            System.arraycopy(buffer, counter, b, off, n);
            counter += n;
            return n;
        }
        return is.read(b, off, len);
    }

    /**
     * Closes the wrapped stream, so that closing this stream (or a <code>Reader</code> obtained from
     * {@link #getReader()}) releases the underlying resource.
     *
     * @throws IOException if closing the wrapped stream fails.
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Creates a <code>Reader</code> over this stream that decodes with the guessed <code>Charset</code>.
     *
     * @return a new <code>InputStreamReader</code> bound to this stream and the guessed charset.
     */
    Reader getReader() {
        return new InputStreamReader(this, this.charset);
    }

    /**
     * Retrieves the <code>Charset</code> that was guessed from the look-ahead buffer.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getEncoding() {
        return this.charset;
    }

    /**
     * Small manual demo: guesses the charset of the file <code>us-ascii.txt</code> in the working directory,
     * reports it on standard error and echoes the decoded lines to standard output.
     *
     * @param args ignored.
     * @throws IOException if the file cannot be opened or read.
     */
    public static void main(String[] args) throws IOException {
        // Other sample inputs used with this demo: "windows-1252.txt", "utf-8.txt".
        FileInputStream fis = new FileInputStream("us-ascii.txt");
        try {
            SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis);
            System.err.println("The charset of this input stream is: " + smartIS.getEncoding().displayName());
            Reader reader = smartIS.getReader();
            BufferedReader bufReader = new BufferedReader(reader);
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            // Always release the file handle, even if decoding fails part-way through.
            fis.close();
        }
    }
}