package com.xiaoleilu.hutool.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import com.xiaoleilu.hutool.util.CharsetUtil;
/**
* Reads the content of a stream that may start with a BOM (byte order mark). Calling <code>getCharset()</code> returns the encoding indicated by the BOM and strips the BOM from the stream<br>
* BOM definition: http://www.unicode.org/unicode/faq/utf_bom.html<br>
* <ul>
* <li>00 00 FE FF = UTF-32, big-endian</li>
* <li>FF FE 00 00 = UTF-32, little-endian</li>
* <li>EF BB BF = UTF-8</li>
* <li>FE FF = UTF-16, big-endian</li>
* <li>FF FE = UTF-16, little-endian</li>
* </ul>
* Usage: <br>
* <code>
* String enc = "UTF-8"; // or null to use the system default<br>
* FileInputStream fis = new FileInputStream(file); <br>
* BOMInputStream uin = new BOMInputStream(fis, enc); <br>
* enc = uin.getCharset(); // check and skip possible BOM bytes
* </code>
* <br><br>
* Reference: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt
*/
public class BOMInputStream extends InputStream {

    /** Wrapped stream; its pushback buffer is large enough to return every byte read during BOM detection. */
    PushbackInputStream in;
    /** Set once BOM detection has run, or once the stream has been read from or closed. */
    boolean isInited = false;
    /** Charset name reported when no BOM is found. */
    String defaultCharset;
    /** Charset name chosen by {@link #init()}; {@code null} until detection has run. */
    String charset;

    /** Length in bytes of the longest recognised BOM (UTF-32). */
    private static final int BOM_SIZE = 4;

    // ----------------------------------------------------------------- Constructor start
    /**
     * Wraps the given stream, using {@code CharsetUtil.UTF_8} as the charset reported when no BOM is present.
     *
     * @param in the stream to wrap
     */
    public BOMInputStream(InputStream in) {
        this(in, CharsetUtil.UTF_8);
    }

    /**
     * Wraps the given stream.
     *
     * @param in the stream to wrap
     * @param defaultCharset the charset name reported by {@link #getCharset()} when no BOM is present
     */
    public BOMInputStream(InputStream in, String defaultCharset) {
        // "this." is required: the parameter shadows the field, and assigning to the
        // parameter would leave the field null.
        this.in = new PushbackInputStream(in, BOM_SIZE);
        this.defaultCharset = defaultCharset;
    }
    // ----------------------------------------------------------------- Constructor end

    /**
     * @return the charset name that is reported when the stream carries no BOM
     */
    public String getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * Detects the charset indicated by the BOM (running the detection on first call) and skips the BOM bytes.
     * <p>
     * Detection only happens if this method is called before the first {@link #read()} or {@link #close()};
     * otherwise the stream is already marked as initialised and the stored charset is returned as is
     * ({@code null} if detection never ran).
     *
     * @return the detected charset name, or the default charset if no BOM was found
     * @throws IORuntimeException if reading the leading bytes fails
     */
    public String getCharset() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                throw new IORuntimeException(ex);
            }
        }
        return charset;
    }

    /**
     * Closes the wrapped stream. Marks this stream as initialised so that no BOM detection is attempted afterwards.
     *
     * @throws IOException if closing the wrapped stream fails
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        in.close();
    }

    /**
     * Reads the next byte from the wrapped stream. Marks this stream as initialised, so a BOM is skipped only
     * when {@link #getCharset()} was called before the first read.
     *
     * @return the next byte, or {@code -1} at end of stream
     * @throws IOException if reading from the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return in.read();
    }

    /**
     * Read-ahead up to four bytes and check for BOM marks. <br>
     * Extra bytes are unread back to the stream, only BOM bytes are skipped.
     *
     * @throws IOException if reading from or pushing back to the wrapped stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        final byte[] bom = new byte[BOM_SIZE];
        // A single read may legally return fewer bytes than requested, so keep reading
        // until the buffer is full or the stream ends.
        int n = 0;
        while (n < BOM_SIZE) {
            final int count = in.read(bom, n, BOM_SIZE - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // Every comparison is bounded by the number of bytes actually read; otherwise the
        // zero-filled tail of the buffer could turn a short "FF FE" into a UTF-32LE match.
        // UTF-32LE must be tested before UTF-16LE because they share the FF FE prefix.
        final int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            charset = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            charset = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            charset = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            charset = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            charset = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            charset = defaultCharset;
            bomLength = 0;
        }

        // Push back everything that was read beyond the BOM itself.
        final int unread = n - bomLength;
        if (unread > 0) {
            in.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /**
     * Tests whether the first bytes of {@code buffer} equal {@code marker}, considering only the
     * {@code available} bytes that were actually read.
     */
    private static boolean startsWith(byte[] buffer, int available, int... marker) {
        if (available < marker.length) {
            return false;
        }
        for (int i = 0; i < marker.length; i++) {
            if (buffer[i] != (byte) marker[i]) {
                return false;
            }
        }
        return true;
    }
}