package com.xiaoleilu.hutool.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

import com.xiaoleilu.hutool.util.CharsetUtil;

/**
 * Input stream wrapper that detects and strips a leading Unicode byte order mark (BOM).<br>
 * Calling {@link #getCharset()} inspects the first bytes of the stream, returns the charset
 * indicated by the BOM (or the default charset when there is none) and consumes the BOM bytes.<br>
 * BOM definition: http://www.unicode.org/unicode/faq/utf_bom.html<br>
 * <ul>
 * <li>00 00 FE FF = UTF-32, big-endian</li>
 * <li>FF FE 00 00 = UTF-32, little-endian</li>
 * <li>EF BB BF = UTF-8</li>
 * <li>FE FF = UTF-16, big-endian</li>
 * <li>FF FE = UTF-16, little-endian</li>
 * </ul>
 * Usage: <br>
 * <code>
 * String enc = "UTF-8"; // charset to report when no BOM is present<br>
 * FileInputStream fis = new FileInputStream(file); <br>
 * BOMInputStream uin = new BOMInputStream(fis, enc); <br>
 * enc = uin.getCharset(); // check and skip possible BOM bytes
 * </code>
 * <br><br>
 * Note: detection only happens in {@link #getCharset()}. If {@link #read()} or {@link #close()}
 * is called first, the stream is passed through untouched (a BOM, if any, is NOT skipped) and
 * {@link #getCharset()} returns {@code null} afterwards.
 * <br><br>
 * Reference: http://akini.mbnet.fi/java/unicodereader/UnicodeInputStream.java.txt
 */
public class BOMInputStream extends InputStream {

	/** Wrapped stream; its push-back buffer holds the non-BOM bytes that were read ahead. */
	PushbackInputStream in;
	/** Set once detection has run, or once the stream has been read from or closed. */
	boolean isInited = false;
	/** Charset reported when the stream does not start with a BOM. */
	String defaultCharset;
	/** Charset determined by {@link #init()}; {@code null} until detection has run. */
	String charset;

	/** Length in bytes of the longest supported BOM (UTF-32), i.e. the read-ahead size. */
	private static final int BOM_SIZE = 4;

	// ----------------------------------------------------------------- Constructor start
	/**
	 * Creates a BOM-aware stream that reports UTF-8 when no BOM is present.
	 *
	 * @param in the stream to wrap
	 */
	public BOMInputStream(InputStream in) {
		this(in, CharsetUtil.UTF_8);
	}

	/**
	 * Creates a BOM-aware stream.
	 *
	 * @param in the stream to wrap
	 * @param defaultCharset charset name reported by {@link #getCharset()} when no BOM is found
	 */
	public BOMInputStream(InputStream in, String defaultCharset) {
		// Must assign the field explicitly: the parameter shadows it, and a bare
		// "in = ..." would leave the field null.
		this.in = new PushbackInputStream(in, BOM_SIZE);
		this.defaultCharset = defaultCharset;
	}
	// ----------------------------------------------------------------- Constructor end

	/**
	 * @return the charset name that is reported when the stream has no BOM
	 */
	public String getDefaultCharset() {
		return defaultCharset;
	}

	/**
	 * Detects the charset from the BOM (on first call) and skips the BOM bytes.
	 *
	 * @return the charset name indicated by the BOM, the default charset when no BOM is present,
	 *         or {@code null} if the stream was already read from or closed before detection ran
	 * @throws IORuntimeException if reading the leading bytes fails
	 */
	public String getCharset() {
		if (!isInited) {
			try {
				init();
			} catch (IOException ex) {
				throw new IORuntimeException(ex);
			}
		}
		return charset;
	}

	/**
	 * Closes the wrapped stream. BOM detection is disabled afterwards.
	 *
	 * @throws IOException if closing the wrapped stream fails
	 */
	@Override
	public void close() throws IOException {
		isInited = true;
		in.close();
	}

	/**
	 * Reads the next byte from the wrapped stream. BOM detection is disabled afterwards, so call
	 * {@link #getCharset()} before the first read if the BOM should be stripped.
	 *
	 * @return the next byte as an int in the range 0-255, or -1 at end of stream
	 * @throws IOException if reading from the wrapped stream fails
	 */
	@Override
	public int read() throws IOException {
		isInited = true;
		return in.read();
	}

	/**
	 * Reads ahead up to four bytes and checks them for a BOM. <br>
	 * Only BOM bytes are consumed; every other byte that was read ahead is pushed back.
	 *
	 * @throws IOException if reading from or pushing back to the wrapped stream fails
	 */
	protected void init() throws IOException {
		if (isInited) {
			return;
		}

		final byte[] bom = new byte[BOM_SIZE];

		// A single read() may return fewer bytes than requested even before end of stream,
		// so keep reading until the buffer is full or the stream is exhausted.
		int n = 0;
		while (n < bom.length) {
			final int count = in.read(bom, n, bom.length - n);
			if (count <= 0) {
				break;
			}
			n += count;
		}

		// Each pattern is only compared against bytes that were actually read; otherwise the
		// zero-filled tail of the buffer would make a lone "FF FE" look like a UTF-32LE BOM.
		final int bomLength;
		if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
			charset = "UTF-32BE";
			bomLength = 4;
		} else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
			charset = "UTF-32LE";
			bomLength = 4;
		} else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
			charset = "UTF-8";
			bomLength = 3;
		} else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
			charset = "UTF-16BE";
			bomLength = 2;
		} else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
			charset = "UTF-16LE";
			bomLength = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			charset = defaultCharset;
			bomLength = 0;
		}

		// Push back everything that followed the BOM so the caller sees the payload intact.
		final int unread = n - bomLength;
		if (unread > 0) {
			in.unread(bom, bomLength, unread);
		}

		isInited = true;
	}
}