/*
* This file is part of muCommander, http://www.mucommander.com
* Copyright (C) 2002-2016 Maxence Bernard
*
* muCommander is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* muCommander is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.mucommander.commons.io.bom;
import java.io.IOException;
import java.io.InputStream;
/**
* <code>BOMInputStream</code> is an <code>InputStream</code> which provides support for Byte-Order Marks (BOM).
* A BOM is a byte sequence found at the beginning of a Unicode text stream which indicates the encoding of the text
* that follows.
*
* <p>
* This class serves a dual purpose:<br>
* 1) it detects a BOM in the underlying stream and determines the encoding used by the stream:
* the {@link BOM} instance returned by {@link #getBOM()} provides that information.<br>
* 2) it discards the BOM from a Unicode stream: the leading bytes corresponding to the BOM are swallowed by
* the stream and never returned by the <code>read</code> methods.
* </p>
*
*<p>
* The following BOMs are supported by this class:
* <ul>
* <li>{@link #UTF8_BOM UTF-8}</li>
* <li>{@link #UTF16_BE_BOM UTF-16 Big Endian}</li>
* <li>{@link #UTF16_LE_BOM UTF-16 Little Endian}</li>
* <li>{@link #UTF32_BE_BOM UTF-32 Big Endian}</li>
* <li>{@link #UTF32_LE_BOM UTF-32 Little Endian}</li>
* </ul>
* Note that UTF-32 encodings (both Little and Big Endian) are usually <b>not</b> supported by Java runtimes
* out of the box.
* </p>
*
* @see BOMReader
* @author Maxence Bernard
*/
public class BOMInputStream extends InputStream implements BOMConstants {

    /** Contains the max signature length of supported BOMs */
    private static final int MAX_BOM_LENGTH;

    static {
        // Calculates MAX_BOM_LENGTH; starting from 0 keeps this safe even if no BOM is registered
        int maxLen = 0;
        for (BOM supportedBom : SUPPORTED_BOMS) {
            maxLen = Math.max(maxLen, supportedBom.getSignature().length);
        }
        MAX_BOM_LENGTH = maxLen;
    }

    /** The underlying InputStream that feeds bytes to this stream */
    private final InputStream in;

    /** Contains the BOM that was detected in the stream, null if none was found */
    private final BOM bom;

    /**
     * Bytes that were read ahead from the underlying stream while searching for a BOM. Never null; the part
     * located before {@link #leadingBytesOff} (the BOM signature, if any) is never returned by the read methods.
     */
    private final byte[] leadingBytes;

    /** Offset of the next byte of {@link #leadingBytes} to be returned by the read methods */
    private int leadingBytesOff;

    /** Lazily-allocated single-byte buffer used by {@link #read()} */
    private byte[] oneByteBuf;

    /**
     * Creates a new <code>BOMInputStream</code> and looks for a BOM at the beginning of the stream.
     * Up to {@link #MAX_BOM_LENGTH} bytes are read from the given stream right away. If several supported
     * signatures match, the longest one wins.
     *
     * @param in the underlying stream
     * @throws IOException if an error occurred while reading the given InputStream
     */
    public BOMInputStream(InputStream in) throws IOException {
        this.in = in;

        byte[] head = readLeadingBytes(in, MAX_BOM_LENGTH);

        // Looks for the best (longest) signature match: a short signature may be a prefix of a longer one,
        // so the first match cannot simply be taken.
        BOM bestMatch = null;
        int bestMatchLength = 0;
        for (BOM candidate : SUPPORTED_BOMS) {
            byte[] signature = candidate.getSignature();
            if (signature.length > bestMatchLength && startsWith(head, signature)) {
                bestMatch = candidate;
                bestMatchLength = signature.length;
            }
        }

        this.bom = bestMatch;
        // Keep the bytes that do not correspond to a BOM to have the read methods return them:
        // the offset starts right after the signature (0 if no BOM was found).
        this.leadingBytes = head;
        this.leadingBytesOff = bestMatchLength;
    }

    /**
     * Reads up to <code>maxLen</code> bytes from the given stream, stopping early only if the end of the stream
     * is reached.
     *
     * @param in the stream to read from
     * @param maxLen maximum number of bytes to read
     * @return an array containing exactly the bytes that were read; shorter than <code>maxLen</code> if the
     * stream ended first
     * @throws IOException if an error occurred while reading the given InputStream
     */
    private static byte[] readLeadingBytes(InputStream in, int maxLen) throws IOException {
        byte[] buffer = new byte[maxLen];
        int totalRead = 0;
        while (totalRead < maxLen) {
            int nbRead = in.read(buffer, totalRead, maxLen - totalRead);
            if (nbRead == -1) {
                break;
            }
            totalRead += nbRead;
        }

        if (totalRead == maxLen) {
            return buffer;
        }

        // Truncate the byte array if the stream ended before maxLen
        byte[] truncated = new byte[totalRead];
        System.arraycopy(buffer, 0, truncated, 0, totalRead);
        return truncated;
    }

    /**
     * Returns <code>true</code> if the first byte sequence starts with the second byte sequence.
     *
     * @param b1 first byte array to test
     * @param b2 second byte array to test
     * @return true if the first byte sequence starts with the second byte sequence.
     */
    private static boolean startsWith(byte[] b1, byte[] b2) {
        if (b1.length < b2.length) {
            return false;
        }

        for (int i = 0; i < b2.length; i++) {
            if (b2[i] != b1[i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the {@link BOM} that was found at the beginning of the stream if there was one,
     * <code>null</code> otherwise.
     *
     * @return the BOM that was found at the beginning of the stream
     */
    public BOM getBOM() {
        return bom;
    }

    ////////////////////////////////
    // InputStream implementation //
    ////////////////////////////////

    /**
     * Reads the next byte of data, skipping the BOM if there was one.
     *
     * @return the next byte as an <code>int</code> in the range 0 to 255, or -1 if the end of the stream
     * has been reached
     * @throws IOException if an error occurred while reading the underlying stream
     */
    @Override
    public int read() throws IOException {
        if (oneByteBuf == null) {
            oneByteBuf = new byte[1];
        }

        int ret = read(oneByteBuf, 0, 1);
        // Mask the byte so that values >= 0x80 are returned as 128..255 rather than as negative numbers;
        // without the mask, 0xFF would be returned as -1 and be mistaken for the end of the stream.
        return ret == -1 ? -1 : (oneByteBuf[0] & 0xFF);
    }

    @Override
    public int read(byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to <code>len</code> bytes into the given array. Bytes that were read ahead by the constructor
     * and are not part of a BOM are returned first; once they are exhausted, calls are delegated to the
     * underlying stream. A single call never mixes the two sources, so it may return fewer bytes than
     * requested.
     *
     * @param b the buffer into which the data is read
     * @param off the start offset in <code>b</code>
     * @param len the maximum number of bytes to read
     * @return the number of bytes read, or -1 if the end of the stream has been reached
     * @throws IOException if an error occurred while reading the underlying stream
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int remaining = leadingBytes.length - leadingBytesOff;
        if (remaining <= 0) {
            return in.read(b, off, len);
        }

        int nbBytes = Math.min(remaining, len);
        System.arraycopy(leadingBytes, leadingBytesOff, b, off, nbBytes);
        leadingBytesOff += nbBytes;

        return nbBytes;
    }

    /**
     * Closes the underlying stream.
     *
     * @throws IOException if an error occurred while closing the underlying stream
     */
    @Override
    public void close() throws IOException {
        in.close();
    }
}