/* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.riotfamily.common.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
/**
* Reader that uses a Byte Order Mark (BOM) to identify the encoding of the
* underlying stream. If present, the BOM is removed from the stream.
*
* @see <a href="http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058">JDK Bug 4508058</a>
* @see <a href="http://www.unicode.org/faq/utf_bom.html#BOM">Byte Order Mark FAQ</a>
* @author Felix Gnass [fgnass at neteye dot de]
* @since 7.0
*/
public class UnicodeStreamReader extends Reader {

    /** Length in bytes of the longest supported BOM (the UTF-32 marks). */
    private static final int MAX_BOM_SIZE = 4;

    /**
     * The supported byte order marks. The order is significant: the UTF-32LE
     * mark starts with the same two bytes as the UTF-16LE mark, so the longer
     * signatures have to be tested before the shorter ones.
     */
    private static final BOM[] BOMS = new BOM[] {
        new BOM("UTF-32LE", new byte[] { (byte) 0xff, (byte) 0xfe, (byte) 0x00, (byte) 0x00 }),
        new BOM("UTF-32BE", new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xfe, (byte) 0xff }),
        new BOM("UTF-8", new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf }),
        new BOM("UTF-16LE", new byte[] { (byte) 0xff, (byte) 0xfe }),
        new BOM("UTF-16BE", new byte[] { (byte) 0xfe, (byte) 0xff }),
    };

    /** The reader that performs the actual decoding; all calls are delegated to it. */
    private final InputStreamReader reader;

    /**
     * Creates a new UnicodeStreamReader. If no BOM is found in the stream
     * the default system encoding is used.
     *
     * @param in An InputStream
     * @throws IOException If an I/O error occurs
     */
    public UnicodeStreamReader(InputStream in) throws IOException {
        this(in, null);
    }

    /**
     * Creates a new UnicodeStreamReader. If no BOM is found in the stream
     * the given default encoding is used.
     *
     * @param in An InputStream
     * @param defaultEncoding The encoding to be used if no BOM is found,
     *        or <code>null</code> to fall back to the default system encoding
     * @throws IOException If an I/O error occurs or the encoding is not supported
     */
    public UnicodeStreamReader(InputStream in, String defaultEncoding)
            throws IOException {

        PushbackInputStream pushbackStream = new PushbackInputStream(in, MAX_BOM_SIZE);
        byte[] buffer = new byte[MAX_BOM_SIZE];
        int available = readFully(pushbackStream, buffer);

        BOM bom = findByteOrderMark(buffer, available);
        int bomLength = bom != null ? bom.length() : 0;

        // Everything that was read past the BOM is content and must be
        // handed back to the stream so that the decoder gets to see it.
        if (available > bomLength) {
            pushbackStream.unread(buffer, bomLength, available - bomLength);
        }

        String encoding = bom != null ? bom.encoding : defaultEncoding;

        // InputStreamReader rejects a null charset name with a
        // NullPointerException, so the platform default has to be selected
        // through the single-argument constructor.
        if (encoding != null) {
            reader = new InputStreamReader(pushbackStream, encoding);
        }
        else {
            reader = new InputStreamReader(pushbackStream);
        }
    }

    /**
     * Returns the name of the encoding used by the underlying
     * InputStreamReader, as reported by {@link InputStreamReader#getEncoding()}.
     */
    public String getEncoding() {
        return reader.getEncoding();
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException If an I/O error occurs
     */
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Reads characters into a portion of an array by delegating to the
     * underlying reader.
     *
     * @param cbuf Destination buffer
     * @param off Offset at which to start storing characters
     * @param len Maximum number of characters to read
     * @return The number of characters read, or -1 at the end of the stream
     * @throws IOException If an I/O error occurs
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        return reader.read(cbuf, off, len);
    }

    /**
     * Fills the given buffer from the stream. A single read() call may return
     * fewer bytes than requested even though more data follows, so reading is
     * repeated until the buffer is full or the end of the stream is reached.
     *
     * @return The number of bytes actually stored in the buffer (possibly 0)
     */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = in.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Returns the first known BOM the buffer starts with.
     *
     * @param bytes The buffer holding the first bytes of the stream
     * @param available The number of valid bytes in the buffer
     * @return The matching BOM or <code>null</code> if none matches
     */
    private static BOM findByteOrderMark(byte[] bytes, int available) {
        for (int i = 0; i < BOMS.length; i++) {
            if (BOMS[i].matches(bytes, available)) {
                return BOMS[i];
            }
        }
        return null;
    }

    /**
     * A byte order mark signature together with the encoding it denotes.
     */
    private static class BOM {

        private final String encoding;

        private final byte[] bytes;

        public BOM(String encoding, byte[] bytes) {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /** Returns the number of bytes the mark consists of. */
        public int length() {
            return bytes.length;
        }

        /**
         * Checks whether the buffer starts with this mark.
         *
         * @param buffer The buffer holding the first bytes of the stream
         * @param available The number of valid bytes in the buffer; bytes
         *        beyond that index are padding and must not be compared
         */
        public boolean matches(byte[] buffer, int available) {
            if (available < bytes.length) {
                return false;
            }
            for (int i = 0; i < bytes.length; i++) {
                if (bytes[i] != buffer[i]) {
                    return false;
                }
            }
            return true;
        }
    }
}