/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.parser.unicode ;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import com.google.common.base.Preconditions;
/**
* This {@code InputStream} recognizes unicode BOM and skips bytes if {@link #getEncoding()}
* method is called before any of the {@code read(...)} methods.
* <p>
* Copied from
* <a href="http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058" >Sun's bug database</a>
* <p>
* See <a href="http://www.unicode.org/unicode/faq/utf_bom.html" >Unicode BOM FAQ</a>
* <p>
* BOMs:
* <ul>
* <li> 00 00 FE FF = UTF-32, big-endian
* <li> FF FE 00 00 = UTF-32, little-endian
* <li> FE FF = UTF-16, big-endian
* <li> FF FE = UTF-16, little-endian
* <li> EF BB BF = UTF-8
* <li> Win2k Notepad: Unicode format = UTF-16LE
* </ul>
* <p>
* Usage pattern:
* <pre>
* String enc = "ISO-8859-1" ; // or NULL to use systemdefault
* FileInputStream fis = new FileInputStream( file ) ;
* UnicodeInputStream uin = new UnicodeInputStream( fis, enc ) ;
* enc = uin.getEncoding() ; // check for BOM and skip bytes
* InputStreamReader in ;
* if (enc == null) in = new InputStreamReader(uin) ;
* else in = new InputStreamReader(uin, enc) ;
* </pre>
*
* @author Thomas Weidenfeller for original pseudocode.
* @author Aki Nieminen for implementation.
* @author Laurent Caillette for minor changes.
*/
public class UnicodeInputStream extends InputStream {
final PushbackInputStream internalInputStream;
final Charset defaultEncoding ;
private boolean initialized = false ;
private Charset encoding = null ;
private static final int BOM_SIZE = 4 ;
private static final int BUFFER_SIZE = 1024 * 32 ;
@SuppressWarnings( { "IOResourceOpenedButNotSafelyClosed" } )
public UnicodeInputStream( final InputStream in, final Charset defaultEncoding ) {
final BufferedInputStream bufferedInputStream = new BufferedInputStream( in, BUFFER_SIZE ) ;
internalInputStream = new PushbackInputStream( bufferedInputStream, BOM_SIZE ) ;
this.defaultEncoding = Preconditions.checkNotNull( defaultEncoding ) ;
}
public Charset getEncoding() {
if( ! initialized ) {
try {
initialize() ;
} catch( IOException ex ) {
throw new IllegalStateException( "Initialization failed", ex ) ;
}
}
return encoding ;
}
/**
* Read-ahead four bytes and check for BOM marks. Extra
* bytes are
* unread back to the stream, only BOM bytes are skipped.
*/
protected void initialize() throws IOException {
if( initialized ) return ;
final byte[] bom = new byte[ BOM_SIZE ] ;
final int n;
final int unread ;
n = internalInputStream.read( bom, 0, bom.length ) ;
if( ( bom[ 0 ] == ( byte ) 0xEF ) && ( bom[ 1 ] == ( byte ) 0xBB ) &&
( bom[ 2 ] == ( byte ) 0xBF )
) {
encoding = Charset.forName( "UTF-8" ) ;
unread = n - 3 ;
} else if( ( bom[ 0 ] == ( byte ) 0x00 ) && ( bom[ 1 ] == ( byte ) 0x00 ) &&
( bom[ 2 ] == ( byte ) 0xFE ) && ( bom[ 3 ] == ( byte ) 0xFF )
) {
encoding = Charset.forName( "UTF-32BE" ) ;
unread = n - 4 ;
} else if( ( bom[ 0 ] == ( byte ) 0xFF ) && ( bom[ 1 ] == ( byte ) 0xFE ) &&
( bom[ 2 ] == ( byte ) 0x00 ) && ( bom[ 3 ] == ( byte ) 0x00 )
) {
encoding = Charset.forName( "UTF-32LE" ) ;
unread = n - 4 ;
} else if( ( bom[ 0 ] == ( byte ) 0xFE ) && ( bom[ 1 ] == ( byte ) 0xFF ) ) {
encoding = Charset.forName( "UTF-16BE" ) ;
unread = n - 2 ;
} else if( ( bom[ 0 ] == ( byte ) 0xFF ) && ( bom[ 1 ] == ( byte ) 0xFE ) ) {
encoding = Charset.forName( "UTF-16LE" ) ;
unread = n - 2 ;
} else {
// Unicode BOM mark not found, unread all bytes
encoding = defaultEncoding ;
unread = n ;
}
// System.out.println( "read=" + n + ", unread=" + unread ) ;
if( unread > 0 ) {
internalInputStream.unread( bom, ( n - unread ), unread ) ;
}
initialized = true ;
}
@Override
public void close() throws IOException {
initialized = true ;
internalInputStream.close() ;
}
@Override
public int read() throws IOException {
initialized = true ;
return internalInputStream.read() ;
}
}