/**
* Copyright (c) 2007-2011, JAGaToo Project Group all rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of the 'Xith3D Project Group' nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE
*/
package org.jagatoo.util.io;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
/**
 * Unicode BOM (byte order mark) model.
 * <p>
 * Every constant stores its BOM as an <code>int</code> whose least significant
 * bytes are the BOM bytes in file order (e.g. the bytes EF BB BF are stored as
 * <code>0xEFBBBF</code>), the number of bytes the BOM occupies and the matching
 * {@link Charset}, if the running JVM provides one.
 * </p>
 *
 * @author Marvin Froehlich (aka Qudus)
 */
public enum UnicodeBOM
{
    UTF_8( 0xEFBBBF, 3, "UTF-8" ),
    UTF_16_BE( 0xFEFF, 2, "UTF-16BE" ),
    UTF_16_LE( 0xFFFE, 2, "UTF-16LE" ),
    UTF_32_BE( 0x0000FEFF, 4, "UTF-32BE" ),
    // The UTF-32LE BOM is the byte sequence FF FE 00 00, i.e. the UTF-16LE BOM followed by two zero bytes.
    UTF_32_LE( 0xFFFE0000, 4, "UTF-32LE" ),
    UTF_7a( 0x2B2F7638, 4, "UTF-7" ), // seems to be unavailable
    UTF_7b( 0x2B2F7639, 4, "UTF-7" ), // seems to be unavailable
    UTF_7c( 0x2B2F762B, 4, "UTF-7" ), // seems to be unavailable
    UTF_7d( 0x2B2F762F, 4, "UTF-7" ), // seems to be unavailable
    UTF_1( 0xF7644C, 3, "UTF-1" ), // seems to be unavailable
    UTF_EBCDIC( 0xDD736673, 4, "UTF-EBCDIC" ), // seems to be unavailable
    // The constant keeps its historical (misspelled) name for compatibility; the encoding itself is called SCSU.
    SUSU( 0x0EFEFF, 3, "SCSU" ), // seems to be unavailable
    BOCU_1( 0xFBEE28, 3, "BOCU-1" ), // seems to be unavailable // optional trailing FF and length 4
    GB_18030( 0x84319533, 4, "GB18030" ),
    ;

    /** The longest BOM known to this enum, in bytes. */
    private static final int MAX_BOM_LENGTH = 4;

    private final int bom;
    private final int length;
    private final Charset charset;

    /**
     * Gets the BOM code.
     *
     * @return the BOM code.
     */
    public final int getBOM()
    {
        return ( bom );
    }

    /**
     * The theoretical byte length. Could be more for {@link #BOCU_1}.
     *
     * @return theoretical length. Could be more for {@link #BOCU_1}.
     */
    public final int getLength()
    {
        return ( length );
    }

    /**
     * The actual byte length of the given BOM.
     *
     * @param bom the 4 byte bom (first 4 bytes of the file)
     *
     * @return the actual byte length of the given BOM. This only differs from {@link #getLength()}
     *         for {@link #BOCU_1}, whose three BOM bytes may be followed by an optional FF byte.
     */
    public final int getLength( int bom )
    {
        if ( ( this == BOCU_1 ) && ( ( bom & 0xFF ) == 0xFF ) )
            return ( 4 );

        return ( getLength() );
    }

    /**
     * Gets the corresponding {@link Charset} or <code>null</code>, if not available.
     *
     * @return the corresponding {@link Charset} or <code>null</code>, if not available.
     */
    public final Charset getCharset()
    {
        return ( charset );
    }

    private UnicodeBOM( int bom, int length, String charset )
    {
        this.bom = bom;
        this.length = length;

        Charset cs = null;
        try
        {
            cs = Charset.forName( charset );
        }
        catch ( IllegalCharsetNameException ignored )
        {
            // Deliberately ignored: getCharset() reports null for charsets this JVM doesn't know.
        }
        catch ( UnsupportedCharsetException ignored )
        {
            // Deliberately ignored: getCharset() reports null for charsets this JVM doesn't provide.
        }
        this.charset = cs;
    }

    /**
     * Finds the constant with exactly the given BOM code and byte length.
     *
     * @param code the BOM code (BOM bytes in the least significant bytes)
     * @param length the BOM length in bytes
     *
     * @return the matching constant or <code>null</code>.
     */
    private static UnicodeBOM lookup( int code, int length )
    {
        for ( UnicodeBOM candidate : values() )
        {
            if ( ( candidate.length == length ) && ( candidate.bom == code ) )
                return ( candidate );
        }

        return ( null );
    }

    /**
     * Packs the first <code>count</code> bytes into an int, first byte most significant.
     */
    private static int pack( byte[] bytes, int count )
    {
        int value = 0;
        for ( int i = 0; i < count; i++ )
            value = ( value << 8 ) | ( bytes[i] & 0xFF );

        return ( value );
    }

    /**
     * Matches the first <code>count</code> bytes of a stream against all known BOMs, longest BOM first,
     * so that FF FE 00 00 is reported as UTF-32LE and not as UTF-16LE.
     */
    private static UnicodeBOM match( byte[] head, int count )
    {
        UnicodeBOM result = null;
        for ( int len = Math.min( count, MAX_BOM_LENGTH ); ( result == null ) && ( len >= 2 ); len-- )
            result = lookup( pack( head, len ), len );

        return ( result );
    }

    /**
     * Reads up to <code>limit</code> bytes into the buffer and stops early at the end of the stream.
     *
     * @return the number of bytes actually read.
     */
    private static int readHead( InputStream in, byte[] buffer, int limit ) throws IOException
    {
        int total = 0;
        while ( total < limit )
        {
            int n = in.read( buffer, total, limit - total );
            if ( n <= 0 )
                break;
            total += n;
        }

        return ( total );
    }

    /**
     * Closes the given resource and ignores any failure. Only to be used where a failing
     * close cannot lose data (input side, or cleanup after an error that is already propagating).
     */
    private static void closeQuietly( Closeable resource )
    {
        if ( resource == null )
            return;

        try
        {
            resource.close();
        }
        catch ( IOException ignored )
        {
            // best effort cleanup
        }
    }

    /**
     * <p>
     * Attempts to recognize the passed unicode BOM. If it can't be recognized, <code>null</code> is returned.
     * </p>
     *
     * <p>
     * The difference to the {@link #valueOf(int)} method is, that {@link #recognize(int)} expects
     * the first four bytes of the file, where a BOM of length 3 appears in the first three bytes
     * (and a BOM of length 2 in the first two bytes) and hence the value is shifted to the left,
     * while {@link #valueOf(int)} takes the concrete BOM code.
     * Longer BOMs take precedence over shorter ones.
     * </p>
     *
     * @param bom the 4 byte bom (first 4 bytes of the file)
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @see #valueOf(int)
     */
    public static UnicodeBOM recognize( int bom )
    {
        UnicodeBOM result = lookup( bom, 4 );
        if ( result == null )
            result = lookup( bom >>> 8, 3 );
        if ( result == null )
            result = lookup( bom >>> 16, 2 );

        return ( result );
    }

    /**
     * <p>
     * Attempts to recognize the passed unicode BOM. If it can't be recognized, <code>null</code> is returned.
     * </p>
     *
     * <p>
     * The difference to the {@link #recognize(int)} method is, that {@link #valueOf(int)} takes the concrete BOM code
     * while {@link #recognize(int)} expects the first four bytes of the file, where a BOM of length 3 appears
     * in the first three bytes and hence the value is shifted to the left by one byte.
     * </p>
     *
     * <p>
     * Note, that the code <code>0xFEFF</code> is ambiguous, since {@link #UTF_16_BE} and {@link #UTF_32_BE}
     * share the same int value. Four byte BOMs are checked first, so {@link #UTF_32_BE} is returned for it.
     * </p>
     *
     * @param bom the BOM code
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @see #recognize(int)
     */
    public static UnicodeBOM valueOf( int bom )
    {
        UnicodeBOM result = lookup( bom, 4 );
        if ( result == null )
            result = lookup( bom, 3 );
        if ( result == null )
            result = lookup( bom, 2 );

        return ( result );
    }

    /**
     * Skips the bytes, covered by a possible unicode BOM, or, if not recognized, does nothing.
     * The passed {@link InputStream} needs to support mark/reset.
     * <p>
     * Up to four bytes are inspected and the longest matching BOM wins. For {@link #BOCU_1}
     * the optional trailing FF byte is skipped, too.
     * </p>
     *
     * @param in the stream to probe. On return it is positioned right behind the BOM,
     *           or at its initial position, if no BOM was recognized.
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @throws IOException if anything went wrong.
     * @throws IllegalArgumentException if the stream doesn't support mark/reset.
     */
    public static UnicodeBOM skipBOM( InputStream in ) throws IOException
    {
        if ( !in.markSupported() )
            throw new IllegalArgumentException( "The passed InputStream doesn't support mark/reset." );

        byte[] head = new byte[ MAX_BOM_LENGTH ];
        int available;

        in.mark( MAX_BOM_LENGTH );
        try
        {
            available = readHead( in, head, head.length );
        }
        finally
        {
            // Always rewind; the exact BOM length is consumed afterwards.
            in.reset();
        }

        UnicodeBOM found = match( head, available );
        if ( found == null )
            return ( null );

        // Only a full four byte head can tell, whether BOCU-1's optional FF byte is present.
        int actualLength = ( available >= MAX_BOM_LENGTH ) ? found.getLength( pack( head, MAX_BOM_LENGTH ) ) : found.getLength();
        readHead( in, head, actualLength );

        return ( found );
    }

    /**
     * Only reads up to the first four bytes of the file and tries to recognize the unicode BOM from these data.
     *
     * @param file the file to probe
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @throws IOException if something went wrong
     */
    public static UnicodeBOM readBOM( File file ) throws IOException
    {
        FileInputStream in = new FileInputStream( file );
        try
        {
            byte[] head = new byte[ MAX_BOM_LENGTH ];
            int available = readHead( in, head, head.length );

            return ( match( head, available ) );
        }
        finally
        {
            closeQuietly( in );
        }
    }

    /**
     * Removes the BOM bytes from the input file, if and only if one was found.
     * The method uses a temp file to copy the contents, which replaces the input file afterwards.
     * <p>
     * The replacement is not atomic: the input file is deleted before the temp file is renamed.
     * The input file is only touched after the temp file has been written and closed successfully.
     * </p>
     *
     * @param file the input file
     * @param temp the temp file name
     * @param targetCharset the charset to use for the new file or <code>null</code> to keep the
     *                      charset of the recognized BOM (or the platform default, if that is unavailable)
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @throws IOException if something went wrong
     */
    public static UnicodeBOM removeBOM( File file, File temp, Charset targetCharset ) throws IOException
    {
        InputStream in = null;
        Reader reader = null;
        Writer out = null;
        UnicodeBOM bom = null;

        try
        {
            // Buffered to get mark/reset support, which skipBOM() needs to look ahead.
            in = new BufferedInputStream( new FileInputStream( file ) );

            bom = skipBOM( in );
            if ( bom == null )
                return ( null );

            Charset sourceCharset = bom.getCharset();
            if ( sourceCharset == null )
                reader = new InputStreamReader( in );
            else
                reader = new InputStreamReader( in, sourceCharset );

            if ( targetCharset == null )
                targetCharset = sourceCharset;

            if ( targetCharset == null )
                out = new BufferedWriter( new OutputStreamWriter( new FileOutputStream( temp ) ) );
            else
                out = new BufferedWriter( new OutputStreamWriter( new FileOutputStream( temp ), targetCharset ) );

            char[] buffer = new char[ 1024 ];
            int n;
            while ( ( n = reader.read( buffer, 0, buffer.length ) ) >= 0 )
            {
                if ( n > 0 )
                    out.write( buffer, 0, n );
            }

            // Close here and not only in the finally block: a failing flush must abort
            // the operation before the original file gets deleted.
            out.close();
        }
        finally
        {
            // Closing the reader also closes the wrapped stream.
            closeQuietly( ( reader != null ) ? reader : in );
            closeQuietly( out );
        }

        if ( !file.delete() )
            throw new IOException( "Could't delete the file \"" + file.getAbsolutePath() + "\"." );

        if ( !temp.renameTo( file ) )
            throw new IOException( "Could't rename the file \"" + temp.getAbsolutePath() + "\" to \"" + file.getAbsolutePath() + "\"." );

        return ( bom );
    }

    /**
     * Removes the BOM bytes from the input file, if and only if one was found.
     * The method uses a temp file to copy the contents, which replaces the input file afterwards.
     * The new file is written with the charset of the recognized BOM.
     *
     * @param file the input file
     * @param temp the temp file name
     *
     * @return the recognized {@link UnicodeBOM} or <code>null</code>.
     *
     * @throws IOException if something went wrong
     */
    public static UnicodeBOM removeBOM( File file, File temp ) throws IOException
    {
        return ( removeBOM( file, temp, null ) );
    }
}