/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.core.fileinput;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collection;
import org.apache.commons.vfs2.FileObject;
/**
* <p>
* Utility class to guess the encoding of a given byte array. The guess is unfortunately not 100% sure. Especially for
* 8-bit charsets. It's not possible to know which 8-bit charset is used. Except through statistical analysis. We will
* then infer that the charset encountered is the same as the default standard charset.
* </p>
*
* <p>
* On the other hand, Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files with a Byte Order Marker
* are easy to recognize. For UTF-8 files with no BOM, the encoding can usually be guessed if the buffer is wide enough.
* </p>
*
* <p>
* Tested against a complicated UTF-8 file, Sun's implementation does not render bad UTF-8 constructs as expected by the
* specification. But with a buffer wide enough, the method guessEncoding() did behave correctly and recognized the
* UTF-8 charset.
* </p>
*
* <p>
* A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
* </p>
*
* <p>
* Usage:
* </p>
*
* <pre>
* // guess the encoding
* Charset guessedCharset = CharsetToolkit.guessEncoding( file, 4096 );
*
* // create a reader with the charset we've just discovered
* FileInputStream fis = new FileInputStream( file );
* InputStreamReader isr = new InputStreamReader( fis, guessedCharset );
* BufferedReader br = new BufferedReader( isr );
* </pre>
*
* <p>Date: 18 July 2002</p>
* @author Guillaume LAFORGE
*/
public class CharsetToolkit {

  /** Number of leading bytes sampled by the <code>guessEncodingName</code> convenience methods. */
  private static final int DEFAULT_BUFFER_LENGTH = 4096;

  private static final Charset UTF_8 = Charset.forName( "UTF-8" );
  private static final Charset UTF_16LE = Charset.forName( "UTF-16LE" );
  private static final Charset UTF_16BE = Charset.forName( "UTF-16BE" );
  private static final Charset US_ASCII = Charset.forName( "US-ASCII" );

  /** The bytes to analyse; never modified by this class. */
  private final byte[] buffer;

  /** Charset returned when the buffer looks like an 8-bit encoding (or like US-ASCII with enforce8Bit set). */
  private Charset defaultCharset;

  /** When true, {@link #guessEncoding()} returns the default charset instead of US-ASCII. */
  private boolean enforce8Bit = false;

  /**
   * Creates a toolkit that analyses the given buffer and falls back to the system default charset.
   *
   * @param buffer
   *          the byte buffer of which we want to know the encoding; must not be <code>null</code>.
   */
  public CharsetToolkit( byte[] buffer ) {
    this.buffer = buffer;
    this.defaultCharset = getDefaultSystemCharset();
  }

  /**
   * Creates a toolkit that analyses the given buffer and falls back to the given default charset.
   *
   * @param buffer
   *          the byte buffer of which we want to know the encoding; must not be <code>null</code>.
   * @param defaultCharset
   *          the default Charset to use in case an 8-bit charset is recognized; <code>null</code> selects the
   *          system default charset.
   */
  public CharsetToolkit( byte[] buffer, Charset defaultCharset ) {
    this.buffer = buffer;
    setDefaultCharset( defaultCharset );
  }

  /**
   * Defines the default <code>Charset</code> used in case the buffer represents an 8-bit <code>Charset</code>.
   *
   * @param defaultCharset
   *          the default <code>Charset</code> to be returned by <code>guessEncoding()</code> if an 8-bit
   *          <code>Charset</code> is encountered; <code>null</code> selects the system default charset.
   */
  public void setDefaultCharset( Charset defaultCharset ) {
    if ( defaultCharset != null ) {
      this.defaultCharset = defaultCharset;
    } else {
      this.defaultCharset = getDefaultSystemCharset();
    }
  }

  /**
   * Guesses the encoding of a VFS file from its first <code>bufferLength</code> bytes.
   * <p>
   * The file is opened through <code>java.io.File</code> using the decoded path of the file name.
   * NOTE(review): this presumably only works for files that live on the local file system - confirm against callers.
   * </p>
   *
   * @param file
   *          the file to inspect.
   * @param bufferLength
   *          the number of leading bytes to analyse.
   * @return the Charset recognized.
   * @throws FileNotFoundException
   *           if the file cannot be opened.
   * @throws IOException
   *           if reading the file fails.
   */
  public static Charset guessEncoding( FileObject file, int bufferLength ) throws FileNotFoundException,
    IOException {
    return guessEncoding( new File( file.getName().getPathDecoded() ), bufferLength );
  }

  /**
   * Guesses the encoding of a VFS file from its first 4096 bytes and returns the charset's display name.
   *
   * @param file
   *          the file to inspect.
   * @return the display name of the Charset recognized.
   * @throws FileNotFoundException
   *           if the file cannot be opened.
   * @throws IOException
   *           if reading the file fails.
   */
  public static String guessEncodingName( FileObject file ) throws FileNotFoundException, IOException {
    return guessEncodingName( new File( file.getName().getPathDecoded() ) );
  }

  /**
   * Guesses the encoding of a file from its first 4096 bytes and returns the charset's display name.
   *
   * @param file
   *          the file to inspect.
   * @return the display name of the Charset recognized.
   * @throws FileNotFoundException
   *           if the file cannot be opened.
   * @throws IOException
   *           if reading the file fails.
   */
  public static String guessEncodingName( File file ) throws FileNotFoundException, IOException {
    return guessEncoding( file, DEFAULT_BUFFER_LENGTH ).displayName();
  }

  /**
   * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. It might be a file without
   * any special character in the range 128-255, but that may be or become a file encoded with the default
   * <code>charset</code> rather than US-ASCII.
   *
   * @param enforce
   *          <code>true</code> to return the default charset whenever US-ASCII would have been returned.
   */
  public void setEnforce8Bit( boolean enforce ) {
    this.enforce8Bit = enforce;
  }

  /**
   * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
   *
   * @return <code>true</code> if the default charset is returned in place of US-ASCII.
   */
  public boolean getEnforce8Bit() {
    return this.enforce8Bit;
  }

  /**
   * Retrieves the default Charset.
   *
   * @return the Charset returned by <code>guessEncoding()</code> when an 8-bit encoding is assumed.
   */
  public Charset getDefaultCharset() {
    return defaultCharset;
  }

  /**
   * <p>
   * Guess the encoding of the provided buffer.
   * </p>
   *
   * <p>
   * If Byte Order Markers are encountered at the beginning of the buffer, we immediately return the charset implied
   * by this BOM. Otherwise, the file would not be a human readable text file.
   * </p>
   *
   * <p>
   * If there is no BOM, this method tries to discern whether the file is UTF-8 or not. If it is not UTF-8, we assume
   * the encoding is the default encoding (of course, it might be any 8-bit charset, but usually, an 8-bit charset is
   * the default one).
   * </p>
   *
   * <p>
   * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
   * </p>
   *
   * <pre>
   * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
   * 0000 0000-0000 007F   0xxxxxxx
   * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
   * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
   * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * </pre>
   *
   * <p>
   * With UTF-8, 0xFE and 0xFF never appear.
   * </p>
   *
   * <p>
   * The whole buffer is scanned. A multi-byte sequence that is cut off by the end of the buffer is accepted as long
   * as the continuation bytes that are present are well formed, because the buffer is usually only the beginning of
   * a larger file.
   * </p>
   *
   * @return the Charset recognized.
   */
  public Charset guessEncoding() {
    // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
    // otherwise, the file would not be human readable
    if ( hasUTF8Bom( buffer ) ) {
      return UTF_8;
    }
    if ( hasUTF16LEBom( buffer ) ) {
      return UTF_16LE;
    }
    if ( hasUTF16BEBom( buffer ) ) {
      return UTF_16BE;
    }

    // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
    // otherwise, the file is in US-ASCII
    boolean highOrderBit = false;
    // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
    // if it's not the case, we can assume the encoding is the default encoding
    boolean validU8Char = true;

    final int length = buffer.length;
    int i = 0;
    while ( validU8Char && i < length ) {
      byte lead = buffer[i];
      if ( lead >= 0 ) {
        // plain 7-bit character
        i++;
      } else {
        // a high order bit was encountered, thus the encoding is not US-ASCII
        // it may be either an 8-bit encoding or UTF-8
        highOrderBit = true;
        int expected = expectedContinuationBytes( lead );
        if ( expected == 0 ) {
          // stray continuation byte, 0xFE or 0xFF: not a valid UTF-8 construct
          validU8Char = false;
        } else {
          // only check the continuation bytes that are really in the buffer, so that a sequence
          // truncated by the end of the buffer does not disqualify UTF-8
          int available = Math.min( expected, length - i - 1 );
          for ( int k = 1; k <= available && validU8Char; k++ ) {
            if ( !isContinuationChar( buffer[i + k] ) ) {
              validU8Char = false;
            }
          }
          i += expected + 1;
        }
      }
    }

    // if no byte with an high order bit set, the encoding is US-ASCII
    // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
    if ( !highOrderBit ) {
      // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
      return this.enforce8Bit ? this.defaultCharset : US_ASCII;
    }
    // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
    // otherwise the file would not be human readable
    if ( validU8Char ) {
      return UTF_8;
    }
    // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
    return this.defaultCharset;
  }

  /**
   * Guesses the encoding of a file from its first <code>bufferLength</code> bytes, falling back to the system default
   * charset for 8-bit encodings.
   *
   * @param f
   *          the file to inspect.
   * @param bufferLength
   *          the number of leading bytes to analyse.
   * @return the Charset recognized.
   * @throws FileNotFoundException
   *           if the file cannot be opened.
   * @throws IOException
   *           if reading the file fails.
   */
  public static Charset guessEncoding( File f, int bufferLength ) throws FileNotFoundException, IOException {
    return guessEncoding( f, bufferLength, getDefaultSystemCharset() );
  }

  /**
   * Guesses the encoding of a file from its first <code>bufferLength</code> bytes.
   *
   * @param f
   *          the file to inspect.
   * @param bufferLength
   *          the number of leading bytes to analyse.
   * @param defaultCharset
   *          the Charset to return for 8-bit encodings; <code>null</code> selects the system default charset.
   * @return the Charset recognized.
   * @throws FileNotFoundException
   *           if the file cannot be opened.
   * @throws IOException
   *           if reading the file fails.
   */
  public static Charset guessEncoding( File f, int bufferLength, Charset defaultCharset )
    throws FileNotFoundException, IOException {
    CharsetToolkit toolkit = new CharsetToolkit( readHead( f, bufferLength ) );
    toolkit.setDefaultCharset( defaultCharset );
    return toolkit.guessEncoding();
  }

  /**
   * Reads up to <code>bufferLength</code> leading bytes of a file, always closing the stream.
   * <p>
   * The returned array always has <code>bufferLength</code> elements; when the file is shorter, the remainder stays
   * zero. The zero padding is deliberate: it reads as plain 7-bit data, and it makes a lone high byte at the very end
   * of a short 8-bit file fail the UTF-8 check instead of looking like a truncated UTF-8 sequence.
   * </p>
   */
  private static byte[] readHead( File f, int bufferLength ) throws IOException {
    FileInputStream fis = new FileInputStream( f );
    try {
      byte[] head = new byte[bufferLength];
      int filled = 0;
      // a single read() may return fewer bytes than requested, so loop until full or end of file
      while ( filled < bufferLength ) {
        int read = fis.read( head, filled, bufferLength - filled );
        if ( read < 0 ) {
          break;
        }
        filled += read;
      }
      return head;
    } finally {
      fis.close();
    }
  }

  /**
   * Returns the number of continuation bytes announced by a lead byte, or 0 if the byte is not a valid lead byte
   * (a stray continuation byte, 0xFE or 0xFF).
   */
  private static int expectedContinuationBytes( byte b ) {
    if ( isTwoBytesSequence( b ) ) {
      return 1;
    }
    if ( isThreeBytesSequence( b ) ) {
      return 2;
    }
    if ( isFourBytesSequence( b ) ) {
      return 3;
    }
    if ( isFiveBytesSequence( b ) ) {
      return 4;
    }
    if ( isSixBytesSequence( b ) ) {
      return 5;
    }
    return 0;
  }

  /**
   * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
   *
   * @param b
   *          a byte.
   * @return true if it's a continuation char.
   */
  private static boolean isContinuationChar( byte b ) {
    return -128 <= b && b <= -65;
  }

  /**
   * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
   *
   * @param b
   *          a byte.
   * @return true if it's the first byte of a two-bytes sequence.
   */
  private static boolean isTwoBytesSequence( byte b ) {
    return -64 <= b && b <= -33;
  }

  /**
   * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
   *
   * @param b
   *          a byte.
   * @return true if it's the first byte of a three-bytes sequence.
   */
  private static boolean isThreeBytesSequence( byte b ) {
    return -32 <= b && b <= -17;
  }

  /**
   * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
   *
   * @param b
   *          a byte.
   * @return true if it's the first byte of a four-bytes sequence.
   */
  private static boolean isFourBytesSequence( byte b ) {
    return -16 <= b && b <= -9;
  }

  /**
   * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
   *
   * @param b
   *          a byte.
   * @return true if it's the first byte of a five-bytes sequence.
   */
  private static boolean isFiveBytesSequence( byte b ) {
    return -8 <= b && b <= -5;
  }

  /**
   * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
   *
   * @param b
   *          a byte.
   * @return true if it's the first byte of a six-bytes sequence.
   */
  private static boolean isSixBytesSequence( byte b ) {
    return -4 <= b && b <= -3;
  }

  /**
   * Retrieve the default charset of the system.
   * <p>
   * The <code>file.encoding</code> system property is consulted first; if it is missing or does not name a supported
   * charset, the JVM default charset is returned instead of throwing.
   * </p>
   *
   * @return the default <code>Charset</code>.
   */
  public static Charset getDefaultSystemCharset() {
    String name = System.getProperty( "file.encoding" );
    if ( name != null ) {
      try {
        return Charset.forName( name );
      } catch ( IllegalArgumentException ignored ) {
        // illegal or unsupported charset name: fall back to the JVM default below
      }
    }
    return Charset.defaultCharset();
  }

  /**
   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
   *
   * @param bom
   *          a buffer.
   * @return true if the buffer starts with the bytes EF BB BF; false if it is shorter than three bytes.
   */
  private static boolean hasUTF8Bom( byte[] bom ) {
    return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
  }

  /**
   * Has a Byte Order Marker for UTF-16 Little Endian.
   *
   * @param bom
   *          a buffer.
   * @return true if the buffer starts with the bytes FF FE; false if it is shorter than two bytes.
   */
  private static boolean hasUTF16LEBom( byte[] bom ) {
    return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
  }

  /**
   * Has a Byte Order Marker for UTF-16 Big Endian.
   *
   * @param bom
   *          a buffer.
   * @return true if the buffer starts with the bytes FE FF; false if it is shorter than two bytes.
   */
  private static boolean hasUTF16BEBom( byte[] bom ) {
    return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
  }

  /**
   * Retrieves all the available <code>Charset</code>s on the platform, among which the default <code>charset</code>.
   *
   * @return an array of <code>Charset</code>s.
   */
  public static Charset[] getAvailableCharsets() {
    Collection<Charset> collection = Charset.availableCharsets().values();
    return collection.toArray( new Charset[collection.size()] );
  }
}