/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.fileinput.text; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; /** * Detector of BOM prefix in file. * * We don't use BOMInputStream because: 1) it requires commons-io 2.1 or higher, 2) it doesn't support GB18030 charset, * 3) it's additional abstraction level that add some microseconds to read */ public class BOMDetector { public static final BOMMark[] MARKS = new BOMMark[] { new BOMMark( "UTF-8", 0xEF, 0xBB, 0xBF ), new BOMMark( "UTF-32BE", 0x00, 0x00, 0xFE, 0xFF ), new BOMMark( "UTF-32LE", 0xFF, 0xFE, 0x00, 0x00 ), new BOMMark( "UTF-16BE", 0xFE, 0xFF ), new BOMMark( "UTF-16LE", 0xFF, 0xFE ), new BOMMark( "GB18030", 0x84, 0x31, 0x95, 0x33 ), }; private final InputStream in; private int bomSize; private String charset; public BOMDetector( BufferedInputStream in ) throws IOException { this.in = in; in.mark( 16 ); readBOM(); in.reset(); in.skip( bomSize ); } void readBOM() throws IOException { long bom = readLong(); for ( BOMMark m : MARKS ) { if ( m.matches( bom ) ) { bomSize = m.getBytes(); charset = m.getCharset(); return; } } } public boolean bomExist() { return charset != null; } public String getCharset() { return charset; } /** * Read first 6 bytes for check BOM. */ long readLong() throws IOException { long[] b = new long[6]; for ( int i = 0; i < b.length; i++ ) { b[i] = in.read(); if ( b[i] < 0 ) { b[i] = 0; // after EOF } } long r = 0; for ( int i = 0; i < b.length; i++ ) { r += b[i] << ( i * 8 ); } return r; } public static class BOMMark { private final String charset; private final long mark; private final long mask; private final int bytes; public BOMMark( String charset, int... bytes ) { this.charset = charset; long m = 0; for ( int i = 0; i < bytes.length; i++ ) { m += ( (long) bytes[i] ) << ( i * 8 ); } mark = m; mask = ( 1L << ( bytes.length * 8 ) ) - 1; this.bytes = bytes.length; } public boolean matches( long bytes ) { return ( bytes & mask ) == mark; } public int getBytes() { return bytes; } public String getCharset() { return charset; } } }