/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.parallelgzipcsv;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;
/**
 * Data class for the "Parallel Gzip CSV Input" step: holds the streams, buffers,
 * offsets and counters shared between processing calls of a single step copy.
 *
 * @author Matt
 * @since 3.2
 */
public class ParGzipCsvInputData extends BaseStepData implements StepDataInterface {

  /** Row metadata used when converting raw binary values. */
  public RowMetaInterface convertRowMeta;

  /** Row metadata of the rows produced by this step. */
  public RowMetaInterface outputRowMeta;

  /** The files to read (names); {@link #filenr} indexes into this array. */
  public String[] filenames;
  public int filenr;
  public int startFilenr;

  /** The current filename in binary form. */
  public byte[] binaryFilename;

  /** Raw input stream of the current file; {@link #gzis} decompresses it. */
  public InputStream fis;

  public boolean isAddingRowNumber;
  public long rowNumber;

  /** Copy number of this step and the total number of copies reading in parallel. */
  public int stepNumber;
  public int totalNumberOfSteps;
  public boolean parallel;

  public int filenameFieldIndex;
  public int rownumFieldIndex;

  /** Gzip decompression stream that {@link #getMoreData()} reads from. */
  public GZIPInputStream gzis;

  /** Configured read chunk size; also the initial size of {@link #byteBuffer}. */
  public int bufferSize;

  /** Field delimiter in binary form. */
  public byte[] delimiter;

  /** Field enclosure in binary form; only the first byte is used for matching. */
  public byte[] enclosure;

  /** Offset in {@link #byteBuffer} where the current field/row starts. */
  public int startBuffer;

  /** Offset in {@link #byteBuffer} of the current parse position. */
  public int endBuffer;

  /** Number of valid bytes currently held in {@link #byteBuffer}. */
  public int maxBuffer;

  /**
   * This is the main byte buffer into which we're going to read chunks of data...
   */
  public byte[] byteBuffer;

  public long totalBytesRead;
  public long blockSize;

  /** True once the end of the gzip stream has been reached. */
  public boolean eofReached;

  /** Uncompressed read position in the current file, advanced by {@link #getMoreData()}. */
  public long fileReadPosition;
  public int blockNr;

  public ParGzipCsvInputData() {
    super();
    startBuffer = 0;
    endBuffer = 0;
    maxBuffer = 0;
    fileReadPosition = 0L;
  }

  /**
   * Remove the escaped (doubled) enclosures from a field: every pair of consecutive
   * enclosure bytes is collapsed into a single enclosure byte.
   *
   * @param field the binary field value to clean up
   * @param nrEnclosuresFound the number of escaped enclosure pairs found in the field
   * @return a new byte array of length {@code field.length - nrEnclosuresFound} with
   *         each doubled enclosure reduced to a single one
   */
  public byte[] removeEscapedEnclosures( byte[] field, int nrEnclosuresFound ) {
    byte[] result = new byte[field.length - nrEnclosuresFound];
    int resultIndex = 0;
    for ( int i = 0; i < field.length; i++ ) {
      result[resultIndex++] = field[i];
      if ( field[i] == enclosure[0] && i + 1 < field.length && field[i + 1] == enclosure[0] ) {
        // field[i] + field[i+1] form an escaped enclosure: keep the first byte and
        // skip the duplicate.  Advancing i here (instead of relying on the next
        // iteration to write the second byte) also handles consecutive escaped
        // enclosures correctly, e.g. four enclosure bytes -> two, which the
        // previous skip-first/write-second approach got wrong (it left a
        // trailing zero byte in the result).
        i++;
      }
    }
    return result;
  }

  /**
   * Read more data from the current (gzipped) file into {@link #byteBuffer},
   * compacting or growing the buffer as needed.
   *
   * @return true when the end of the file was reached and no new data could be read
   *         (we're done); false when more data was made available in the buffer.
   * @throws KettleException in case of an I/O error on the gzip stream
   */
  public boolean getMoreData() throws KettleException {
    // See if the buffer is completely full (very long lines of data...)
    // In that situation, we need to re-size the byte buffer...
    // We grow it by 50% (NOT "half as long" as an older comment claimed).
    //
    if ( startBuffer == 0 && endBuffer >= byteBuffer.length ) {
      int newSize;
      if ( byteBuffer.length == 0 ) { // initial allocation
        newSize = bufferSize;
      } else {
        newSize = ( byteBuffer.length * 3 ) / 2; // increase by 50%
      }
      byte[] newByteBuffer = new byte[newSize];
      // Copy over the data into the new buffer.
      //
      maxBuffer = byteBuffer.length - startBuffer;
      System.arraycopy( byteBuffer, startBuffer, newByteBuffer, 0, maxBuffer );
      byteBuffer = newByteBuffer;
    } else {
      // Copy the old (unconsumed) data to the start of the buffer...
      //
      if ( startBuffer > 0 ) {
        maxBuffer = byteBuffer.length - startBuffer;
        System.arraycopy( byteBuffer, startBuffer, byteBuffer, 0, maxBuffer );
        endBuffer = maxBuffer;
        startBuffer = 0;
      }
    }

    // Fill the remainder of the buffer from the gzip stream...
    //
    int size = byteBuffer.length - maxBuffer;
    int bytesRead = 0;
    int leftToRead = size;
    try {
      while ( bytesRead < size ) {
        int n = gzis.read( byteBuffer, maxBuffer, leftToRead );
        if ( n < 0 ) {
          // EOF: nothing more to read, in combination with the need to get more
          // data, means we're done only when nothing was read at all this call.
          //
          eofReached = true;
          fileReadPosition += bytesRead;
          return bytesRead == 0;
        }
        bytesRead += n; // bytes read so far
        maxBuffer += n; // that's where we ended up so far
        leftToRead -= n; // a little bit less to read
      }
      fileReadPosition += bytesRead; // keep track of where we are in the file...
      return false; // all OK
    } catch ( IOException e ) {
      throw new KettleException( "Unable to read " + size + " bytes from the gzipped input file", e );
    }
  }
}