/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.s3csvinput;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.jets3t.service.S3Service;
import org.jets3t.service.model.S3Bucket;
import org.jets3t.service.model.S3Object;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;

/**
 * Runtime data for the S3 CSV Input step: the growable byte buffer the CSV
 * parser scans, the read pointers into that buffer, the list of files/objects
 * to read, and the S3 connection state shared across processRow() calls.
 *
 * @author Matt
 * @since 24-jan-2005
 */
public class S3CsvInputData extends BaseStepData implements StepDataInterface {
  public RowMetaInterface convertRowMeta;
  public RowMetaInterface outputRowMeta;

  // Fixed-size scratch buffer filled from the input stream before its content
  // is appended to byteBuffer.
  private byte[] bb;

  public byte[] byteBuffer;
  // Offset of the first not-yet-consumed byte in byteBuffer.
  public int startBuffer;
  // Offset just past the last scanned byte in byteBuffer.
  public int endBuffer;
  // Number of valid data bytes currently held in byteBuffer.
  public int bufferSize;

  public byte[] delimiter;
  public byte[] enclosure;

  // Amount of extra room added to byteBuffer on every resize.
  public int preferredBufferSize;
  public String[] filenames;
  public int filenr;
  public int startFilenr;
  public byte[] binaryFilename;
  public long fileSize;
  public InputStream fis;

  public boolean isAddingRowNumber;
  public long rowNumber;
  public boolean stopReading;
  public int stepNumber;
  public int totalNumberOfSteps;
  public List<Long> fileSizes;
  public long totalFileSize;
  public long blockToRead;
  public long startPosition;
  public long endPosition;
  public long bytesToSkipInFirstFile;
  public long totalBytesRead;
  public boolean parallel;
  public S3Service s3Service;
  public S3Bucket s3bucket;
  public int maxLineSize;
  public S3Object s3Object;

  public S3CsvInputData() {
    super();
    byteBuffer = new byte[] {};
    startBuffer = 0;
    endBuffer = 0;
    totalBytesRead = 0;
    bb = new byte[50000]; // TODO re-introduce as parameter, probably doesn't matter at all.
  }

  /**
   * Grows byteBuffer so at least another preferredBufferSize bytes can be
   * appended. The unconsumed bytes (startBuffer..endBuffer) are moved to the
   * front of the new buffer and the pointers are re-based onto it.
   */
  public void resizeByteBuffer() {
    // What's the new size?
    // It's (endBuffer-startBuffer)+size !!
    // That way we can at least read one full block of data using NIO
    //
    bufferSize = endBuffer - startBuffer;
    int newSize = bufferSize + preferredBufferSize;
    byte[] newByteBuffer = new byte[newSize];

    // copy over the old data...
    System.arraycopy( byteBuffer, startBuffer, newByteBuffer, 0, bufferSize );

    // replace the old byte buffer...
    byteBuffer = newByteBuffer;

    // Adjust start and end point of data in the byte buffer
    //
    startBuffer = 0;
    endBuffer = bufferSize;
  }

  /**
   * Reads the next chunk from the input stream and appends it to byteBuffer
   * at position endBuffer, updating bufferSize accordingly.
   *
   * @return false when the end of the stream has been reached, true otherwise.
   * @throws IOException in case reading from the input stream fails.
   */
  public boolean readBufferFromFile() throws IOException {
    // Never read more than fits in the remaining space of byteBuffer: the
    // scratch buffer bb is a fixed 50000 bytes, which can exceed the headroom
    // left by resizeByteBuffer() when preferredBufferSize is smaller, and the
    // original unbounded read could then write past the end of byteBuffer.
    int room = byteBuffer.length - endBuffer;
    int n = fis.read( bb, 0, Math.min( bb.length, room ) );
    if ( n == -1 ) {
      return false;
    }

    // adjust the highest used position...
    //
    bufferSize = endBuffer + n;

    // Append the freshly read bytes to the working buffer.
    // (arraycopy is the same bulk-copy idiom resizeByteBuffer() already uses.)
    //
    System.arraycopy( bb, 0, byteBuffer, endBuffer, n );

    return true;
  }

  /**
   * Increase the endBuffer pointer by one.<br>
   * If there is not enough room in the buffer to go there, resize the byte buffer and read more data.<br>
   * if there is no more data to read and if the endBuffer pointer has reached the end of the byte buffer, we return true.<br>
   *
   * @return true if we reached the end of the byte buffer.
   * @throws IOException In case we get an error reading from the input file.
   */
  public boolean increaseEndBuffer() throws IOException {
    endBuffer++;

    if ( endBuffer >= bufferSize ) {
      // Oops, we need to read more data...
      // Better resize this before we read other things in it...
      //
      resizeByteBuffer();

      // Also read another chunk of data, now that we have the space for it...
      if ( !readBufferFromFile() ) {
        // Break out of the loop if we don't have enough buffer space to continue...
        //
        if ( endBuffer >= bufferSize ) {
          return true;
        }
      }
    }

    return false;
  }

  /**
   * <pre>
   * [abcd "" defg] --&gt; [abcd " defg]
   * [""""] --&gt; [""]
   * [""] --&gt; ["]
   * </pre>
   *
   * Collapses each doubled (escaped) enclosure character in the field into a
   * single occurrence.
   *
   * @param field the raw field bytes containing escaped enclosures.
   * @param nrEnclosuresFound the number of escaped-enclosure bytes to drop;
   *        determines the size of the returned array.
   * @return the byte array with escaped enclosures escaped.
   */
  public byte[] removeEscapedEnclosures( byte[] field, int nrEnclosuresFound ) {
    byte[] result = new byte[field.length - nrEnclosuresFound];
    int resultIndex = 0;
    for ( int i = 0; i < field.length; i++ ) {
      if ( field[i] == enclosure[0] ) {
        if ( !( i + 1 < field.length && field[i + 1] == enclosure[0] ) ) {
          // if field[i]+field[i+1] is an escaped enclosure, ignore it
          // field[i+1] will be picked up on the next iteration.

          // But this is not an escaped enclosure...
          result[resultIndex++] = field[i];
        }
      } else {
        result[resultIndex++] = field[i];
      }
    }
    return result;
  }
}