/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.s3csvinput;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jets3t.service.model.S3Bucket;
import org.jets3t.service.model.S3Object;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Reads a simple CSV file from S3.
 * Just outputs the Strings found in the file...
 *
 * @author Matt
 * @since 2007-07-05
 */
public class S3CsvInput extends BaseStep implements StepInterface {

  private S3CsvInputMeta meta;
  private S3CsvInputData data;

  public S3CsvInput( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  @Override
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (S3CsvInputMeta) smi;
    data = (S3CsvInputData) sdi;

    if ( first ) {
      first = false;

      data.outputRowMeta = new RowMeta();
      meta.getFields( data.outputRowMeta, getStepname(), null, null, this );

      if ( data.filenames == null ) {
        // We're expecting the list of filenames from the previous step(s)...
        //
        getFilenamesFromPreviousSteps();
      }

      // We only run in parallel if we have at least one file to process
      // AND if we have more than one step copy running...
      //
      data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;

      // The conversion logic for when the lazy conversion is turned off is simple:
      // pretend it's a lazy conversion object anyway and get the native type during conversion.
      //
      data.convertRowMeta = data.outputRowMeta.clone();
      for ( ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList() ) {
        valueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_BINARY_STRING );
      }

      // Now handle the parallel reading aspect: determine the total of all the file sizes.
      // Then skip to the appropriate file and to the location in that file to start reading.
      // Also skip to right after the first newline.
      //
      if ( data.parallel ) {
        prepareToRunInParallel();
      }

      // Open the next file...
      //
      if ( !openNextFile() ) {
        setOutputDone();
        return false; // nothing to see here, move along...
      }
    }

    // If we are running in parallel, make sure we don't read too much in this step copy...
    //
    if ( data.parallel ) {
      if ( data.totalBytesRead > data.blockToRead ) {
        setOutputDone(); // stop reading
        return false;
      }
    }

    Object[] outputRowData = readOneRow( true ); // get row, set busy!
    if ( outputRowData == null ) { // no more input to be expected...
      if ( openNextFile() ) {
        return true; // try again on the next loop...
      } else {
        setOutputDone(); // last file, end here
        return false;
      }
    } else {
      putRow( data.outputRowMeta, outputRowData ); // copy row to possible alternate rowset(s).
      if ( checkFeedback( getLinesInput() ) ) {
        if ( log.isBasic() ) {
          logBasic( Messages.getString( "S3CsvInput.Log.LineNumber", Long.toString( getLinesInput() ) ) ); //$NON-NLS-1$
        }
      }
    }

    return true;
  }
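  /**
   * Determines which part of the total data volume this step copy has to read when running in parallel.
   * <p>
   * Continuing the example in the method body: with a total size of 50000 bytes spread over 5 files of
   * 10000 bytes each and 2 step copies, blockToRead is 25000; copy 0 starts at position 0 and copy 1 at
   * position 25000, i.e. 5000 bytes into the third file.
   *
   * @throws KettleException in case the sizes of the S3 objects can not be determined
   */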
  private void prepareToRunInParallel() throws KettleException {
    try {
      // At this point it doesn't matter if we have 1 or more files.
      // We'll use the same algorithm...
      //
      for ( String filename : data.filenames ) {
        S3Object objectDetails = data.s3Service.getObjectDetails( data.s3bucket, filename );
        long size = objectDetails.getContentLength();
        data.fileSizes.add( size );
        data.totalFileSize += size;
      }

      // Now we can determine the range to read.
      //
      // For example, the total file size is 50000, spread over 5 files of 10000 bytes each.
      // Suppose we have 2 step copies running (clustered or not).
      // That means step 0 has to read 0-24999 and step 1 has to read 25000-49999.
      //
      // The size of the block to read (25000 in the example):
      //
      data.blockToRead = Math.round( (double) data.totalFileSize / (double) data.totalNumberOfSteps );

      // Now we calculate the position to start reading from (0 and 25000 in our example):
      //
      data.startPosition = data.blockToRead * data.stepNumber;
      data.endPosition = data.startPosition + data.blockToRead;

      // Determine the file number to start reading from (0 or 2 in our example).
      // The files start at offsets >0<, 10000, >20000<, 30000, 40000.
      //
      long totalFileSize = 0L;
      for ( int i = 0; i < data.fileSizes.size(); i++ ) {
        long size = data.fileSizes.get( i );

        // Start of the file range: totalFileSize
        // End of the file range: totalFileSize + size
        //
        if ( data.startPosition >= totalFileSize && data.startPosition < totalFileSize + size ) {
          // This is the file number to start reading from...
          //
          data.filenr = i;

          // Remember where we started to read, so that we know we have to skip the header row
          // in the next files (if any).
          //
          data.startFilenr = i;

          // How many bytes do we skip in that first file?
          //
          if ( data.startPosition == 0 ) {
            data.bytesToSkipInFirstFile = 0L;
          } else {
            data.bytesToSkipInFirstFile = data.startPosition - totalFileSize;
          }

          break;
        }
        totalFileSize += size;
      }

      logBasic( Messages.getString( "S3CsvInput.Log.ParallelFileNrAndPositionFeedback",
        data.filenames[data.filenr], Long.toString( data.fileSizes.get( data.filenr ) ),
        Long.toString( data.bytesToSkipInFirstFile ), Long.toString( data.blockToRead ) ) );

    } catch ( Exception e ) {
      throw new KettleException( Messages.getString( "S3CsvInput.Exception.ErrorPreparingParallelRun" ), e );
    }
  }
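  /**
   * Reads all rows coming from the previous step(s) and collects the values of the configured
   * filename field into data.filenames.
   *
   * @throws KettleException if the filename field can not be found in the input rows
   */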
  private void getFilenamesFromPreviousSteps() throws KettleException {
    List<String> filenames = new ArrayList<String>();
    boolean firstRow = true;
    int index = -1;
    Object[] row = getRow();
    while ( row != null ) {
      if ( firstRow ) {
        firstRow = false;

        // Get the filename field index...
        //
        String filenameField = environmentSubstitute( meta.getFilenameField() );
        index = getInputRowMeta().indexOfValue( filenameField );
        if ( index < 0 ) {
          throw new KettleException( Messages.getString( "S3CsvInput.Exception.FilenameFieldNotFound", filenameField ) );
        }
      }

      String filename = getInputRowMeta().getString( row, index );
      filenames.add( filename ); // add it to the list...

      row = getRow(); // Grab another row...
    }

    data.filenames = filenames.toArray( new String[filenames.size()] );

    logBasic( Messages.getString( "S3CsvInput.Log.ReadingFromNrFiles", Integer.toString( data.filenames.length ) ) );
  }

  private boolean openNextFile() throws KettleException {
    try {
      // Close the previous file...
      //
      if ( data.fis != null ) {
        data.fis.close();
      }

      if ( data.filenr >= data.filenames.length ) {
        return false;
      }

      data.s3Object = null;

      // If we are running in parallel we only want to grab a part of the content, not everything.
      //
      if ( data.parallel ) {
        data.s3Object = data.s3Service.getObject( data.s3bucket, data.filenames[data.filenr],
          null, null, null, null, data.bytesToSkipInFirstFile,
          data.bytesToSkipInFirstFile + data.blockToRead + data.maxLineSize * 2 );
      } else {
        data.s3Object = data.s3Service.getObject( data.s3bucket, data.filenames[data.filenr] );
      }

      if ( meta.isLazyConversionActive() ) {
        data.binaryFilename = data.filenames[data.filenr].getBytes();
      }

      data.fis = data.s3Object.getDataInputStream();

      if ( data.parallel ) {
        if ( data.bytesToSkipInFirstFile > 0 ) {
          // Now, we need to skip the first row, until the first CR that is.
          //
          readOneRow( false );
        }
      }

      // See if we need to skip the header row...
      //
      if ( ( meta.isHeaderPresent() && !data.parallel ) || // Standard flat file : skip header
        ( data.parallel && data.filenr == data.startFilenr && data.bytesToSkipInFirstFile <= 0 ) || // parallel processing : first file : nothing to skip
        ( data.parallel && data.filenr > data.startFilenr && data.bytesToSkipInFirstFile <= 0 ) ) { // parallel processing : start of next file, nothing to skip

        readOneRow( false ); // skip this row.

        logBasic( Messages.getString( "S3CsvInput.Log.HeaderRowSkipped", data.filenames[data.filenr] ) );
      }

      // Move to the next filename
      //
      data.filenr++;

      // Reset the row number pointer...
      //
      data.rowNumber = 1L;

      // Don't skip again in the next file...
      //
      data.bytesToSkipInFirstFile = -1L;

      return true;
    } catch ( Exception e ) {
      throw new KettleException( e );
    }
  }
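  // A note on the ranged read in openNextFile() above: in parallel mode only the bytes from
  // bytesToSkipInFirstFile up to bytesToSkipInFirstFile + blockToRead + 2 * maxLineSize of the S3
  // object are requested. The extra 2 * maxLineSize presumably serves as headroom so that the last,
  // possibly split, line of the block can still be read completely; the leading partial line is
  // discarded by the readOneRow( false ) call right after the stream is opened.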
  /**
   * Read a single row of data from the file...
   *
   * @param doConversions if you want to do conversions; set to false for the header row.
   * @return a row of data...
   * @throws KettleException
   */
  private Object[] readOneRow( boolean doConversions ) throws KettleException {

    try {

      Object[] outputRowData = RowDataUtil.allocateRowData( data.outputRowMeta.size() );
      int outputIndex = 0;
      boolean newLineFound = false;
      int newLines = 0;

      // The strategy is as follows...
      // We read a block of byte[] from the file.
      // We scan for the separators in the file (NOT for line feeds etc.)
      // Then we scan that block of data.
      // We keep a byte[] that we extend if needed...
      // At the end of the block we read another, etc.
      //
      // Let's start by looking where we left off reading.
      //
      while ( !newLineFound && outputIndex < data.convertRowMeta.size() ) {

        if ( data.endBuffer >= data.bufferSize ) {
          // Oops, we need to read more data...
          // Better resize this before we read other things in it...
          //
          data.resizeByteBuffer();

          // Also read another chunk of data, now that we have the space for it...
          if ( !data.readBufferFromFile() ) {
            // TODO handle EOF properly for EOF in the middle of the row, etc.
            return null;
          }
        }

        // OK, at this point we should have data in the byteBuffer and we should be able to scan for the next
        // delimiter (;).
        // So let's look for a delimiter.
        // Also skip over the enclosures ("); escaped enclosures are NOT taken into account here.
        // Later we can add an option for having escaped or double enclosures in the file. <sigh>
        //
        boolean delimiterFound = false;
        boolean enclosureFound = false;
        int escapedEnclosureFound = 0;
        while ( !delimiterFound ) {
          // If we find the first char, we might find others as well ;-)
          // Single byte delimiters only for now.
          //
          if ( data.byteBuffer[data.endBuffer] == data.delimiter[0] ) {
            delimiterFound = true;
          } else if ( data.byteBuffer[data.endBuffer] == '\n' || data.byteBuffer[data.endBuffer] == '\r' ) {
            // Perhaps we found a new line?
            //
            data.endBuffer++;
            data.totalBytesRead++;
            newLines = 1;

            if ( data.endBuffer >= data.bufferSize ) {
              // Oops, we need to read more data...
              // Better resize this before we read other things in it...
              //
              data.resizeByteBuffer();

              // Also read another chunk of data, now that we have the space for it...
              // Ignore EOF, there might be other stuff in the buffer.
              //
              data.readBufferFromFile();
            }

            // Re-check for a second newline character (a CR/LF pair)...
            if ( data.byteBuffer[data.endBuffer] == '\n' || data.byteBuffer[data.endBuffer] == '\r' ) {
              data.endBuffer++;
              data.totalBytesRead++;
              newLines = 2;

              if ( data.endBuffer >= data.bufferSize ) {
                // Oops, we need to read more data...
                // Better resize this before we read other things in it...
                //
                data.resizeByteBuffer();

                // Also read another chunk of data, now that we have the space for it...
                // Ignore EOF, there might be other stuff in the buffer.
                //
                data.readBufferFromFile();
              }
            }

            newLineFound = true;
            delimiterFound = true;
          } else if ( data.enclosure != null && data.byteBuffer[data.endBuffer] == data.enclosure[0] ) {
            // Perhaps we need to skip over an enclosed part?
            // We always expect exactly one enclosure character.
            // If we find the enclosure doubled, we consider it escaped.
            // --> "" is converted to " later on.
            //
            enclosureFound = true;
            boolean keepGoing;
            do {
              if ( data.increaseEndBuffer() ) {
                enclosureFound = false;
                break;
              }
              keepGoing = data.byteBuffer[data.endBuffer] != data.enclosure[0];
              if ( !keepGoing ) {
                // We found an enclosure character.
                // Read another byte...
                if ( data.increaseEndBuffer() ) {
                  enclosureFound = false;
                  break;
                }

                // If this character is also an enclosure, we can consider the enclosure "escaped".
                // As such, if this is an enclosure, we keep going...
                //
                keepGoing = data.byteBuffer[data.endBuffer] == data.enclosure[0];
                if ( keepGoing ) {
                  escapedEnclosureFound++;
                }
              }
            } while ( keepGoing );

            // Did we reach the end of the buffer?
            //
            if ( data.endBuffer >= data.bufferSize ) {
              newLineFound = true; // consider it a newline to break out of the upper while loop
              newLines += 2; // to remove the enclosures in case of missing newline on last line.
              break;
            }
          } else {
            data.endBuffer++;
            data.totalBytesRead++;

            if ( data.endBuffer >= data.bufferSize ) {
              // Oops, we need to read more data...
              // Better resize this before we read other things in it...
              //
              data.resizeByteBuffer();

              // Also read another chunk of data, now that we have the space for it...
              if ( !data.readBufferFromFile() ) {
                // Break out of the loop if we don't have enough buffer space to continue...
                //
                if ( data.endBuffer >= data.bufferSize ) {
                  newLineFound = true; // consider it a newline to break out of the upper while loop
                  break;
                }
              }
            }
          }
        }

        // If we're still here, we found a delimiter...
        // Since the starting point never really changed, we can just grab the range:
        //
        //    [startBuffer, endBuffer[
        //
        // This is the part we want.
        //
        int length = data.endBuffer - data.startBuffer;
        if ( newLineFound ) {
          length -= newLines;
          if ( length <= 0 ) {
            length = 0;
          }
        }
        if ( enclosureFound ) {
          data.startBuffer++;
          length -= 2;
          if ( length <= 0 ) {
            length = 0;
          }
        }
        if ( length <= 0 ) {
          length = 0;
        }

        byte[] field = new byte[length];
        System.arraycopy( data.byteBuffer, data.startBuffer, field, 0, length );

        // Did we have any escaped characters in there?
        //
        if ( escapedEnclosureFound > 0 ) {
          if ( log.isRowLevel() ) {
            logRowlevel( "Escaped enclosures found in " + new String( field ) );
          }
          field = data.removeEscapedEnclosures( field, escapedEnclosureFound );
        }

        if ( doConversions ) {
          if ( meta.isLazyConversionActive() ) {
            outputRowData[outputIndex++] = field;
          } else {
            // We're not lazy, so we convert the data right here and now.
            // The conversion metadata uses binary storage, so we just have to ask for the native type.
            // That will do the actual conversion.
            //
            ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta( outputIndex );
            outputRowData[outputIndex++] = sourceValueMeta.convertBinaryStringToNativeType( field );
          }
        } else {
          outputRowData[outputIndex++] = null; // nothing for the header, no conversions here.
        }

        // OK, move on to the next field...
        if ( !newLineFound ) {
          data.endBuffer++;
          data.totalBytesRead++;
        }
        data.startBuffer = data.endBuffer;
      }

      // Optionally add the current filename to the mix as well...
      //
      if ( meta.isIncludingFilename() && !Utils.isEmpty( meta.getFilenameField() ) ) {
        if ( meta.isLazyConversionActive() ) {
          outputRowData[outputIndex++] = data.binaryFilename;
        } else {
          outputRowData[outputIndex++] = data.filenames[data.filenr - 1];
        }
      }

      if ( data.isAddingRowNumber ) {
        outputRowData[outputIndex++] = Long.valueOf( data.rowNumber++ );
      }

      incrementLinesInput();

      return outputRowData;
    } catch ( Exception e ) {
      throw new KettleFileException( "Exception reading line using NIO", e );
    }
  }
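  // For illustration: with delimiter ';' and enclosure '"', readOneRow() splits the line
  //
  //    1;"ab""cd";2
  //
  // into the fields 1, ab"cd and 2. The enclosures are stripped (startBuffer++ and length -= 2) and the
  // doubled enclosure is collapsed into a single one by data.removeEscapedEnclosures().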
  @Override
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (S3CsvInputMeta) smi;
    data = (S3CsvInputData) sdi;

    if ( super.init( smi, sdi ) ) {
      data.preferredBufferSize = 500000; // Fixed size

      try {
        data.s3Service = meta.getS3Service( this );

        // Find the specified bucket in the list of available buckets...
        //
        String bucketname = environmentSubstitute( meta.getBucket() );
        S3Bucket[] buckets = data.s3Service.listAllBuckets();
        data.s3bucket = null;
        for ( S3Bucket bucket : buckets ) {
          if ( bucket.getName().equals( bucketname ) ) {
            data.s3bucket = bucket;
          }
        }
        if ( data.s3bucket == null ) {
          logError( "Unable to find specified bucket : [" + bucketname + "]" ); // TODO i18n
          return false;
        }

        data.maxLineSize = Integer.parseInt( environmentSubstitute( meta.getMaxLineSize() ) );

        // If the step doesn't have any previous steps, we just get the filename.
        // Otherwise, we'll grab the list of filenames later...
        //
        if ( getTransMeta().findNrPrevSteps( getStepMeta() ) == 0 ) {
          String filename = environmentSubstitute( meta.getFilename() );

          if ( Utils.isEmpty( filename ) ) {
            logError( Messages.getString( "S3CsvInput.MissingFilename.Message" ) );
            return false;
          }

          data.filenames = new String[] { filename };
        } else {
          data.filenames = null;
          data.filenr = 0;
        }

        data.totalBytesRead = 0L;

        data.delimiter = environmentSubstitute( meta.getDelimiter() ).getBytes();

        if ( Utils.isEmpty( meta.getEnclosure() ) ) {
          data.enclosure = null;
        } else {
          data.enclosure = environmentSubstitute( meta.getEnclosure() ).getBytes();
        }

        data.isAddingRowNumber = !Utils.isEmpty( meta.getRowNumField() );

        // Handle the parallel reading capabilities...
        //
        data.stopReading = false;

        if ( meta.isRunningInParallel() ) {
          data.stepNumber = getUniqueStepNrAcrossSlaves();
          data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();

          // We are not handling a single file, but possibly a list of files.
          // As such, the fair thing to do is calculate the total size of the files,
          // then read the required block.
          //
          data.fileSizes = new ArrayList<Long>();
          data.totalFileSize = 0L;
        }

        return true;
      } catch ( Exception e ) {
        logError( "Unexpected error trying to verify S3 settings : ", e );
      }
    }
    return false;
  }
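  /**
   * Closes the input stream of the S3 object that is currently being read, if one is open.
   *
   * @throws KettleException in case the stream can not be closed
   */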
  public void closeFile() throws KettleException {
    try {
      if ( data.fis != null ) {
        data.fis.close();
      }
    } catch ( IOException e ) {
      throw new KettleException( "Unable to close file channel for file '"
        + data.filenames[data.filenr - 1] + "'", e );
    }
  }
}