/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.csvinput;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.provider.local.LocalFile;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleConversionException;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.logging.LogChannelInterface;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.textfileinput.EncodingType;
import org.pentaho.di.trans.steps.textfileinput.TextFileInput;
import org.pentaho.di.trans.steps.textfileinput.TextFileInputField;
import org.pentaho.di.trans.steps.textfileinput.TextFileInputMeta;

/**
 * Read a simple CSV file. Just output Strings found in the file...
 *
 * @author Matt
 * @since 2007-07-05
 */
public class CsvInput extends BaseStep implements StepInterface {
  private static Class<?> PKG = CsvInput.class; // for i18n purposes, needed by Translator2!!

  private CsvInputMeta meta;
  private CsvInputData data;

  public CsvInput( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }
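  /**
   * Reads and forwards one row per call. On the first call this also builds the output row metadata,
   * resolves the filename and row-number field indexes, prepares parallel reading if requested, and
   * opens the first file. Returns false once all files (or, in parallel mode, this copy's byte block)
   * are exhausted.
   */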
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (CsvInputMeta) smi;
    data = (CsvInputData) sdi;

    if ( first ) {
      first = false;

      data.outputRowMeta = new RowMeta();
      meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );

      if ( data.filenames == null ) {
        // We're expecting the list of filenames from the previous step(s)...
        //
        getFilenamesFromPreviousSteps();
      }

      // We only run in parallel if we have at least one file to process
      // AND if we have more than one step copy running...
      //
      data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;

      // The conversion logic for when the lazy conversion is turned off is simple:
      // Pretend it's a lazy conversion object anyway and get the native type during conversion.
      //
      data.convertRowMeta = data.outputRowMeta.clone();
      for ( ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList() ) {
        valueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_BINARY_STRING );
      }

      // Calculate the indexes for the filename and row number fields
      //
      data.filenameFieldIndex = -1;
      if ( !Utils.isEmpty( meta.getFilenameField() ) && meta.isIncludingFilename() ) {
        data.filenameFieldIndex = meta.getInputFields().length;
      }

      data.rownumFieldIndex = -1;
      if ( !Utils.isEmpty( meta.getRowNumField() ) ) {
        data.rownumFieldIndex = meta.getInputFields().length;
        if ( data.filenameFieldIndex >= 0 ) {
          data.rownumFieldIndex++;
        }
      }

      // Now handle the parallel reading aspect: determine the total of all the file sizes.
      // Then skip to the appropriate file and location in the file to start reading...
      // Also skip to right after the first newline.
      //
      if ( data.parallel ) {
        prepareToRunInParallel();
      }

      // Open the next file...
      //
      if ( !openNextFile() ) {
        setOutputDone();
        return false; // nothing to see here, move along...
      }
    }

    // If we are running in parallel, make sure we don't read too much in this step copy...
    //
    if ( data.parallel ) {
      if ( data.totalBytesRead >= data.blockToRead ) {
        setOutputDone(); // stop reading
        return false;
      }
    }

    try {
      Object[] outputRowData = readOneRow( false, false ); // get row, set busy!

      // no more input to be expected...
      if ( outputRowData == null ) {
        if ( openNextFile() ) {
          return true; // try again on the next loop...
        } else {
          setOutputDone(); // last file, end here
          return false;
        }
      } else {
        putRow( data.outputRowMeta, outputRowData ); // copy row to possible alternate rowset(s).
        if ( checkFeedback( getLinesInput() ) ) {
          if ( log.isBasic() ) {
            logBasic( BaseMessages.getString( PKG, "CsvInput.Log.LineNumber", Long.toString( getLinesInput() ) ) );
          }
        }
      }
    } catch ( KettleConversionException e ) {
      if ( getStepMeta().isDoingErrorHandling() ) {
        StringBuilder errorDescriptions = new StringBuilder( 100 );
        StringBuilder errorFields = new StringBuilder( 50 );
        for ( int i = 0; i < e.getCauses().size(); i++ ) {
          if ( i > 0 ) {
            errorDescriptions.append( ", " );
            errorFields.append( ", " );
          }
          errorDescriptions.append( e.getCauses().get( i ).getMessage() );
          errorFields.append( e.getFields().get( i ).toStringMeta() );
        }

        putError( data.outputRowMeta, e.getRowData(), e.getCauses().size(), errorDescriptions.toString(),
          errorFields.toString(), "CSVINPUT001" );
      } else {
        // Only forward the first cause.
        //
        throw new KettleException( e.getMessage(), e.getCauses().get( 0 ) );
      }
    }

    return true;
  }
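  /**
   * Sums the sizes of all input files and derives the byte range (block) this step copy should read,
   * the file to start in, and the number of bytes to skip inside that first file.
   */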
  private void prepareToRunInParallel() throws KettleException {
    try {
      // At this point it doesn't matter if we have 1 or more files.
      // We'll use the same algorithm...
      //
      for ( String filename : data.filenames ) {
        long size = KettleVFS.getFileObject( filename, getTransMeta() ).getContent().getSize();
        data.fileSizes.add( size );
        data.totalFileSize += size;
      }

      // Now we can determine the range to read.
      //
      // For example, the total file size is 50000, spread over 5 files of 10000.
      // Suppose we have 2 step copies running (clustered or not).
      // That means step 0 has to read 0-24999 and step 1 has to read 25000-49999.
      //
      // The size of the block to read (25000 in the example):
      //
      data.blockToRead = Math.round( (double) data.totalFileSize / (double) data.totalNumberOfSteps );

      // Now we calculate the position to read (0 and 25000 in our sample):
      //
      data.startPosition = data.blockToRead * data.stepNumber;
      data.endPosition = data.startPosition + data.blockToRead;

      // Determine the start file number (0 or 2 in our sample):
      // >0<,1000,>2000<,3000,4000
      //
      long totalFileSize = 0L;
      for ( int i = 0; i < data.fileSizes.size(); i++ ) {
        long size = data.fileSizes.get( i );

        // Start of file range: totalFileSize
        // End of file range: totalFileSize + size

        if ( data.startPosition >= totalFileSize && data.startPosition < totalFileSize + size ) {
          // This is the file number to start reading from...
          //
          data.filenr = i;

          // remember where we started to read to allow us to know that we have to skip the header row
          // in the next files (if any)
          //
          data.startFilenr = i;

          // How many bytes do we skip in that first file?
          //
          if ( data.startPosition == 0 ) {
            data.bytesToSkipInFirstFile = 0L;
          } else {
            data.bytesToSkipInFirstFile = data.startPosition - totalFileSize;
          }

          break;
        }
        totalFileSize += size;
      }

      if ( data.filenames.length > 0 ) {
        logBasic( BaseMessages.getString( PKG, "CsvInput.Log.ParallelFileNrAndPositionFeedback",
          data.filenames[ data.filenr ], Long.toString( data.fileSizes.get( data.filenr ) ),
          Long.toString( data.bytesToSkipInFirstFile ), Long.toString( data.blockToRead ) ) );
      }
    } catch ( Exception e ) {
      throw new KettleException( BaseMessages.getString( PKG, "CsvInput.Exception.ErrorPreparingParallelRun" ), e );
    }
  }

  private void getFilenamesFromPreviousSteps() throws KettleException {
    List<String> filenames = new ArrayList<String>();
    boolean firstRow = true;
    int index = -1;
    Object[] row = getRow();
    while ( row != null ) {
      if ( firstRow ) {
        firstRow = false;

        // Get the filename field index...
        //
        String filenameField = environmentSubstitute( meta.getFilenameField() );
        index = getInputRowMeta().indexOfValue( filenameField );
        if ( index < 0 ) {
          throw new KettleException( BaseMessages.getString(
            PKG, "CsvInput.Exception.FilenameFieldNotFound", filenameField ) );
        }
      }

      String filename = getInputRowMeta().getString( row, index );
      filenames.add( filename ); // add it to the list...

      row = getRow(); // Grab another row...
    }

    data.filenames = filenames.toArray( new String[ filenames.size() ] );

    logBasic( BaseMessages.getString( PKG, "CsvInput.Log.ReadingFromNrFiles",
      Integer.toString( data.filenames.length ) ) );
  }

  @Override
  public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
    try {
      // Close the previous file...
      //
      if ( data.fc != null ) {
        data.fc.close();
      }
    } catch ( Exception e ) {
      logError( "Error closing file channel", e );
    }

    try {
      if ( data.fis != null ) {
        data.fis.close();
      }
    } catch ( Exception e ) {
      logError( "Error closing file input stream", e );
    }

    super.dispose( smi, sdi );
  }
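  /**
   * Closes the current file (if any) and opens the next one in the list: builds the field mapping,
   * sets up the NIO channel and buffer, positions for a parallel read, optionally registers a result
   * file, and skips the header row where appropriate. Returns false when there are no more files.
   */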
  private boolean openNextFile() throws KettleException {
    try {

      // Close the previous file...
      //
      data.closeFile();

      if ( data.filenr >= data.filenames.length ) {
        return false;
      }

      // Open the next one...
      //
      data.fieldsMapping = createFieldMapping( data.filenames[ data.filenr ], meta );
      FileObject fileObject = KettleVFS.getFileObject( data.filenames[ data.filenr ], getTransMeta() );
      if ( !( fileObject instanceof LocalFile ) ) {
        // We can only use NIO on local files at the moment, so that's what we limit ourselves to.
        //
        throw new KettleException( BaseMessages.getString( PKG, "CsvInput.Log.OnlyLocalFilesAreSupported" ) );
      }

      if ( meta.isLazyConversionActive() ) {
        data.binaryFilename = data.filenames[ data.filenr ].getBytes();
      }

      data.fis = new FileInputStream( KettleVFS.getFilename( fileObject ) );
      data.fc = data.fis.getChannel();
      data.bb = ByteBuffer.allocateDirect( data.preferredBufferSize );

      // If we are running in parallel and we need to skip bytes in the first file, let's do so here.
      //
      if ( data.parallel ) {
        if ( data.bytesToSkipInFirstFile > 0 ) {
          data.fc.position( data.bytesToSkipInFirstFile );

          // evaluate whether there is a need to skip a row
          if ( needToSkipRow() ) {
            readOneRow( true, true );
          }
        }
      }

      // Add filename to result filenames?
      if ( meta.isAddResultFile() ) {
        ResultFile resultFile =
          new ResultFile( ResultFile.FILE_TYPE_GENERAL, fileObject, getTransMeta().getName(), toString() );
        resultFile.setComment( "File was read by a Csv input step" );
        addResultFile( resultFile );
      }

      // Move to the next filename
      //
      data.filenr++;

      // See if we need to skip a row...
      // - if a header row is present and we're not running in parallel
      // - if a header row is present, we're running in parallel and we're at the beginning of a file
      //
      if ( meta.isHeaderPresent() ) {
        // Standard flat file: skip header
        if ( !data.parallel || data.bytesToSkipInFirstFile <= 0 ) {
          readOneRow( true, false ); // skip this row.
          logBasic( BaseMessages.getString( PKG, "CsvInput.Log.HeaderRowSkipped",
            data.filenames[ data.filenr - 1 ] ) );
        }
      }

      // Reset the row number pointer...
      //
      data.rowNumber = 1L;

      // Don't skip again in the next file...
      //
      data.bytesToSkipInFirstFile = -1L;

      return true;
    } catch ( KettleException e ) {
      throw e;
    } catch ( Exception e ) {
      throw new KettleException( e );
    }
  }
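  /**
   * Builds the mapping between the columns found in the file and the fields defined on the step:
   * by header name when a header row is present, otherwise purely by position.
   */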
  FieldsMapping createFieldMapping( String fileName, CsvInputMeta csvInputMeta ) throws KettleException {
    FieldsMapping mapping = null;
    if ( csvInputMeta.isHeaderPresent() ) {
      String[] fieldNames = readFieldNamesFromFile( fileName, csvInputMeta );
      mapping = NamedFieldsMapping.mapping( fieldNames, fieldNames( csvInputMeta ) );
    } else {
      int fieldsCount = csvInputMeta.getInputFields() == null ? 0 : csvInputMeta.getInputFields().length;
      mapping = UnnamedFieldsMapping.mapping( fieldsCount );
    }
    return mapping;
  }

  String[] readFieldNamesFromFile( String fileName, CsvInputMeta csvInputMeta ) throws KettleException {
    String delimiter = environmentSubstitute( csvInputMeta.getDelimiter() );
    String enclosure = environmentSubstitute( csvInputMeta.getEnclosure() );
    String realEncoding = environmentSubstitute( csvInputMeta.getEncoding() );

    try ( FileObject fileObject = KettleVFS.getFileObject( fileName, getTransMeta() );
        InputStream inputStream = KettleVFS.getInputStream( fileObject ) ) {
      InputStreamReader reader = null;
      if ( Utils.isEmpty( realEncoding ) ) {
        reader = new InputStreamReader( inputStream );
      } else {
        reader = new InputStreamReader( inputStream, realEncoding );
      }
      EncodingType encodingType = EncodingType.guessEncodingType( reader.getEncoding() );
      String line =
        TextFileInput.getLine( log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX,
          new StringBuilder( 1000 ) );
      // remove BOM
      boolean containsBOM = line.indexOf( "\uFEFF" ) == 0;
      if ( containsBOM ) {
        line = line.substring( 1 );
      }
      String[] fieldNames =
        CsvInput.guessStringsFromLine( log, line, delimiter, enclosure, csvInputMeta.getEscapeCharacter() );
      if ( !Utils.isEmpty( csvInputMeta.getEnclosure() ) ) {
        removeEnclosure( fieldNames, csvInputMeta.getEnclosure() );
      }
      return fieldNames;
    } catch ( IOException e ) {
      throw new KettleFileException( BaseMessages.getString( PKG, "CsvInput.Exception.CreateFieldMappingError" ), e );
    }
  }

  static String[] fieldNames( CsvInputMeta csvInputMeta ) {
    TextFileInputField[] fields = csvInputMeta.getInputFields();
    String[] fieldNames = new String[ fields.length ];
    for ( int i = 0; i < fields.length; i++ ) {
      fieldNames[ i ] = fields[ i ].getName();
    }
    return fieldNames;
  }

  static void removeEnclosure( String[] fields, String enclosure ) {
    for ( int i = 0; i < fields.length; i++ ) {
      if ( fields[ i ].startsWith( enclosure ) && fields[ i ].endsWith( enclosure ) && fields[ i ].length() > 1 ) {
        fields[ i ] = fields[ i ].substring( 1, fields[ i ].length() - 1 );
      }
    }
  }
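  // Illustrative boundary case (assumed layout): suppose the block assigned to the previous step copy
  // ends exactly on the '\r' of a Windows-style "\r\n" line end. This copy then starts on the '\n',
  // which still belongs to the previous line, so the partial line must be skipped anyway.
  // needToSkipRow() below detects exactly this kind of situation.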
  /**
   * We need to skip a row only if the line we are currently on was <b>partly</b> read by the previous
   * step copy. In other words, we DON'T skip a line if we are just beginning to read it from its first
   * character. We have to do some work for this: read the last byte of the previous block and make sure
   * that it is a new line byte. But that's not enough. A new line can also be indicated by the '\r\n'
   * sequence, and if we are <b>between</b> those two characters, we want to skip the last '\n' and not
   * include it in our line.
   *
   * So, we DON'T skip a line only if the previous character is a new line indicator AND we are not
   * between '\r' and '\n'.
   */
  private boolean needToSkipRow() {
    try {
      // first we move the pointer to the last byte of the previous block
      data.fc.position( data.fc.position() - 1 );
      // read data, if not read yet
      data.resizeBufferIfNeeded();

      // check whether the last symbol of the previous block is a new line
      if ( data.newLineFound() ) {
        // don't increase bytes read for this step, as it is actually content of another step
        // and we are reading this just for evaluation.
        data.moveEndBufferPointer( false );
        // now we are at the first char of our block.
        // there is still a situation we want to avoid: a Windows style "\r\n" with us between the two
        // chars. In this case we need to skip a line. Otherwise we don't skip it.
        return data.newLineFound();
      } else {
        // moving to the first char of our line.
        data.moveEndBufferPointer( false );
      }
    } catch ( IOException e ) {
      logError( "Error evaluating whether to skip a partially read row", e );
    } finally {
      try {
        data.fc.position( data.fc.position() + 1 );
      } catch ( IOException e ) {
        // nothing to do here
      }
    }
    return true;
  }
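  // Illustrative parse (assumed delimiter ';' and enclosure '"'): the line
  //   "a;b";c
  // produces two fields, a;b and c. A doubled enclosure inside an enclosed field ("") is treated as
  // an escaped enclosure and collapsed to a single " by data.removeEscapedEnclosures().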
  /**
   * Read a single row of data from the file...
   *
   * @param skipRow          if the row should be skipped: a header row, or part of a row in case of a
   *                         parallel read
   * @param ignoreEnclosures if enclosures should be ignored, i.e. in case we need to skip part of the
   *                         row during a parallel read
   * @return a row of data...
   * @throws KettleException
   */
  private Object[] readOneRow( boolean skipRow, boolean ignoreEnclosures ) throws KettleException {

    try {

      Object[] outputRowData = RowDataUtil.allocateRowData( data.outputRowMeta.size() );
      int outputIndex = 0;
      boolean newLineFound = false;
      boolean endOfBuffer = false;
      List<Exception> conversionExceptions = null;
      List<ValueMetaInterface> exceptionFields = null;

      // The strategy is as follows...
      // We read a block of byte[] from the file.
      // We scan for the separators in the file (NOT for line feeds etc)
      // Then we scan that block of data.
      // We keep a byte[] that we extend if needed..
      // At the end of the block we read another, etc.
      //
      // Let's start by looking where we left off reading.
      //
      while ( !newLineFound && outputIndex < data.fieldsMapping.size() ) {

        if ( data.resizeBufferIfNeeded() ) {
          // Last row was being discarded if the last item is null and
          // there is no end of line delimiter
          if ( outputRowData != null ) {
            // Make certain that at least one record exists before
            // filling the rest of them with null
            if ( outputIndex > 0 ) {
              // Optionally add the current filename to the mix as well...
              //
              if ( meta.isIncludingFilename() && !Utils.isEmpty( meta.getFilenameField() ) ) {
                if ( meta.isLazyConversionActive() ) {
                  outputRowData[ data.filenameFieldIndex ] = data.binaryFilename;
                } else {
                  outputRowData[ data.filenameFieldIndex ] = data.filenames[ data.filenr - 1 ];
                }
              }

              if ( data.isAddingRowNumber ) {
                outputRowData[ data.rownumFieldIndex ] = data.rowNumber++;
              }

              incrementLinesInput();
              return outputRowData;
            }
          }

          return null; // nothing more to read, call it a day.
        }

        // OK, at this point we should have data in the byteBuffer and we should be able to scan for the next
        // delimiter (;)
        // So let's look for a delimiter.
        // Also skip over the enclosures ("), it is NOT taking into account escaped enclosures.
        // Later we can add an option for having escaped or double enclosures in the file. <sigh>
        //
        boolean delimiterFound = false;
        boolean enclosureFound = false;
        boolean doubleLineEnd = false;
        int escapedEnclosureFound = 0;
        boolean ignoreEnclosuresInField = ignoreEnclosures;
        while ( !delimiterFound && !newLineFound && !endOfBuffer ) {
          // If we find the first char, we might find others as well ;-)
          // Single byte delimiters only for now.
          //
          if ( data.delimiterFound() ) {
            delimiterFound = true;
          } else if ( ( !meta.isNewlinePossibleInFields() || outputIndex == data.fieldsMapping.size() - 1 )
            && data.newLineFound() ) {
            // Perhaps we found a (premature) new line?
            //
            // In case we are not using an enclosure and in case fields contain new lines
            // we need to make sure that we check the newlines possible flag.
            // If the flag is enabled we skip newline checking, except for the last field in the row.
            // In that one we can't support newlines without enclosure (handled below).
            //
            newLineFound = true;

            // Skip new line character
            for ( int i = 0; i < data.encodingType.getLength(); i++ ) {
              data.moveEndBufferPointer();
            }

            // Re-check for double new line (\r\n)...
            if ( data.newLineFound() ) {
              // Found another one, need to skip it later
              doubleLineEnd = true;
            }
          } else if ( data.enclosureFound() && !ignoreEnclosuresInField ) {
            int enclosurePosition = data.getEndBuffer();
            int fieldFirstBytePosition = data.getStartBuffer();
            if ( fieldFirstBytePosition == enclosurePosition ) {
              // Perhaps we need to skip over an enclosed part?
              // We always expect exactly one enclosure character
              // If we find the enclosure doubled, we consider it escaped.
              // --> "" is converted to " later on.
              //
              enclosureFound = true;
              boolean keepGoing;
              do {
                if ( data.moveEndBufferPointer() ) {
                  enclosureFound = false;
                  break;
                }
                keepGoing = !data.enclosureFound();
                if ( !keepGoing ) {
                  // We found an enclosure character.
                  // Read another byte...
                  if ( !data.endOfBuffer() && data.moveEndBufferPointer() ) {
                    break;
                  }
                  if ( data.enclosure.length > 1 ) {
                    data.moveEndBufferPointer();
                  }
                  // If this character is also an enclosure, we can consider the enclosure "escaped".
                  // As such, if this is an enclosure, we keep going...
                  //
                  keepGoing = data.enclosureFound();
                  if ( keepGoing ) {
                    escapedEnclosureFound++;
                  }
                }
              } while ( keepGoing );

              // Did we reach the end of the buffer?
              //
              if ( data.endOfBuffer() ) {
                endOfBuffer = true;
                break;
              }
            } else {
              // Ignoring the enclosure if it's not at the field start
              ignoreEnclosuresInField = true;
            }
          } else {
            if ( data.moveEndBufferPointer() ) {
              endOfBuffer = true;
              break;
            }
          }
        }

        // If we're still here, we found a delimiter...
        // Since the starting point never really changed, we can just grab the range:
        //
        //    [startBuffer - endBuffer[
        //
        // This is the part we want.
        // data.byteBuffer[data.startBuffer]
        //
        byte[] field = data.getField( delimiterFound, enclosureFound, newLineFound, endOfBuffer );

        // Did we have any escaped characters in there?
        //
        if ( escapedEnclosureFound > 0 ) {
          if ( log.isRowLevel() ) {
            logRowlevel( "Escaped enclosures found in " + new String( field ) );
          }
          field = data.removeEscapedEnclosures( field, escapedEnclosureFound );
        }

        final int currentFieldIndex = outputIndex++;
        final int actualFieldIndex = data.fieldsMapping.fieldMetaIndex( currentFieldIndex );
        if ( actualFieldIndex != FieldsMapping.FIELD_DOES_NOT_EXIST ) {
          if ( !skipRow ) {
            if ( meta.isLazyConversionActive() ) {
              outputRowData[ actualFieldIndex ] = field;
            } else {
              // We're not lazy so we convert the data right here and now.
              // The convert object uses binary storage as such we just have to ask the native type from it.
              // That will do the actual conversion.
              //
              ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta( actualFieldIndex );
              try {
                outputRowData[ actualFieldIndex ] = sourceValueMeta.convertBinaryStringToNativeType( field );
              } catch ( KettleValueException e ) {
                // There was a conversion error,
                //
                outputRowData[ actualFieldIndex ] = null;

                if ( conversionExceptions == null ) {
                  conversionExceptions = new ArrayList<Exception>();
                  exceptionFields = new ArrayList<ValueMetaInterface>();
                }

                conversionExceptions.add( e );
                exceptionFields.add( sourceValueMeta );
              }
            }
          } else {
            outputRowData[ actualFieldIndex ] = null; // nothing for the header, no conversions here.
          }
        }
        // OK, move on to the next field...
        // PDI-8187: Before we increment, we should check to see if the while condition is about to fail.
        // This will prevent the endBuffer from being incremented twice (once by this block and once in the
        // do-while loop below) and possibly skipping a newline character. This can occur if there is an
        // empty column at the end of the row (see the Jira case for details)
        if ( ( !newLineFound && outputIndex < data.fieldsMapping.size() )
          || ( newLineFound && doubleLineEnd ) ) {

          int i = 0;
          while ( ( !data.newLineFound() && ( i < data.delimiter.length ) ) ) {
            data.moveEndBufferPointer();
            i++;
          }

          if ( data.newLineFound() && outputIndex >= data.fieldsMapping.size() ) {
            data.moveEndBufferPointer();
          }

          if ( doubleLineEnd && data.encodingType.getLength() > 1 ) {
            data.moveEndBufferPointer();
          }
        }

        data.setStartBuffer( data.getEndBuffer() );
      }

      // See if we reached the end of the line.
      // If not, we need to skip the remaining items on the line until the next newline...
      //
      if ( !newLineFound && !data.resizeBufferIfNeeded() ) {
        do {
          data.moveEndBufferPointer();
          if ( data.resizeBufferIfNeeded() ) {
            break; // nothing more to read.
          }
          // TODO: if we're using quoting we might be dealing with a very dirty file with quoted newlines in
          // trailing fields. (imagine that)
          // In that particular case we want to use the same logic we use above (refactored a bit) to skip
          // these fields.
        } while ( !data.newLineFound() );

        if ( !data.resizeBufferIfNeeded() ) {
          while ( data.newLineFound() ) {
            data.moveEndBufferPointer();
            if ( data.resizeBufferIfNeeded() ) {
              break; // nothing more to read.
            }
          }
        }

        // Make sure we start at the right position the next time around.
        data.setStartBuffer( data.getEndBuffer() );
      }

      // Optionally add the current filename to the mix as well...
      //
      if ( meta.isIncludingFilename() && !Utils.isEmpty( meta.getFilenameField() ) ) {
        if ( meta.isLazyConversionActive() ) {
          outputRowData[ data.filenameFieldIndex ] = data.binaryFilename;
        } else {
          outputRowData[ data.filenameFieldIndex ] = data.filenames[ data.filenr - 1 ];
        }
      }

      if ( data.isAddingRowNumber ) {
        outputRowData[ data.rownumFieldIndex ] = data.rowNumber++;
      }

      if ( !ignoreEnclosures ) {
        incrementLinesInput();
      }

      if ( conversionExceptions != null && conversionExceptions.size() > 0 ) {
        // Forward the first exception
        //
        throw new KettleConversionException( "There were " + conversionExceptions.size()
          + " conversion errors on line " + getLinesInput(), conversionExceptions, exceptionFields, outputRowData );
      }

      return outputRowData;
    } catch ( KettleConversionException e ) {
      throw e;
    } catch ( IOException e ) {
      throw new KettleFileException( "Exception reading line using NIO", e );
    }
  }
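  /**
   * Initializes the step: resolves the buffer size, encoding, delimiter and enclosure (honoring
   * variables), determines whether filenames come from the step settings or from previous steps, sets
   * up parallel-read bookkeeping, and picks the most efficient delimiter/enclosure/CR-LF matchers.
   */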
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (CsvInputMeta) smi;
    data = (CsvInputData) sdi;

    if ( super.init( smi, sdi ) ) {
      // PDI-10242 see if a variable is used as encoding value
      String realEncoding = environmentSubstitute( meta.getEncoding() );
      data.preferredBufferSize = Integer.parseInt( environmentSubstitute( meta.getBufferSize() ) );

      // If the step doesn't have any previous steps, we just get the filename.
      // Otherwise, we'll grab the list of file names later...
      //
      if ( getTransMeta().findNrPrevSteps( getStepMeta() ) == 0 ) {
        String filename = environmentSubstitute( meta.getFilename() );

        if ( Utils.isEmpty( filename ) ) {
          logError( BaseMessages.getString( PKG, "CsvInput.MissingFilename.Message" ) );
          return false;
        }

        data.filenames = new String[] { filename, };
      } else {
        data.filenames = null;
        data.filenr = 0;
      }

      data.totalBytesRead = 0L;

      data.encodingType = EncodingType.guessEncodingType( realEncoding );

      // PDI-2489 - set the delimiter byte value to the code point of the
      // character as represented in the input file's encoding
      try {
        data.delimiter = data.encodingType.getBytes( environmentSubstitute( meta.getDelimiter() ), realEncoding );

        if ( Utils.isEmpty( meta.getEnclosure() ) ) {
          data.enclosure = null;
        } else {
          data.enclosure = data.encodingType.getBytes( environmentSubstitute( meta.getEnclosure() ), realEncoding );
        }

      } catch ( UnsupportedEncodingException e ) {
        logError( BaseMessages.getString( PKG, "CsvInput.BadEncoding.Message" ), e );
        return false;
      }

      data.isAddingRowNumber = !Utils.isEmpty( meta.getRowNumField() );

      // Handle parallel reading capabilities...
      //
      data.stopReading = false;

      if ( meta.isRunningInParallel() ) {
        data.stepNumber = getUniqueStepNrAcrossSlaves();
        data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();

        // We are not handling a single file, but possibly a list of files...
        // As such, the fair thing to do is calculate the total size of the files
        // Then read the required block.
        //
        data.fileSizes = new ArrayList<Long>();
        data.totalFileSize = 0L;
      }

      // Set the most efficient pattern matcher to match the delimiter.
      //
      if ( data.delimiter.length == 1 ) {
        data.delimiterMatcher = new SingleBytePatternMatcher();
      } else {
        data.delimiterMatcher = new MultiBytePatternMatcher();
      }

      // Set the most efficient pattern matcher to match the enclosure.
      //
      if ( data.enclosure == null ) {
        data.enclosureMatcher = new EmptyPatternMatcher();
      } else {
        if ( data.enclosure.length == 1 ) {
          data.enclosureMatcher = new SingleBytePatternMatcher();
        } else {
          data.enclosureMatcher = new MultiBytePatternMatcher();
        }
      }

      switch ( data.encodingType ) {
        case DOUBLE_BIG_ENDIAN:
          data.crLfMatcher = new MultiByteBigCrLfMatcher();
          break;
        case DOUBLE_LITTLE_ENDIAN:
          data.crLfMatcher = new MultiByteLittleCrLfMatcher();
          break;
        default:
          data.crLfMatcher = new SingleByteCrLfMatcher();
          break;
      }

      return true;
    }
    return false;
  }
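  // Usage sketch (illustrative values): with delimiter ";", enclosure "\"" and no escape character,
  //   guessStringsFromLine( log, "\"aa;aa\";123", ";", "\"", null )
  // returns { "aa;aa", "123" }: the enclosed field keeps its embedded delimiter.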
  /**
   * This method is borrowed from TextFileInput
   *
   * @param log             logger
   * @param line            line to analyze
   * @param delimiter       delimiter used
   * @param enclosure       enclosure used
   * @param escapeCharacter escape character used
   * @return list of strings detected
   * @throws KettleException
   */
  public static String[] guessStringsFromLine( LogChannelInterface log, String line, String delimiter,
    String enclosure, String escapeCharacter ) throws KettleException {
    List<String> strings = new ArrayList<String>();

    String pol; // piece of line

    try {
      if ( line == null ) {
        return null;
      }

      // Split string in pieces, only for CSV!

      int pos = 0;
      int length = line.length();
      boolean dencl = false;

      int len_encl = ( enclosure == null ? 0 : enclosure.length() );
      int len_esc = ( escapeCharacter == null ? 0 : escapeCharacter.length() );

      while ( pos < length ) {
        int from = pos;
        int next;

        boolean encl_found;
        boolean contains_escaped_enclosures = false;
        boolean contains_escaped_separators = false;

        // Is the field beginning with an enclosure?
        // "aa;aa";123;"aaa-aaa";000;...
        //
        if ( len_encl > 0 && line.substring( from, from + len_encl ).equalsIgnoreCase( enclosure ) ) {
          if ( log.isRowLevel() ) {
            log.logRowlevel( BaseMessages.getString( PKG, "CsvInput.Log.ConvertLineToRowTitle" ), BaseMessages
              .getString( PKG, "CsvInput.Log.ConvertLineToRow", line.substring( from, from + len_encl ) ) );
          }
          encl_found = true;
          int p = from + len_encl;

          boolean is_enclosure =
            len_encl > 0 && p + len_encl < length
              && line.substring( p, p + len_encl ).equalsIgnoreCase( enclosure );
          boolean is_escape =
            len_esc > 0 && p + len_esc < length
              && line.substring( p, p + len_esc ).equalsIgnoreCase( escapeCharacter );

          boolean enclosure_after = false;

          // Is it really an enclosure? See if it's not repeated twice or escaped!
          if ( ( is_enclosure || is_escape ) && p < length - 1 ) {
            String strnext = line.substring( p + len_encl, p + 2 * len_encl );
            if ( strnext.equalsIgnoreCase( enclosure ) ) {
              p++;
              enclosure_after = true;
              dencl = true;

              // Remember to replace them later on!
              if ( is_escape ) {
                contains_escaped_enclosures = true;
              }
            }
          }

          // Look for a closing enclosure!
          while ( ( !is_enclosure || enclosure_after ) && p < line.length() ) {
            p++;
            enclosure_after = false;
            is_enclosure =
              len_encl > 0 && p + len_encl < length && line.substring( p, p + len_encl ).equals( enclosure );
            is_escape =
              len_esc > 0 && p + len_esc < length && line.substring( p, p + len_esc ).equals( escapeCharacter );

            // Is it really an enclosure? See if it's not repeated twice or escaped!
            if ( ( is_enclosure || is_escape ) && p < length - 1 ) {

              String strnext = line.substring( p + len_encl, p + 2 * len_encl );
              if ( strnext.equals( enclosure ) ) {
                p++;
                enclosure_after = true;
                dencl = true;

                // Remember to replace them later on!
                if ( is_escape ) {
                  contains_escaped_enclosures = true; // remember
                }
              }
            }
          }

          if ( p >= length ) {
            next = p;
          } else {
            next = p + len_encl;
          }

          if ( log.isRowLevel() ) {
            log.logRowlevel( BaseMessages.getString( PKG, "CsvInput.Log.ConvertLineToRowTitle" ), BaseMessages
              .getString( PKG, "CsvInput.Log.EndOfEnclosure", "" + p ) );
          }
        } else {
          encl_found = false;
          boolean found = false;
          int startpoint = from;
          do {
            next = line.indexOf( delimiter, startpoint );

            // See if this position is preceded by an escape character.
            if ( len_esc > 0 && next - len_esc > 0 ) {
              String before = line.substring( next - len_esc, next );

              if ( escapeCharacter != null && escapeCharacter.equals( before ) ) {
                // take the next separator, this one is escaped...
                startpoint = next + 1;
                contains_escaped_separators = true;
              } else {
                found = true;
              }
            } else {
              found = true;
            }
          } while ( !found && next >= 0 );
        }
        if ( next == -1 ) {
          next = length;
        }

        if ( encl_found ) {
          pol = line.substring( from + len_encl, next - len_encl );
          if ( log.isRowLevel() ) {
            log.logRowlevel( BaseMessages.getString( PKG, "CsvInput.Log.ConvertLineToRowTitle" ), BaseMessages
              .getString( PKG, "CsvInput.Log.EnclosureFieldFound", "" + pol ) );
          }
        } else {
          pol = line.substring( from, next );
          if ( log.isRowLevel() ) {
            log.logRowlevel( BaseMessages.getString( PKG, "CsvInput.Log.ConvertLineToRowTitle" ), BaseMessages
              .getString( PKG, "CsvInput.Log.NormalFieldFound", "" + pol ) );
          }
        }

        if ( dencl ) {
          StringBuilder sbpol = new StringBuilder( pol );
          int idx = sbpol.indexOf( enclosure + enclosure );
          while ( idx >= 0 ) {
            sbpol.delete( idx, idx + ( enclosure == null ? 0 : enclosure.length() ) );
            idx = sbpol.indexOf( enclosure + enclosure );
          }
          pol = sbpol.toString();
        }
        // replace the escaped enclosures with enclosures...
        if ( contains_escaped_enclosures ) {
          String replace = escapeCharacter + enclosure;
          pol = Const.replace( pol, replace, enclosure );
        }

        // replace the escaped separators with separators...
        if ( contains_escaped_separators ) {
          String replace = escapeCharacter + delimiter;
          pol = Const.replace( pol, replace, delimiter );
        }

        // Now add pol to the strings found!
        strings.add( pol );

        pos = next + delimiter.length();
      }
      if ( pos == length ) {
        if ( log.isRowLevel() ) {
          log.logRowlevel( BaseMessages.getString( PKG, "CsvInput.Log.ConvertLineToRowTitle" ), BaseMessages
            .getString( PKG, "CsvInput.Log.EndOfEmptyLineFound" ) );
        }
        strings.add( "" );
      }
    } catch ( Exception e ) {
      throw new KettleException( BaseMessages.getString( PKG, "CsvInput.Log.Error.ErrorConvertingLine", e
        .toString() ), e );
    }

    return strings.toArray( new String[ strings.size() ] );
  }
}