/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.fileinput; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.commons.vfs2.FileObject; import org.apache.commons.vfs2.FileSystemException; import org.pentaho.di.core.Const; import org.pentaho.di.core.util.Utils; import org.pentaho.di.core.Result; import org.pentaho.di.core.ResultFile; import org.pentaho.di.core.RowSet; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.exception.KettleFileException; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMeta; import org.pentaho.di.core.row.RowMetaInterface; import org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.core.vfs.KettleVFS; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepMeta; import 
org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.step.errorhandling.CompositeFileErrorHandler;
import org.pentaho.di.trans.step.errorhandling.FileErrorHandler;
import org.pentaho.di.trans.step.errorhandling.FileErrorHandlerContentLineNumber;
import org.pentaho.di.trans.step.errorhandling.FileErrorHandlerMissingFiles;

/**
 * This class contains base functionality for file-based input steps.
 *
 * Concrete subclasses supply the metadata/data types and a per-file reader; this class drives the
 * open-file / read-row / close-file loop, error handling and result-file bookkeeping.
 *
 * @author Alexander Buloichik
 */
public abstract class BaseFileInputStep<M extends BaseFileInputStepMeta, D extends BaseFileInputStepData>
    extends BaseStep implements IBaseFileInputStepControl {

  // for i18n purposes, needed by Translator2!! TODO: is it right for base class ???
  private static Class<?> PKG = BaseFileInputStep.class;

  protected M meta;

  protected D data;

  /**
   * Content-dependent initialization.
   */
  protected abstract boolean init();

  /**
   * Create reader for specific file.
   */
  protected abstract IBaseFileInputReader createReader( M meta, D data, FileObject file ) throws Exception;

  public BaseFileInputStep( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
      Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  /**
   * Initialize step before execute.
   *
   * Resolves the file list, validates that files exist (unless errors are ignored or filenames come
   * from a previous step), sets up error-file handlers, then delegates to the subclass {@link #init()}.
   */
  @Override
  @SuppressWarnings( "unchecked" )
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (M) smi;
    data = (D) sdi;

    if ( !super.init( smi, sdi ) ) {
      return false;
    }

    initErrorHandling();

    meta.additionalOutputFields.normalize();
    data.files = meta.getFileInputList( this );
    data.currentFileIndex = 0;

    // If there are missing files, fail if we don't ignore errors.
    // When a previous result carries result files, the file list may legitimately be empty here.
    Result previousResult = getTrans().getPreviousResult();
    Map<String, ResultFile> resultFiles = ( previousResult != null ) ? previousResult.getResultFiles() : null;

    if ( ( previousResult == null || resultFiles == null || resultFiles.size() == 0 ) && data.files
        .nrOfMissingFiles() > 0 && !meta.inputFiles.acceptingFilenames && !meta.errorHandling.errorIgnored ) {
      logError( BaseMessages.getString( PKG, "TextFileInput.Log.Error.NoFilesSpecified" ) );
      return false;
    }

    String clusterSize = getVariable( Const.INTERNAL_VARIABLE_CLUSTER_SIZE );
    if ( !Utils.isEmpty( clusterSize ) && Integer.parseInt( clusterSize ) > 1 ) {
      // TODO: add metadata to configure this.
      String nr = getVariable( Const.INTERNAL_VARIABLE_SLAVE_SERVER_NUMBER );
      if ( log.isDetailed() ) {
        logDetailed( "Running on slave server #" + nr
            + " : assuming that each slave reads a dedicated part of the same file(s)." );
      }
    }

    return init();
  }

  /**
   * Open next VFS file for processing.
   *
   * This method will support different parallelization methods later.
   *
   * @return true when a file was opened and a reader created, false when there are no more files or
   *         opening failed (errors counted, transformation possibly stopped)
   */
  protected boolean openNextFile() {
    try {
      if ( data.currentFileIndex >= data.files.nrOfFiles() ) {
        // all files already processed
        return false;
      }

      data.file = data.files.getFile( data.currentFileIndex );
      data.filename = KettleVFS.getFilename( data.file );

      fillFileAdditionalFields( data, data.file );
      if ( meta.inputFiles.passingThruFields ) {
        data.currentPassThruFieldsRow = data.passThruFields.get( data.file );
      }

      // Add this file to the result of this transformation.
      if ( meta.inputFiles.isaddresult ) {
        ResultFile resultFile =
            new ResultFile( ResultFile.FILE_TYPE_GENERAL, data.file, getTransMeta().getName(), toString() );
        resultFile.setComment( "File was read by an Text File input step" );
        addResultFile( resultFile );
      }
      if ( log.isBasic() ) {
        logBasic( "Opening file: " + data.file.getName().getFriendlyURI() );
      }

      data.dataErrorLineHandler.handleFile( data.file );

      data.reader = createReader( meta, data, data.file );
    } catch ( Exception e ) {
      String errorMsg =
          "Couldn't open file #" + data.currentFileIndex + " : " + data.file.getName().getFriendlyURI() + " --> "
              + e.toString();
      logError( errorMsg );
      if ( failAfterBadFile( errorMsg ) ) {
        stopAll();
      }
      setErrors( getErrors() + 1 );
      return false;
    }

    // Move file pointer ahead!
    data.currentFileIndex++;

    return true;
  }

  /**
   * Process next row. This method opens the next file automatically.
   */
  @Override
  @SuppressWarnings( "unchecked" )
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (M) smi;
    data = (D) sdi;

    if ( first ) {
      first = false;
      prepareToRowProcessing();

      if ( !openNextFile() ) {
        setOutputDone(); // signal end to receiver(s)
        closeLastFile();
        return false;
      }
    }

    while ( true ) {
      if ( data.reader.readRow() ) {
        // row processed
        return true;
      }
      // end of current file
      closeLastFile();

      if ( !openNextFile() ) {
        // there are no more files
        break;
      }
    }

    // after all files processed
    setOutputDone(); // signal end to receiver(s)
    closeLastFile();
    return false;
  }

  /**
   * Prepare to process. Executed only first time row processing. It can't be possible to prepare to process in the
   * init() phase, because files can be in fields from previous step.
   */
  protected void prepareToRowProcessing() throws KettleException {
    data.outputRowMeta = new RowMeta();
    RowMetaInterface[] infoStep = null;

    if ( meta.inputFiles.acceptingFilenames ) {
      // input files from previous step
      infoStep = filesFromPreviousStep();
    }

    // get the metadata populated. Simple and easy.
    meta.getFields( data.outputRowMeta, getStepname(), infoStep, null, this, repository, metaStore );

    // Create convert meta-data objects that will contain Date & Number formatters
    data.convertRowMeta = data.outputRowMeta.cloneToType( ValueMetaInterface.TYPE_STRING );

    BaseFileInputStepUtils.handleMissingFiles( data.files, log, meta.errorHandling.errorIgnored,
        data.dataErrorLineHandler );

    // Count the number of repeat fields...
    for ( int i = 0; i < meta.inputFiles.inputFields.length; i++ ) {
      if ( meta.inputFiles.inputFields[i].isRepeated() ) {
        data.nr_repeats++;
      }
    }
  }

  // Re-exposed so the reader (IBaseFileInputStepControl) can trigger feedback from outside the step.
  @Override
  public boolean checkFeedback( long lines ) {
    return super.checkFeedback( lines );
  }

  /**
   * Initialize error handling.
   *
   * TODO: should we set charset for error files from content meta ? What about case for automatic charset ?
   */
  private void initErrorHandling() {
    List<FileErrorHandler> dataErrorLineHandlers = new ArrayList<FileErrorHandler>( 2 );
    if ( meta.errorHandling.lineNumberFilesDestinationDirectory != null ) {
      dataErrorLineHandlers.add( new FileErrorHandlerContentLineNumber( getTrans().getCurrentDate(),
          environmentSubstitute( meta.errorHandling.lineNumberFilesDestinationDirectory ),
          meta.errorHandling.lineNumberFilesExtension, meta.getEncoding(), this ) );
    }
    if ( meta.errorHandling.errorFilesDestinationDirectory != null ) {
      dataErrorLineHandlers.add( new FileErrorHandlerMissingFiles( getTrans().getCurrentDate(),
          environmentSubstitute( meta.errorHandling.errorFilesDestinationDirectory ),
          meta.errorHandling.errorFilesExtension, meta.getEncoding(), this ) );
    }
    data.dataErrorLineHandler = new CompositeFileErrorHandler( dataErrorLineHandlers );
  }

  /**
   * Read file names from the rows produced by a previous step.
   *
   * @return info-step row metadata when pass-thru fields are enabled, otherwise null
   */
  private RowMetaInterface[] filesFromPreviousStep() throws KettleException {
    RowMetaInterface[] infoStep = null;

    data.files.getFiles().clear();

    int idx = -1;
    RowSet rowSet = findInputRowSet( meta.inputFiles.acceptingStepName );

    Object[] fileRow = getRowFrom( rowSet );
    while ( fileRow != null ) {
      RowMetaInterface prevInfoFields = rowSet.getRowMeta();
      if ( idx < 0 ) {
        // resolve the filename field index only once, from the first row
        if ( meta.inputFiles.passingThruFields ) {
          data.passThruFields = new HashMap<FileObject, Object[]>();
          infoStep = new RowMetaInterface[] { prevInfoFields };
          data.nrPassThruFields = prevInfoFields.size();
        }
        idx = prevInfoFields.indexOfValue( meta.inputFiles.acceptingField );
        if ( idx < 0 ) {
          logError( BaseMessages.getString( PKG, "TextFileInput.Log.Error.UnableToFindFilenameField",
              meta.inputFiles.acceptingField ) );
          setErrors( getErrors() + 1 );
          stopAll();
          return null;
        }
      }
      String fileValue = prevInfoFields.getString( fileRow, idx );
      try {
        FileObject fileObject = KettleVFS.getFileObject( fileValue, getTransMeta() );
        data.files.addFile( fileObject );
        if ( meta.inputFiles.passingThruFields ) {
          data.passThruFields.put( fileObject, fileRow );
        }
      } catch ( KettleFileException e ) {
        logError( BaseMessages.getString( PKG, "TextFileInput.Log.Error.UnableToCreateFileObject", fileValue ), e );
      }

      // Grab another row
      fileRow = getRowFrom( rowSet );
    }

    if ( data.files.nrOfFiles() == 0 ) {
      if ( log.isDetailed() ) {
        logDetailed( BaseMessages.getString( PKG, "TextFileInput.Log.Error.NoFilesSpecified" ) );
      }
      return null;
    }
    return infoStep;
  }

  /**
   * Close last opened file.
   */
  protected void closeLastFile() {
    if ( data.reader != null ) {
      try {
        data.reader.close();
      } catch ( Exception ex ) {
        // best-effort close: count as a bad-file problem instead of propagating
        failAfterBadFile( "Error close reader" );
      }
      data.reader = null;
    }
    if ( data.file != null ) {
      try {
        data.file.close();
      } catch ( Exception ex ) {
        failAfterBadFile( "Error close file" );
      }
      data.file = null;
    }
  }

  /**
   * Dispose step.
   */
  @Override
  public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
    closeLastFile();
    super.dispose( smi, sdi );
  }

  /**
   * @param errorMsg
   *          Message to send to rejected row if enabled
   * @return If should stop processing after having problems with a file
   */
  public boolean failAfterBadFile( String errorMsg ) {
    if ( getStepMeta().isDoingErrorHandling() && data.filename != null
        && !data.rejectedFiles.containsKey( data.filename ) ) {
      // reject each bad file only once
      data.rejectedFiles.put( data.filename, true );
      rejectCurrentFile( errorMsg );
    }

    return !meta.errorHandling.errorIgnored || !meta.errorHandling.skipBadFiles;
  }

  /**
   * Send file name and/or error message to error output
   *
   * @param errorMsg
   *          Message to send to rejected row if enabled
   */
  private void rejectCurrentFile( String errorMsg ) {
    if ( StringUtils.isNotBlank( meta.errorHandling.fileErrorField )
        || StringUtils.isNotBlank( meta.errorHandling.fileErrorMessageField ) ) {
      RowMetaInterface rowMeta = getInputRowMeta();
      if ( rowMeta == null ) {
        rowMeta = new RowMeta();
      }

      int errorFileIndex = ( StringUtils.isBlank( meta.errorHandling.fileErrorField ) ) ? -1
          : BaseFileInputStepUtils.addValueMeta( getStepname(), rowMeta,
              this.environmentSubstitute( meta.errorHandling.fileErrorField ) );

      int errorMessageIndex = StringUtils.isBlank( meta.errorHandling.fileErrorMessageField ) ? -1
          : BaseFileInputStepUtils.addValueMeta( getStepname(), rowMeta,
              this.environmentSubstitute( meta.errorHandling.fileErrorMessageField ) );

      try {
        Object[] rowData = getRow();
        if ( rowData == null ) {
          rowData = RowDataUtil.allocateRowData( rowMeta.size() );
        }

        if ( errorFileIndex >= 0 ) {
          rowData[errorFileIndex] = data.filename;
        }
        if ( errorMessageIndex >= 0 ) {
          rowData[errorMessageIndex] = errorMsg;
        }

        putError( rowMeta, rowData, getErrors(), data.filename, null, "ERROR_CODE" );
      } catch ( Exception e ) {
        logError( "Error sending error row", e );
      }
    }
  }

  /**
   * Prepare file-dependent data for fill additional fields.
   */
  protected void fillFileAdditionalFields( D data, FileObject file ) throws FileSystemException {
    data.shortFilename = file.getName().getBaseName();
    data.path = KettleVFS.getFilename( file.getParent() );
    data.hidden = file.isHidden();
    data.extension = file.getName().getExtension();
    data.uriName = file.getName().getURI();
    data.rootUriName = file.getName().getRootURI();
    if ( file.getType().hasContent() ) {
      data.lastModificationDateTime = new Date( file.getContent().getLastModifiedTime() );
      data.size = file.getContent().getSize();
    } else {
      // e.g. a folder or imaginary file: no content-derived attributes available
      data.lastModificationDateTime = null;
      data.size = null;
    }
  }
}