/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.gpbulkloader;

//
// The "designer" notes of the Greenplum bulk loader:
// --------------------------------------------------
//
// - "Enclosed" is used in the loader instead of "optionally enclosed", as optional
//   enclosure would break the escaping.
// - A Boolean is output as Y and N (as in e.g. the text file output step). If people don't
//   like this they can first convert the boolean value to something else before loading it.
// - Filters (besides date and datetime) are not supported as they slow down loading.
//

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Performs a bulk load to a Greenplum table.
 *
 * Based on (copied from) Sven Boden's Oracle Bulk Loader step.
 *
 * @author Luke Lonergan
 * @since 28-mar-2008
 */
public class GPBulkLoader extends BaseStep implements StepInterface {
  private static Class<?> PKG = GPBulkLoaderMeta.class; // for i18n purposes, needed by Translator2!!

  Process psqlProcess = null;

  private GPBulkLoaderMeta meta;
  protected GPBulkLoaderData data;
  private GPBulkDataOutput output = null;

  private String enclosure;

  /*
   * Local copy of the transformation "preview" property. We only forward the rows upon previewing, we don't do any of
   * the real stuff.
   */
  private boolean preview = false;

  //
  // This class continually reads from the stream and sends it to the log
  // if the logging level is at least the basic level.
  //
  private final class StreamLogger extends Thread {
    private InputStream input;
    private String type;

    StreamLogger( InputStream is, String type ) {
      this.input = is;
      this.type = type + ">";
    }

    public void run() {
      try {
        final BufferedReader br = new BufferedReader( new InputStreamReader( input ) );
        String line;
        while ( ( line = br.readLine() ) != null ) {
          // Only perform the concatenation if at basic level. Otherwise,
          // this just reads from the stream.
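          // Keep reading even when nothing is logged: if the child psql process's output is
          // never consumed, the process can block once its stdout/stderr pipe buffer fills up.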
          if ( log.isBasic() ) {
            logBasic( type + line );
          }
        }
      } catch ( IOException ioe ) {
        ioe.printStackTrace();
      }
    }
  }

  public GPBulkLoader( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );

    if ( Const.getOS().startsWith( "Windows" ) ) {
      enclosure = "\"";
    } else {
      enclosure = "'";
    }
  }

  /**
   * Get the contents of the control file as specified in the meta object
   *
   * @param meta
   *          the meta object to model the control file after
   *
   * @return a string containing the control file contents
   */
  public String getControlFileContents( GPBulkLoaderMeta meta, RowMetaInterface rm, Object[] r ) throws KettleException {
    DatabaseMeta dm = meta.getDatabaseMeta();

    String inputName = "'" + environmentSubstitute( meta.getDataFile() ) + "'";

    // if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
    // {
    // // if loading is concurrent, the filename has to be a * as sqlldr will
    // // read from stdin.
    // inputName = "*";
    // }

    String loadAction = meta.getLoadAction();

    StringBuffer contents = new StringBuffer( 500 );

    String tableName =
      dm.getQuotedSchemaTableCombination(
        environmentSubstitute( meta.getSchemaName() ), environmentSubstitute( meta.getTableName() ) );

    // Create a Postgres / Greenplum COPY string for use with a psql client
    if ( loadAction.equalsIgnoreCase( "truncate" ) ) {
      contents.append( loadAction + " " );
      contents.append( tableName + ";" );
      contents.append( Const.CR );
    }
    contents.append( "\\COPY " );

    // Table name
    contents.append( tableName );

    // Names of columns
    contents.append( " ( " );

    String[] streamFields = meta.getFieldStream();
    String[] tableFields = meta.getFieldTable();

    if ( streamFields == null || streamFields.length == 0 ) {
      throw new KettleException( "No fields defined to load to database" );
    }

    for ( int i = 0; i < streamFields.length; i++ ) {
      if ( i != 0 ) {
        contents.append( ", " );
      }
      contents.append( dm.quoteField( tableFields[i] ) );
    }

    contents.append( " ) " );

    // The "FROM" filename
    contents.append( " FROM " );
    contents.append( inputName );

    // The "FORMAT" clause
    contents.append( " WITH CSV " );

    // The single row error handling
    contents.append( "LOG ERRORS INTO " );
    contents.append( tableName + "_errors " );
    contents.append( " SEGMENT REJECT LIMIT " );
    contents.append( meta.getMaxErrors() );

    // contents.append(" ERRORS=\'").append(meta.getMaxErrors()).append("\'").append(Const.CR);

    // contents.append("LOAD DATA").append(Const.CR).append(
    // "INFILE ").append(inputName).append(Const.CR).append(
    // "INTO TABLE ").append(dm.getQuotedSchemaTableCombination(environmentSubstitute(meta.getSchemaName()),
    // environmentSubstitute(meta.getTableName()))).append(
    // Const.CR).append(loadAction).append(Const.CR).append(
    // "FIELDS TERMINATED BY ',' ENCLOSED BY '\"'").append(Const.CR).append(
    // "(");
    //
    // String streamFields[] = meta.getFieldStream();
    // String tableFields[] = meta.getFieldTable();
    // String dateMask[] = meta.getDateMask();
    //
    // if ( streamFields == null || streamFields.length == 0 )
    // {
    // throw new KettleException("No fields defined to load to database");
    // }
    //
    // for (int i = 0; i < streamFields.length; i++)
    // {
    // if ( i!=0 ) contents.append(", ").append(Const.CR);
    // contents.append(dm.quoteField(tableFields[i]));
    //
    // int pos = rm.indexOfValue(streamFields[i]);
    // if (pos<0)
    // {
    // throw new KettleException("Could not find field " + streamFields[i] + " in stream");
    // }
    // ValueMetaInterface v = rm.getValueMeta(pos);
    // switch (
    //   v.getType() )
    // {
    // case ValueMetaInterface.TYPE_STRING:
    // if ( v.getLength() > 255 )
    // {
    // contents.append(" CHAR(").append(v.getLength()).append(")");
    // }
    // else
    // {
    // contents.append(" CHAR");
    // }
    // break;
    // case ValueMetaInterface.TYPE_INTEGER:
    // case ValueMetaInterface.TYPE_NUMBER:
    // case ValueMetaInterface.TYPE_BIGNUMBER:
    // break;
    // case ValueMetaInterface.TYPE_DATE:
    // if ( GPBulkLoaderMeta.DATE_MASK_DATE.equals(dateMask[i]) )
    // {
    // contents.append(" DATE 'yyyy-mm-dd'");
    // }
    // else if ( GPBulkLoaderMeta.DATE_MASK_DATETIME.equals(dateMask[i]) )
    // {
    // contents.append(" TIMESTAMP 'yyyy-mm-dd hh24:mi:ss.ff'");
    // }
    // else
    // {
    // // If not specified the default is date.
    // contents.append(" DATE 'yyyy-mm-dd'");
    // }
    // break;
    // case ValueMetaInterface.TYPE_BINARY:
    // contents.append(" ENCLOSED BY '<startlob>' AND '<endlob>'");
    // break;
    // }
    // }
    // contents.append(")");

    // if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
    // {
    // contents.append(Const.CR).append("BEGINDATA").append(Const.CR);
    // }

    return contents.toString();
  }

  /**
   * Create a control file.
   *
   * @param filename
   * @param meta
   * @throws KettleException
   */
  public void createControlFile( String filename, Object[] row, GPBulkLoaderMeta meta ) throws KettleException {
    File controlFile = new File( filename );
    FileWriter fw = null;

    try {
      controlFile.createNewFile();
      fw = new FileWriter( controlFile );
      fw.write( getControlFileContents( meta, getInputRowMeta(), row ) );
    } catch ( IOException ex ) {
      throw new KettleException( ex.getMessage(), ex );
    } finally {
      try {
        if ( fw != null ) {
          fw.close();
        }
      } catch ( Exception ex ) {
        // Ignore close errors
      }
    }
  }

  /**
   * Create the command line for a psql process depending on the meta information supplied.
   *
   * @param meta
   *          The meta data to create the command line from
   * @param password
   *          Use the real password or not
   *
   * @return The string to execute.
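   *         For a typical configuration the result looks roughly like
   *         {@code '/usr/bin/psql' -n -f '/tmp/control.dat' -U user -h localhost -p 5432 -d mydb}
   *         (executable path, file names and connection values here are purely illustrative).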
   *
   * @throws KettleException
   *           Upon any exception
   */
  public String createCommandLine( GPBulkLoaderMeta meta, boolean password ) throws KettleException {
    StringBuffer sb = new StringBuffer( 300 );

    if ( meta.getPsqlpath() != null ) {
      try {
        FileObject fileObject = KettleVFS.getFileObject( environmentSubstitute( meta.getPsqlpath() ), getTransMeta() );
        String psqlexec = KettleVFS.getFilename( fileObject );
        sb.append( enclosure ).append( psqlexec ).append( enclosure );
      } catch ( Exception ex ) {
        throw new KettleException( "Error retrieving psql string", ex );
      }
    } else {
      throw new KettleException( "No psql application specified" );
    }

    if ( meta.getControlFile() != null ) {
      try {
        FileObject fileObject =
          KettleVFS.getFileObject( environmentSubstitute( meta.getControlFile() ), getTransMeta() );

        sb.append( " -n -f " );
        sb.append( enclosure ).append( KettleVFS.getFilename( fileObject ) ).append( enclosure );
      } catch ( Exception ex ) {
        throw new KettleException( "Error retrieving controlfile string", ex );
      }
    } else {
      throw new KettleException( "No control file specified" );
    }

    if ( meta.getLogFile() != null ) {
      try {
        FileObject fileObject = KettleVFS.getFileObject( environmentSubstitute( meta.getLogFile() ), getTransMeta() );

        sb.append( " -o " );
        sb.append( enclosure ).append( KettleVFS.getFilename( fileObject ) ).append( enclosure );
      } catch ( Exception ex ) {
        throw new KettleException( "Error retrieving logfile string", ex );
      }
    }

    DatabaseMeta dm = meta.getDatabaseMeta();
    if ( dm != null ) {
      String user = Const.NVL( dm.getUsername(), "" );

      // Passwords will not work for now because we can't get them to the command line without assuming UNIX and
      // using an environment variable
      String pass = Const.NVL( dm.getPassword(), "" );
      if ( password && !pass.equalsIgnoreCase( "" ) ) {
        throw new KettleException( "Passwords are not supported directly, try configuring "
          + "your connection for trusted access using pg_hba.conf" );
      }

      // if ( ! password )
      // {
      // pass = "******";
      // }
      // String dns = Const.NVL(dm.getDatabaseName(), "");
      // sb.append(" -U ").append(environmentSubstitute(user)).append("/").append(environmentSubstitute(pass));
      sb.append( " -U " ).append( environmentSubstitute( user ) );

      // Hostname and port number
      String hostname = Const.NVL( dm.getHostname(), "" );
      String portnum = Const.NVL( dm.getDatabasePortNumberString(), "" );
      sb.append( " -h " );
      sb.append( hostname );
      sb.append( " -p " );
      sb.append( portnum );

      // Database Name
      String dns = Const.NVL( dm.getDatabaseName(), "" );
      sb.append( " -d " );
      String overrideName = meta.getDbNameOverride();
      if ( Utils.isEmpty( Const.rtrim( overrideName ) ) ) {
        sb.append( environmentSubstitute( dns ) );
      } else {
        // If the database name override is filled in, use that one.
        sb.append( environmentSubstitute( overrideName ) );
      }
    } else {
      throw new KettleException( "No connection specified" );
    }

    return sb.toString();
  }

  public boolean execute( GPBulkLoaderMeta meta, boolean wait ) throws KettleException {
    Runtime rt = Runtime.getRuntime();

    try {
      psqlProcess = rt.exec( createCommandLine( meta, true ) );

      // any error message?
      StreamLogger errorLogger = new StreamLogger( psqlProcess.getErrorStream(), "ERROR" );

      // any output?
      StreamLogger outputLogger = new StreamLogger( psqlProcess.getInputStream(), "OUTPUT" );

      // kick them off
      errorLogger.start();
      outputLogger.start();

      if ( wait ) {
        // any error???
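        // Note: waitFor() blocks this step thread until the psql client terminates. The exit
        // value is only logged below; it is not currently used to mark the step as failed.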
        int exitVal = psqlProcess.waitFor();
        logBasic( BaseMessages.getString( PKG, "GPBulkLoader.Log.ExitValuePsqlPath", "" + exitVal ) );
      }
    } catch ( KettleException ke ) {
      // Re-throw the exception to the caller.
      throw ke;
    } catch ( Exception ex ) {
      // The message below doesn't include the password since passwords aren't supported.
      throw new KettleException( "Error while executing psql \'" + createCommandLine( meta, false ) + "\'", ex );
    }

    return true;
  }

  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (GPBulkLoaderMeta) smi;
    data = (GPBulkLoaderData) sdi;

    try {
      Object[] r = getRow(); // Get row from input rowset & set row busy!
      if ( r == null ) {
        // no more input to be expected...

        setOutputDone();

        if ( !preview ) {
          if ( output != null ) {
            // Close the output
            try {
              output.close();
            } catch ( IOException e ) {
              throw new KettleException( "Error while closing output", e );
            }
            output = null;
          }

          String loadMethod = meta.getLoadMethod();
          if ( GPBulkLoaderMeta.METHOD_AUTO_END.equals( loadMethod ) ) {
            execute( meta, true );
          }
          // else if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
          // {
          // try
          // {
          // if ( psqlProcess != null )
          // {
          // int exitVal = psqlProcess.waitFor();
          // logBasic(BaseMessages.getString(PKG, "GPBulkLoader.Log.ExitValueSqlldr", "" + exitVal));
          // }
          // else
          // {
          // throw new KettleException("Internal error: no sqlldr process running");
          // }
          // }
          // catch ( Exception ex )
          // {
          // throw new KettleException("Error while executing sqlldr", ex);
          // }
          // }
        }

        return false;
      }

      if ( !preview ) {
        if ( first ) {
          first = false;
          createControlFile( environmentSubstitute( meta.getControlFile() ), r, meta );

          output = new GPBulkDataOutput( meta );

          // if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
          // {
          // execute(meta, false);
          // }
          output.open( this, psqlProcess );
        }
        output.writeLine( getInputRowMeta(), r );
      }
      putRow( getInputRowMeta(), r );
      incrementLinesOutput();

    } catch ( KettleException e ) {
      logError( BaseMessages.getString( PKG, "GPBulkLoader.Log.ErrorInStep" ) + e.getMessage() );
      setErrors( 1 );
      stopAll();
      setOutputDone(); // signal end to receiver(s)
      return false;
    }

    return true;
  }

  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (GPBulkLoaderMeta) smi;
    data = (GPBulkLoaderData) sdi;

    Trans trans = getTrans();
    preview = trans.isPreview();

    if ( super.init( smi, sdi ) ) {
      return true;
    }

    return false;
  }

  public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (GPBulkLoaderMeta) smi;
    data = (GPBulkLoaderData) sdi;

    super.dispose( smi, sdi );

    if ( !preview && meta.isEraseFiles() ) {
      // Erase the created cfg/dat files if requested. We don't erase
      // the rest of the files because it would be "stupid" to erase them
      // right after creation. If you don't want them, don't fill them in.
      FileObject fileObject = null;

      String method = meta.getLoadMethod();
      // GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(method) ||
      if ( GPBulkLoaderMeta.METHOD_AUTO_END.equals( method ) ) {
        if ( meta.getControlFile() != null ) {
          try {
            fileObject = KettleVFS.getFileObject( environmentSubstitute( meta.getControlFile() ), getTransMeta() );
            fileObject.delete();
            fileObject.close();
          } catch ( Exception ex ) {
            logError( "Error deleting control file \'"
              + KettleVFS.getFilename( fileObject ) + "\': " + ex.getMessage() );
          }
        }
      }

      if ( GPBulkLoaderMeta.METHOD_AUTO_END.equals( method ) ) {
        // In concurrent mode the data is written to the control file.
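        // With the AUTO_END load method the data file is a separate file on disk,
        // so it is removed here as well once loading has finished.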
        if ( meta.getDataFile() != null ) {
          try {
            fileObject = KettleVFS.getFileObject( environmentSubstitute( meta.getDataFile() ), getTransMeta() );
            fileObject.delete();
            fileObject.close();
          } catch ( Exception ex ) {
            logError( "Error deleting data file \'"
              + KettleVFS.getFilename( fileObject ) + "\': " + ex.getMessage(), ex );
          }
        }
      }

      if ( GPBulkLoaderMeta.METHOD_MANUAL.equals( method ) ) {
        logBasic( "Deletion of files is not compatible with \'manual load method\'" );
      }
    }
  }
}
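/*
 * For reference, a sketch of what getControlFileContents() produces. Assuming a hypothetical
 * table "public"."sales" with fields "id" and "amount", a data file /tmp/sales.dat, a
 * "truncate" load action and a maximum of 50 errors (all values illustrative; the exact
 * identifier quoting depends on the connection settings), the control file handed to psql
 * via -f would look roughly like the following (shown wrapped, emitted on one line):
 *
 *   truncate "public"."sales";
 *   \COPY "public"."sales" ( "id", "amount" ) FROM '/tmp/sales.dat' WITH CSV
 *     LOG ERRORS INTO "public"."sales"_errors SEGMENT REJECT LIMIT 50
 */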