/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.sqoop.mapreduce;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;

import org.apache.avro.Schema;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.sqoop.mapreduce.hcat.SqoopHCatUtilities;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.config.ConfigurationHelper;
import com.cloudera.sqoop.lib.LargeObjectLoader;
import com.cloudera.sqoop.manager.ConnManager;
import com.cloudera.sqoop.manager.ImportJobContext;
import com.cloudera.sqoop.mapreduce.ImportJobBase;
import com.cloudera.sqoop.mapreduce.db.DBConfiguration;
import com.cloudera.sqoop.mapreduce.db.DataDrivenDBInputFormat;
import com.cloudera.sqoop.orm.AvroSchemaGenerator;

import org.apache.sqoop.util.FileSystemUtil;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;

/**
 * Actually runs a JDBC import job using the ORM files generated by the
 * sqoop.orm package. Uses DataDrivenDBInputFormat.
 */
public class DataDrivenImportJob extends ImportJobBase {

  public static final Log LOG = LogFactory.getLog(
      DataDrivenImportJob.class.getName());

  @SuppressWarnings("unchecked")
  public DataDrivenImportJob(final SqoopOptions opts) {
    super(opts, null, DataDrivenDBInputFormat.class, null, null);
  }

  public DataDrivenImportJob(final SqoopOptions opts,
      final Class<? extends InputFormat> inputFormatClass,
      ImportJobContext context) {
    super(opts, null, inputFormatClass, null, context);
  }

  @Override
  protected void configureMapper(Job job, String tableName,
      String tableClassName) throws IOException {
    if (isHCatJob) {
      LOG.info("Configuring mapper for HCatalog import job");
      job.setOutputKeyClass(LongWritable.class);
      job.setOutputValueClass(SqoopHCatUtilities.getImportValueClass());
      job.setMapperClass(SqoopHCatUtilities.getImportMapperClass());
      return;
    }
    if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
      // For text files, specify these as the output types; for
      // other types, we just use the defaults.
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(NullWritable.class);
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.AvroDataFile) {
      final String schemaNameOverride = null;
      Schema schema = generateAvroSchema(tableName, schemaNameOverride);
      try {
        writeAvroSchema(schema);
      } catch (final IOException e) {
        LOG.error("Error while writing Avro schema.", e);
      }
      AvroJob.setMapOutputSchema(job.getConfiguration(), schema);
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.ParquetFile) {
      JobConf conf = (JobConf) job.getConfiguration();
      // The Kite SDK requires an Avro schema to represent the data structure
      // of the target dataset. If the schema name equals the generated Java
      // class name, the import will fail, so we use the table name as the
      // schema name and add a "codegen_" prefix to the generated Java class
      // to avoid the conflict.
      final String schemaNameOverride = tableName;
      Schema schema = generateAvroSchema(tableName, schemaNameOverride);
      String uri = getKiteUri(conf, tableName);
      ParquetJob.WriteMode writeMode;

      if (options.doHiveImport()) {
        if (options.doOverwriteHiveTable()) {
          writeMode = ParquetJob.WriteMode.OVERWRITE;
        } else {
          writeMode = ParquetJob.WriteMode.APPEND;
          if (Datasets.exists(uri)) {
            LOG.warn("Target Hive table '" + tableName + "' exists! Sqoop will "
                + "append data into the existing Hive table. Consider using "
                + "--hive-overwrite, if you do NOT intend to do appending.");
          }
        }
      } else {
        // Note that there is no import argument for overwriting an HDFS
        // dataset, so overwrite mode is not supported yet. Sqoop's append
        // mode means merging two independent datasets, so we choose DEFAULT
        // as the write mode.
        writeMode = ParquetJob.WriteMode.DEFAULT;
      }
      ParquetJob.configureImportJob(conf, schema, uri, writeMode);
    }

    job.setMapperClass(getMapperClass());
  }

  private String getKiteUri(Configuration conf, String tableName)
      throws IOException {
    if (options.doHiveImport()) {
      String hiveDatabase = options.getHiveDatabaseName() == null ? "default"
          : options.getHiveDatabaseName();
      String hiveTable = options.getHiveTableName() == null ? tableName
          : options.getHiveTableName();
      return String.format("dataset:hive:/%s/%s", hiveDatabase, hiveTable);
    } else {
      Path destination = getContext().getDestination();
      return "dataset:" + FileSystemUtil.makeQualified(destination, conf);
    }
  }
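  // Illustrative sketch of the URIs built above (database, table, and path
  // names are hypothetical): a Hive Parquet import of table "employees" into
  // the default database yields "dataset:hive:/default/employees", while a
  // plain HDFS import whose destination qualifies to
  // hdfs://namenode:8020/user/foo/employees yields
  // "dataset:hdfs://namenode:8020/user/foo/employees".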
  private Schema generateAvroSchema(String tableName,
      String schemaNameOverride) throws IOException {
    ConnManager connManager = getContext().getConnManager();
    AvroSchemaGenerator generator = new AvroSchemaGenerator(options,
        connManager, tableName);
    return generator.generate(schemaNameOverride);
  }

  private void writeAvroSchema(final Schema schema) throws IOException {
    // Generate schema in JAR output directory.
    final File schemaFile = new File(options.getJarOutputDir(),
        schema.getName() + ".avsc");
    LOG.info("Writing Avro schema file: " + schemaFile);
    FileUtils.forceMkdir(schemaFile.getParentFile());
    FileUtils.writeStringToFile(schemaFile, schema.toString(true));

    // Copy schema to code output directory.
    try {
      FileUtils.moveFileToDirectory(schemaFile,
          new File(options.getCodeOutputDir()), true);
    } catch (final IOException e) {
      LOG.debug("Could not move Avro schema file to code output directory.",
          e);
    }
  }

  @Override
  protected Class<? extends Mapper> getMapperClass() {
    if (options.getHCatTableName() != null) {
      return SqoopHCatUtilities.getImportMapperClass();
    }
    if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
      return TextImportMapper.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.SequenceFile) {
      return SequenceFileImportMapper.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.AvroDataFile) {
      return AvroImportMapper.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.ParquetFile) {
      return ParquetImportMapper.class;
    }

    return null;
  }

  @Override
  protected Class<? extends OutputFormat> getOutputFormatClass()
      throws ClassNotFoundException {
    if (isHCatJob) {
      LOG.debug("Returning HCatOutputFormat for output format");
      return SqoopHCatUtilities.getOutputFormatClass();
    }
    if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
      return RawKeyTextOutputFormat.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.SequenceFile) {
      return SequenceFileOutputFormat.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.AvroDataFile) {
      return AvroOutputFormat.class;
    } else if (options.getFileLayout()
        == SqoopOptions.FileLayout.ParquetFile) {
      return DatasetKeyOutputFormat.class;
    }

    return null;
  }

  /**
   * Build the boundary query for the column of the result set created by
   * the given query.
   * @param col column name whose boundaries we're interested in.
   * @param query sub-query used to create the result set.
   * @return input boundary query as a string
   */
  private String buildBoundaryQuery(String col, String query) {
    if (col == null || options.getNumMappers() == 1) {
      return "";
    }

    // Replace table name with alias 't1' if column name is a fully
    // qualified name. This is needed because "tableName"."columnName"
    // in the input boundary query causes a SQL syntax error in most dbs
    // including Oracle and MySQL.
    String alias = "t1";
    int dot = col.lastIndexOf('.');
    String qualifiedName = (dot == -1) ? col : alias + col.substring(dot);

    ConnManager mgr = getContext().getConnManager();
    String ret = mgr.getInputBoundsQuery(qualifiedName, query);
    if (ret != null) {
      return ret;
    }

    return "SELECT MIN(" + qualifiedName + "), MAX(" + qualifiedName + ") "
        + "FROM (" + query + ") AS " + alias;
  }
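  // Illustrative example (hypothetical column and query names): for
  // col = "emp.id" and query = "SELECT * FROM emp WHERE  (1 = 1) ", the
  // qualified name becomes "t1.id", and unless the ConnManager supplies its
  // own bounds query, the generated boundary query is:
  //   SELECT MIN(t1.id), MAX(t1.id)
  //     FROM (SELECT * FROM emp WHERE  (1 = 1) ) AS t1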
  @Override
  protected void configureInputFormat(Job job, String tableName,
      String tableClassName, String splitByCol) throws IOException {
    ConnManager mgr = getContext().getConnManager();

    try {
      String username = options.getUsername();
      if (null == username || username.length() == 0) {
        DBConfiguration.configureDB(job.getConfiguration(),
            mgr.getDriverClass(), options.getConnectString(),
            options.getFetchSize(), options.getConnectionParams());
      } else {
        DBConfiguration.configureDB(job.getConfiguration(),
            mgr.getDriverClass(), options.getConnectString(),
            username, options.getPassword(), options.getFetchSize(),
            options.getConnectionParams());
      }

      if (null != tableName) {
        // Import a table.
        String [] colNames = options.getColumns();
        if (null == colNames) {
          colNames = mgr.getColumnNames(tableName);
        }

        String [] sqlColNames = null;
        if (null != colNames) {
          sqlColNames = new String[colNames.length];
          for (int i = 0; i < colNames.length; i++) {
            sqlColNames[i] = mgr.escapeColName(colNames[i]);
          }
        }

        // It's ok if the where clause is null in DBInputFormat.setInput.
        String whereClause = options.getWhereClause();

        // We can't set the class properly here, because we may not have the
        // jar loaded in this JVM. So we start by calling setInput() with
        // DBWritable and then overriding the string manually.
        DataDrivenDBInputFormat.setInput(job, DBWritable.class,
            mgr.escapeTableName(tableName), whereClause,
            mgr.escapeColName(splitByCol), sqlColNames);

        // If the user specified a boundary query on the command line,
        // propagate it to the job.
        if (options.getBoundaryQuery() != null) {
          DataDrivenDBInputFormat.setBoundingQuery(job.getConfiguration(),
              options.getBoundaryQuery());
        }
      } else {
        // Import a free-form query.
        String inputQuery = options.getSqlQuery();
        String sanitizedQuery = inputQuery.replace(
            DataDrivenDBInputFormat.SUBSTITUTE_TOKEN, " (1 = 1) ");
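        // Illustrative example (hypothetical query; assumes SUBSTITUTE_TOKEN
        // is the "$CONDITIONS" placeholder): a free-form query such as
        // "SELECT * FROM emp WHERE $CONDITIONS" is sanitized here to
        // "SELECT * FROM emp WHERE  (1 = 1) " before it is used to derive
        // the bounding query below.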
        String inputBoundingQuery = options.getBoundaryQuery();
        if (inputBoundingQuery == null) {
          inputBoundingQuery = buildBoundaryQuery(splitByCol, sanitizedQuery);
        }
        DataDrivenDBInputFormat.setInput(job, DBWritable.class,
            inputQuery, inputBoundingQuery);
        new DBConfiguration(job.getConfiguration()).setInputOrderBy(
            splitByCol);
      }

      if (options.getRelaxedIsolation()) {
        LOG.info("Enabling relaxed (read uncommitted) transaction "
            + "isolation for imports");
        job.getConfiguration().setBoolean(
            DBConfiguration.PROP_RELAXED_ISOLATION, true);
      }

      LOG.debug("Using table class: " + tableClassName);
      job.getConfiguration().set(ConfigurationHelper.getDbInputClassProperty(),
          tableClassName);

      job.getConfiguration().setLong(LargeObjectLoader.MAX_INLINE_LOB_LEN_KEY,
          options.getInlineLobLimit());

      if (options.getSplitLimit() != null) {
        org.apache.sqoop.config.ConfigurationHelper.setSplitLimit(
            job.getConfiguration(), options.getSplitLimit());
      }

      LOG.debug("Using InputFormat: " + inputFormatClass);
      job.setInputFormatClass(inputFormatClass);
    } finally {
      try {
        mgr.close();
      } catch (SQLException sqlE) {
        LOG.warn("Error closing connection: " + sqlE);
      }
    }
  }
}