package com.thinkbiganalytics.nifi.v2.sqoop.core;
/*-
* #%L
* thinkbig-nifi-hadoop-processors
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.nifi.processor.AbstractNiFiProcessor;
import com.thinkbiganalytics.nifi.security.KerberosProperties;
import com.thinkbiganalytics.nifi.security.SpringSecurityContextLoader;
import com.thinkbiganalytics.nifi.v2.sqoop.PasswordMode;
import com.thinkbiganalytics.nifi.v2.sqoop.SqoopConnectionService;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.CompressionAlgorithm;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.ExtractDataFormat;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.HiveDelimStrategy;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.SqoopLoadStrategy;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.TargetHdfsDirExistsStrategy;
import com.thinkbiganalytics.nifi.v2.sqoop.process.SqoopProcessResult;
import com.thinkbiganalytics.nifi.v2.sqoop.process.SqoopProcessRunner;
import com.thinkbiganalytics.nifi.v2.sqoop.security.KerberosConfig;
import com.thinkbiganalytics.nifi.v2.sqoop.utils.SqoopBuilder;
import com.thinkbiganalytics.nifi.v2.sqoop.utils.SqoopUtils;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StopWatch;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nonnull;
/**
* NiFi Processor to import data from a relational source into HDFS via Sqoop
*/
@Tags({"thinkbig", "ingest", "sqoop", "rdbms", "database", "table"})
@CapabilityDescription("Import data from a relational source into HDFS via Sqoop")
@WritesAttributes({
@WritesAttribute(attribute = "sqoop.command.text", description = "The full Sqoop command executed"),
@WritesAttribute(attribute = "sqoop.result.code", description = "The exit code from Sqoop command execution"),
@WritesAttribute(attribute = "sqoop.run.seconds", description = "Total seconds taken to run the Sqoop command"),
@WritesAttribute(attribute = "sqoop.record.count", description = "Count of records imported"),
@WritesAttribute(attribute = "sqoop.output.hdfs", description = "HDFS location where data is written"),
})
public class ImportSqoop extends AbstractNiFiProcessor {
/**
* Property to provide the connection service for executing sqoop jobs.
*/
public static final PropertyDescriptor SQOOP_CONNECTION_SERVICE = new PropertyDescriptor.Builder()
.name("Sqoop Connection Service")
.description("Connection service for executing sqoop jobs.")
.required(true)
.identifiesControllerService(SqoopConnectionService.class)
.build();
/**
* Property to provide the table to extract from the source relational system.
*/
public static final PropertyDescriptor SOURCE_TABLE_NAME = new PropertyDescriptor.Builder()
.name("Source Table")
.description("The table to extract from the source relational system.")
.required(true)
.defaultValue("${source.table.name}")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the fields in the source table.
* Field names (in order) to read from the source table, i.e. the SELECT fields. Separate them using commas, e.g. field1,field2,field3.
* Inconsistent order will corrupt the downstream data. Indicate * to get all columns from the table.
*/
public static final PropertyDescriptor SOURCE_TABLE_FIELDS = new PropertyDescriptor.Builder()
.name("Source Fields")
.description(
"Field names (in order) to read from the source table. ie. the SELECT fields. Separate them using commas e.g. field1,field2,field3. "
+ "Inconsistent order will cause corruption of the downstream data. Indicate * to get all columns from table.")
.required(true)
.defaultValue("${source.table.fields}")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the WHERE clause to use for the source table. This will act as a filter on source table.
*/
public static final PropertyDescriptor SOURCE_TABLE_WHERE_CLAUSE = new PropertyDescriptor.Builder()
.name("Source Table WHERE Clause")
.description("The WHERE clause to use for the source table. This will act as a filter on source table.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the load type (full table, incremental based on last modified time field, incremental based on id field).
*/
public static final PropertyDescriptor SOURCE_LOAD_STRATEGY = new PropertyDescriptor.Builder()
.name("Source Load Strategy")
.description("The load type (full table, incremental based on last modified time field, incremental based on id field).")
.required(true)
.allowableValues(SqoopLoadStrategy.values())
.defaultValue(SqoopLoadStrategy.FULL_LOAD.toString())
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(false)
.build();
/**
* Property to provide source column name that contains the last modified time / id for incremental-type load. Only applicable for incremental-type load.
*/
public static final PropertyDescriptor SOURCE_CHECK_COLUMN_NAME = new PropertyDescriptor.Builder()
.name("Source Check Column Name (INCR)")
.description("Source column name that contains the last modified time / id for incremental-type load. Only applicable for incremental-type load.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the property containing value of current watermark. Only applicable for incremental-type load.
*/
public static final PropertyDescriptor SOURCE_PROPERTY_WATERMARK = new PropertyDescriptor.Builder()
.name("Watermark Property (INCR)")
.description("Property containing value of current watermark. Only applicable for incremental-type load.")
.required(false)
.defaultValue("water.mark")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the source column value for the last modified time / id for tracking incremental-type load. Only applicable for
* incremental-type load. This should generally be ${value for Watermark Property (INCR)}.
*/
public static final PropertyDescriptor SOURCE_CHECK_COLUMN_LAST_VALUE = new PropertyDescriptor.Builder()
.name("Source Check Column Last Value (INCR)")
.description("Source column value for the last modified time / id for tracking incremental-type load. Only applicable for incremental-type load. "
+ "This should generally be ${value for Watermark Property (INCR)}.")
.required(false)
.defaultValue("${water.mark}")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
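/*
* Illustrative (non-authoritative) combination of the incremental-load properties above; the check
* column name is hypothetical:
*   Source Load Strategy                  = INCREMENTAL_APPEND
*   Source Check Column Name (INCR)       = ORDER_ID
*   Watermark Property (INCR)             = water.mark
*   Source Check Column Last Value (INCR) = ${water.mark}
* After a successful run, onTrigger() writes the new high watermark to the flow file attribute named
* by the watermark property, so the next execution resumes from that value.
*/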
/**
* Property to provide a column of the table that should be used to split work units for parallel imports. By default,
* the table's primary key will be used. If this column is not provided, and the table does not have a
* primary key, one mapper will be used for the import.
*/
public static final PropertyDescriptor SOURCE_SPLIT_BY_FIELD = new PropertyDescriptor.Builder()
.name("Source Split By Field")
.description("A column of the table that should be used to split work units for parallel imports. "
+ "By default, the table's primary key will be used. "
+ "If this column is not provided, and the table does not have a primary key, one mapper will be used for the import.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide a query to get the min and max values for split by field column. This can improve the efficiency of
* import. The min and max values can also be explicitly provided (e.g. SELECT 1, 10). Query will be
* literally run without modification, so please ensure that it is correct. Query should return one row
* with exactly two columns.
*/
public static final PropertyDescriptor SOURCE_BOUNDARY_QUERY = new PropertyDescriptor.Builder()
.name("Source Boundary Query")
.description("A query to get the min and max values for split by field column. This can improve the efficiency of import. "
+ "The min and max values can also be explicitly provided (e.g. SELECT 1, 10). "
+ "Query will be literally run without modification, so please ensure that it is correct. "
+ "Query should return one row with exactly two columns.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property to provide the number of map tasks to extract data in parallel. Higher values put more load on the source
* system. Also, consider capacity of cluster when setting this property value.
*/
public static final PropertyDescriptor CLUSTER_MAP_TASKS = new PropertyDescriptor.Builder()
.name("Cluster Map Tasks")
.description("Number of map tasks to extract data in parallel. "
+ "Higher values put more load on the source system. "
+ "Also, consider capacity of cluster when setting this property value.")
.required(false)
.expressionLanguageSupported(true)
.defaultValue("4")
.addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
.build();
/**
* Property to provide the name for the job in the UI.
*/
public static final PropertyDescriptor CLUSTER_UI_JOB_NAME = new PropertyDescriptor.Builder()
.name("Cluster UI Job Name")
.description("The job will be shown in the UI via the provided name. This is to allow ease of identification.")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide the target HDFS directory to land data in. The directory should
* not pre-exist. Otherwise, the job will fail. This behavior is designed to prevent accidental overwrites.
* Also, verify that permissions on HDFS are appropriate.
*/
public static final PropertyDescriptor TARGET_HDFS_DIRECTORY = new PropertyDescriptor.Builder()
.name("Target HDFS Directory")
.description("Target HDFS directory to land data in. The directory should not pre-exist. "
+ "Otherwise, job will fail. This behavior is designed to prevent accidental overwrites. "
+ "Also, verify that permissions on HDFS are appropriate.")
.required(true)
.expressionLanguageSupported(true)
.defaultValue("/tmp")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide the strategy for handling the case where target HDFS directory exists. By default, the import job
* will fail (to prevent accidental data corruption). This behavior can be modified to delete existing
* HDFS directory, and create it again during data import. If delete option is chosen, and the target
* HDFS directory does not exist, the data import will proceed and create it.
*/
public static final PropertyDescriptor TARGET_HDFS_DIRECTORY_EXISTS_STRATEGY = new PropertyDescriptor.Builder()
.name("Target HDFS Directory - If Exists?")
.description("Strategy for handling the case where target HDFS directory exists. "
+ "By default, the import job will fail (to prevent accidental data corruption). "
+ "This behavior can be modified to delete existing HDFS directory, and create it again during data import. "
+ "If delete option is chosen, and the target HDFS directory does not exist, the data import will proceed and create it.")
.required(true)
.expressionLanguageSupported(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(TargetHdfsDirExistsStrategy.values())
.defaultValue(TargetHdfsDirExistsStrategy.FAIL_IMPORT.toString())
.build();
/**
* Property to provide the format to land the extracted data in on HDFS.
*/
public static final PropertyDescriptor TARGET_EXTRACT_DATA_FORMAT = new PropertyDescriptor.Builder()
.name("Target Extract Data Format")
.description("Format to land the extracted data in on HDFS.")
.required(true)
.expressionLanguageSupported(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(ExtractDataFormat.values())
.defaultValue(ExtractDataFormat.TEXT.toString())
.build();
/**
* Property to provide the delimiter for fields for data landing in HDFS. This is relevant
* for TEXT and SEQUENCE_FILE target extract data formats.
*/
public static final PropertyDescriptor TARGET_HDFS_FILE_FIELD_DELIMITER = new PropertyDescriptor.Builder()
.name("Target HDFS File Field Delimiter")
.description("Delimiter for fields for data landing in HDFS. This is relevant for TEXT and SEQUENCE_FILE target extract data formats.")
.required(true)
.expressionLanguageSupported(true)
.defaultValue(",")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide the delimiter for records for data landing in HDFS. This is relevant
* for TEXT and SEQUENCE_FILE target extract data formats.
*/
public static final PropertyDescriptor TARGET_HDFS_FILE_RECORD_DELIMITER = new PropertyDescriptor.Builder()
.name("Target HDFS File Record Delimiter")
.description("Delimiter for records for data landing in HDFS. This is relevant for TEXT and SEQUENCE_FILE target extract data formats.")
.required(true)
.expressionLanguageSupported(true)
.defaultValue("\\n")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide the strategy to handle Hive-specific delimiters (\n, \r, \01).
* Facilitates downstream Hive ingestion.
*/
public static final PropertyDescriptor TARGET_HIVE_DELIM_STRATEGY = new PropertyDescriptor.Builder()
.name("Target Hive Delimiter Strategy")
.description("Strategy to handle Hive-specific delimiters (\\n, \\r, \\01). Facilitates downstream Hive ingestion.")
.required(true)
.expressionLanguageSupported(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(HiveDelimStrategy.values())
.defaultValue(HiveDelimStrategy.DROP.toString())
.build();
/**
* Property to provide the replacement delimiter for 'REPLACE' strategy.
* Facilitates downstream Hive ingestion.
*/
public static final PropertyDescriptor TARGET_HIVE_REPLACE_DELIM = new PropertyDescriptor.Builder()
.name("Target Hive Replacement Delimiter")
.description("Replacement delimiter for 'REPLACE' strategy. Facilitates downstream Hive ingestion.")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide the compression algorithm to use for data landing in HDFS.
* For PARQUET data format, compression is not supported.
*/
public static final PropertyDescriptor TARGET_COMPRESSION_ALGORITHM = new PropertyDescriptor.Builder()
.name("Target Compression Algorithm")
.description("Compression algorithm to use for data landing in HDFS. For PARQUET data format, compression is not supported.")
.required(true)
.expressionLanguageSupported(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(CompressionAlgorithm.values())
.defaultValue(CompressionAlgorithm.NONE.toString())
.build();
/**
* Property to provide the mapping to use for source columns (SQL type) to target (Java type). This
* will override Sqoop's default mapping. Only provide mappings that you wish to override. The
* source must contain the specified column(s). Also, this is REQUIRED for source columns that Sqoop
* is unable to map (an example is the Oracle NCLOB data type). Mappings can be entered as COLUMN=Type
* pairs separated by commas. Ensure that there are no spaces in the entry. Example: PO_ID=Integer,PO_DETAILS=String
*/
public static final PropertyDescriptor TARGET_COLUMN_TYPE_MAPPING = new PropertyDescriptor.Builder()
.name("Target Column Type Mapping")
.description("Mapping to use for source columns (SQL type) to target (Java type). "
+ "This will override Sqoop's default mapping. Only provide mappings that you wish to override. "
+ "The source must contain the specified column/s."
+ "Also, this is REQUIRED for source columns that Sqoop is unable to map (An example is Oracle NCLOB data type). "
+ "Mappings can be entered as COLUMN=Type pairs separated by comma. Ensure that there are no spaces in entry. "
+ "Example: PO_ID=Integer,PO_DETAILS=String")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* Property to provide code generation directory for sqoop.
* A by-product of importing data is a Sqoop-generated Java class that can encapsulate one row of the
* imported table. This can help with subsequent MapReduce processing of the data. This property value
* specifies the output directory that Sqoop should use to create these artifacts. Please ensure that
* the specified directory has appropriate write permissions. DEFAULT VALUE = Working directory from
* where sqoop is launched (i.e. NiFi installation home directory)
*/
public static final PropertyDescriptor SQOOP_CODEGEN_DIR = new PropertyDescriptor.Builder()
.name("Sqoop Code Generation Directory")
.description("A by-product of importing data is a Sqoop-generated Java class that can encapsulate one row of the imported table. "
+ "This can help with subsequent MapReduce processing of the data. "
+ "This property value specifies the output directory that Sqoop should use to create these artifacts. "
+ "Please ensure that the specified directory has appropriate write permissions. "
+ "DEFAULT VALUE = Working directory from where sqoop is launched (i.e. NiFi installation home directory)")
.required(false)
.expressionLanguageSupported(true)
.addValidator(new StandardValidators.DirectoryExistsValidator(true, false))
.build();
/**
* This property is for SQL Server only. By default, the source schema is generally
* 'dbo'. If the source table to be ingested is in another schema, provide that value here.
*/
public static final PropertyDescriptor SOURCESPECIFIC_SQLSERVER_SCHEMA = new PropertyDescriptor.Builder()
.name("[SQL Server Only] Source Schema")
.description("This property is for SQL Server only. By default, the source schema is generally 'dbo'. "
+ "If the source table to be ingested is in another schema, provide that value here.")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
/**
* The success relationship (for sqoop job success)
*/
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("Sqoop import success")
.build();
/**
* The failure relationship (for sqoop job failure)
*/
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("Sqoop import failure")
.build();
/**
* Property for Kerberos service principal
*/
private PropertyDescriptor KERBEROS_PRINCIPAL;
/**
* Property for Kerberos service keytab
*/
private PropertyDescriptor KERBEROS_KEYTAB;
private List<PropertyDescriptor> properties;
private Set<Relationship> relationships;
@Override
protected void init(@Nonnull final ProcessorInitializationContext context) {
super.init(context);
/* Create Kerberos properties */
final SpringSecurityContextLoader securityContextLoader = SpringSecurityContextLoader.create(context);
final KerberosProperties kerberosProperties = securityContextLoader.getKerberosProperties();
KERBEROS_KEYTAB = kerberosProperties.createKerberosKeytabProperty();
KERBEROS_PRINCIPAL = kerberosProperties.createKerberosPrincipalProperty();
/* Create list of properties */
final List<PropertyDescriptor> properties = new ArrayList<>();
properties.add(KERBEROS_PRINCIPAL);
properties.add(KERBEROS_KEYTAB);
properties.add(SQOOP_CONNECTION_SERVICE);
properties.add(SOURCE_TABLE_NAME);
properties.add(SOURCE_TABLE_FIELDS);
properties.add(SOURCE_TABLE_WHERE_CLAUSE);
properties.add(SOURCE_LOAD_STRATEGY);
properties.add(SOURCE_CHECK_COLUMN_NAME);
properties.add(SOURCE_CHECK_COLUMN_LAST_VALUE);
properties.add(SOURCE_PROPERTY_WATERMARK);
properties.add(SOURCE_SPLIT_BY_FIELD);
properties.add(SOURCE_BOUNDARY_QUERY);
properties.add(CLUSTER_MAP_TASKS);
properties.add(CLUSTER_UI_JOB_NAME);
properties.add(TARGET_HDFS_DIRECTORY);
properties.add(TARGET_HDFS_DIRECTORY_EXISTS_STRATEGY);
properties.add(TARGET_EXTRACT_DATA_FORMAT);
properties.add(TARGET_HDFS_FILE_FIELD_DELIMITER);
properties.add(TARGET_HDFS_FILE_RECORD_DELIMITER);
properties.add(TARGET_HIVE_DELIM_STRATEGY);
properties.add(TARGET_HIVE_REPLACE_DELIM);
properties.add(TARGET_COMPRESSION_ALGORITHM);
properties.add(TARGET_COLUMN_TYPE_MAPPING);
properties.add(SQOOP_CODEGEN_DIR);
properties.add(SOURCESPECIFIC_SQLSERVER_SCHEMA);
this.properties = Collections.unmodifiableList(properties);
/* Create list of relationships */
final Set<Relationship> relationships = new HashSet<>();
relationships.add(REL_SUCCESS);
relationships.add(REL_FAILURE);
this.relationships = Collections.unmodifiableSet(relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
flowFile = session.create();
logger.info("Created a flow file having uuid: {}", new Object[]{flowFile.getAttribute(CoreAttributes.UUID.key())});
} else {
logger.info("Using an existing flow file having uuid: {}", new Object[]{flowFile.getAttribute(CoreAttributes.UUID.key())});
}
final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
final SqoopConnectionService sqoopConnectionService = context.getProperty(SQOOP_CONNECTION_SERVICE).asControllerService(SqoopConnectionService.class);
final String sourceTableName = context.getProperty(SOURCE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String sourceTableFields = context.getProperty(SOURCE_TABLE_FIELDS).evaluateAttributeExpressions(flowFile).getValue();
final String sourceTableWhereClause = context.getProperty(SOURCE_TABLE_WHERE_CLAUSE).evaluateAttributeExpressions(flowFile).getValue();
final SqoopLoadStrategy sourceLoadStrategy = SqoopLoadStrategy.valueOf(context.getProperty(SOURCE_LOAD_STRATEGY).getValue());
final String sourceCheckColumnName = context.getProperty(SOURCE_CHECK_COLUMN_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String sourcePropertyWatermark = context.getProperty(SOURCE_PROPERTY_WATERMARK).evaluateAttributeExpressions(flowFile).getValue();
final String sourceCheckColumnLastValue = context.getProperty(SOURCE_CHECK_COLUMN_LAST_VALUE).evaluateAttributeExpressions(flowFile).getValue();
final String sourceSplitByField = context.getProperty(SOURCE_SPLIT_BY_FIELD).evaluateAttributeExpressions(flowFile).getValue();
final String sourceBoundaryQuery = context.getProperty(SOURCE_BOUNDARY_QUERY).evaluateAttributeExpressions(flowFile).getValue();
final Integer clusterMapTasks = context.getProperty(CLUSTER_MAP_TASKS).evaluateAttributeExpressions(flowFile).asInteger();
final String clusterUIJobName = context.getProperty(CLUSTER_UI_JOB_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String targetHdfsDirectory = context.getProperty(TARGET_HDFS_DIRECTORY).evaluateAttributeExpressions(flowFile).getValue();
final TargetHdfsDirExistsStrategy targetHdfsDirExistsStrategy = TargetHdfsDirExistsStrategy.valueOf(context.getProperty(TARGET_HDFS_DIRECTORY_EXISTS_STRATEGY).getValue());
final ExtractDataFormat targetExtractDataFormat = ExtractDataFormat.valueOf(context.getProperty(TARGET_EXTRACT_DATA_FORMAT).getValue());
final String targetHdfsFileFieldDelimiter = context.getProperty(TARGET_HDFS_FILE_FIELD_DELIMITER).evaluateAttributeExpressions(flowFile).getValue();
final String targetHdfsFileRecordDelimiter = context.getProperty(TARGET_HDFS_FILE_RECORD_DELIMITER).evaluateAttributeExpressions(flowFile).getValue();
final HiveDelimStrategy targetHiveDelimStrategy = HiveDelimStrategy.valueOf(context.getProperty(TARGET_HIVE_DELIM_STRATEGY).getValue());
final String targetHiveReplaceDelim = context.getProperty(TARGET_HIVE_REPLACE_DELIM).evaluateAttributeExpressions(flowFile).getValue();
final CompressionAlgorithm targetCompressionAlgorithm = CompressionAlgorithm.valueOf(context.getProperty(TARGET_COMPRESSION_ALGORITHM).getValue());
final String targetColumnTypeMapping = context.getProperty(TARGET_COLUMN_TYPE_MAPPING).evaluateAttributeExpressions(flowFile).getValue();
final String sqoopCodeGenDirectory = context.getProperty(SQOOP_CODEGEN_DIR).evaluateAttributeExpressions(flowFile).getValue();
final String sourceSpecificSqlServerSchema = context.getProperty(SOURCESPECIFIC_SQLSERVER_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
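// Constants for invoking the generated Sqoop command through a shell.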
final String COMMAND_SHELL = "/bin/bash";
final String COMMAND_SHELL_FLAGS = "-c";
final StopWatch stopWatch = new StopWatch(false);
KerberosConfig kerberosConfig = new KerberosConfig()
.setLogger(logger)
.setKerberosPrincipal(kerberosPrincipal)
.setKerberosKeytab(kerberosKeyTab);
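// Build the full Sqoop command line from the connection service settings and the processor property values resolved above.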
SqoopBuilder sqoopBuilder = new SqoopBuilder();
String sqoopCommand = sqoopBuilder
.setLogger(logger)
.setSourceConnectionString(sqoopConnectionService.getConnectionString())
.setSourceUserName(sqoopConnectionService.getUserName())
.setPasswordMode(sqoopConnectionService.getPasswordMode())
.setSourcePasswordHdfsFile(sqoopConnectionService.getPasswordHdfsFile())
.setSourcePasswordPassphrase(sqoopConnectionService.getPasswordPassphrase())
.setSourceEnteredPassword(sqoopConnectionService.getEnteredPassword())
.setSourceConnectionManager(sqoopConnectionService.getConnectionManager())
.setSourceDriver(sqoopConnectionService.getDriver())
.setSourceTableName(sourceTableName)
.setSourceTableFields(sourceTableFields)
.setSourceTableWhereClause(sourceTableWhereClause)
.setSourceLoadStrategy(sourceLoadStrategy)
.setSourceCheckColumnName(sourceCheckColumnName)
.setSourceCheckColumnLastValue(sourceCheckColumnLastValue)
.setSourceSplitByField(sourceSplitByField)
.setSourceBoundaryQuery(sourceBoundaryQuery)
.setClusterMapTasks(clusterMapTasks)
.setClusterUIJobName(clusterUIJobName)
.setTargetHdfsDirectory(targetHdfsDirectory)
.setTargetHdfsDirExistsStrategy(targetHdfsDirExistsStrategy)
.setTargetExtractDataFormat(targetExtractDataFormat)
.setTargetHdfsFileFieldDelimiter(targetHdfsFileFieldDelimiter)
.setTargetHdfsFileRecordDelimiter(targetHdfsFileRecordDelimiter)
.setTargetHiveDelimStrategy(targetHiveDelimStrategy)
.setTargetHiveReplaceDelim(targetHiveReplaceDelim)
.setTargetCompressionAlgorithm(targetCompressionAlgorithm)
.setTargetColumnTypeMapping(targetColumnTypeMapping)
.setSqoopCodeGenDirectory(sqoopCodeGenDirectory)
.setSourceSpecificSqlServerSchema(sourceSpecificSqlServerSchema)
.build();
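// Wrap the command for shell execution: /bin/bash -c "<sqoop command>".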
List<String> sqoopExecutionCommand = new ArrayList<>();
sqoopExecutionCommand.add(COMMAND_SHELL);
sqoopExecutionCommand.add(COMMAND_SHELL_FLAGS);
sqoopExecutionCommand.add(sqoopCommand);
SqoopProcessRunner sqoopProcessRunner = new SqoopProcessRunner(kerberosConfig,
sqoopExecutionCommand,
logger,
sourceLoadStrategy);
logger.info("Starting execution of Sqoop command");
stopWatch.start();
SqoopProcessResult sqoopProcessResult = sqoopProcessRunner.execute();
long jobDurationSeconds = stopWatch.getElapsed(TimeUnit.SECONDS);
stopWatch.stop();
logger.info("Finished execution of Sqoop command");
int resultStatus = sqoopProcessResult.getExitValue();
SqoopUtils sqoopUtils = new SqoopUtils();
long recordsCount = sqoopUtils.getSqoopRecordCount(sqoopProcessResult, logger);
String sqoopCommandWithCredentialsMasked = sqoopUtils.maskCredentials(sqoopCommand,
sqoopUtils.getCredentialsToMask());
flowFile = session.putAttribute(flowFile, "sqoop.command.text", sqoopCommandWithCredentialsMasked);
flowFile = session.putAttribute(flowFile, "sqoop.result.code", String.valueOf(resultStatus));
flowFile = session.putAttribute(flowFile, "sqoop.run.seconds", String.valueOf(jobDurationSeconds));
flowFile = session.putAttribute(flowFile, "sqoop.record.count", String.valueOf(recordsCount));
flowFile = session.putAttribute(flowFile, "sqoop.output.hdfs", targetHdfsDirectory);
logger.info("Wrote result attributes to flow file");
if (resultStatus == 0) {
logger.info("Sqoop Import OK [Code {}]", new Object[]{resultStatus});
if (sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_APPEND
|| sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_LASTMODIFIED) {
if ((sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_APPEND) && (recordsCount == 0)) {
flowFile = session.putAttribute(flowFile, sourcePropertyWatermark, sourceCheckColumnLastValue);
} else {
String newHighWaterMark = sqoopUtils.getNewHighWatermark(sqoopProcessResult);
if ((newHighWaterMark == null) || (newHighWaterMark.equals("NO_UPDATE")) || (newHighWaterMark.equals(""))) {
flowFile = session.putAttribute(flowFile, sourcePropertyWatermark, sourceCheckColumnLastValue);
} else {
flowFile = session.putAttribute(flowFile, sourcePropertyWatermark, newHighWaterMark);
}
}
}
session.transfer(flowFile, REL_SUCCESS);
} else {
logger.error("Sqoop Import FAIL [Code {}]", new Object[]{resultStatus});
session.transfer(flowFile, REL_FAILURE);
}
}
/**
* Called by the framework, this method does additional validation on properties
*
* @param validationContext used to retrieve the properties to check
* @return A collection of {@link ValidationResult} which will be checked by the framework
*/
@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
final List<ValidationResult> results = new ArrayList<>();
final SqoopLoadStrategy sourceLoadStrategy = SqoopLoadStrategy.valueOf(validationContext.getProperty(SOURCE_LOAD_STRATEGY).getValue());
final String sourceCheckColumnName = validationContext.getProperty(SOURCE_CHECK_COLUMN_NAME).evaluateAttributeExpressions().getValue();
final String sourceCheckColumnLastValue = validationContext.getProperty(SOURCE_CHECK_COLUMN_LAST_VALUE).evaluateAttributeExpressions().getValue();
final HiveDelimStrategy targetHiveDelimStrategy = HiveDelimStrategy.valueOf(validationContext.getProperty(TARGET_HIVE_DELIM_STRATEGY).getValue());
final String targetHiveReplaceDelim = validationContext.getProperty(TARGET_HIVE_REPLACE_DELIM).evaluateAttributeExpressions().getValue();
final SqoopConnectionService sqoopConnectionService = validationContext.getProperty(SQOOP_CONNECTION_SERVICE).asControllerService(SqoopConnectionService.class);
final PasswordMode passwordMode = sqoopConnectionService.getPasswordMode();
final ExtractDataFormat targetExtractDataFormat = ExtractDataFormat.valueOf(validationContext.getProperty(TARGET_EXTRACT_DATA_FORMAT).getValue());
final CompressionAlgorithm targetCompressionAlgorithm = CompressionAlgorithm.valueOf(validationContext.getProperty(TARGET_COMPRESSION_ALGORITHM).getValue());
final String targetColumnTypeMapping = validationContext.getProperty(TARGET_COLUMN_TYPE_MAPPING).evaluateAttributeExpressions().getValue();
SqoopUtils sqoopUtils = new SqoopUtils();
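// Incremental loads require both the check column name and its last value.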
if (sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_LASTMODIFIED || sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_APPEND) {
if ((sourceCheckColumnName == null) || (sourceCheckColumnLastValue == null)) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("Both 'Check Column Name' and 'Check Column Last Value' are required for incremental load.")
.build());
}
}
if (targetHiveDelimStrategy == HiveDelimStrategy.REPLACE) {
if (targetHiveReplaceDelim == null) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("Replacement delimiter must be specified for Hive Delimiter REPLACE Strategy.")
.build());
}
}
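// The credentials required from the connection service depend on its password mode.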
if (passwordMode == PasswordMode.ENCRYPTED_ON_HDFS_FILE) {
if (sqoopConnectionService.getPasswordHdfsFile() == null || sqoopConnectionService.getPasswordPassphrase() == null) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For encrypted password on HDFS, both encrypted HDFS file location and passphrase are required.")
.build());
}
} else if (passwordMode == PasswordMode.ENCRYPTED_TEXT_ENTRY) {
if (sqoopConnectionService.getEnteredPassword() == null || sqoopConnectionService.getPasswordPassphrase() == null) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For encrypted password entry mode, both the encrypted password and passphrase are required.")
.build());
}
} else if (passwordMode == PasswordMode.CLEAR_TEXT_ENTRY) {
if (sqoopConnectionService.getEnteredPassword() == null) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For clear text password entry mode, the password must be provided.")
.build());
}
}
if ((targetExtractDataFormat == ExtractDataFormat.PARQUET) && (targetCompressionAlgorithm != CompressionAlgorithm.NONE)) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For PARQUET data format, compression is not supported.")
.build());
}
if ((sqoopUtils.isTeradataDatabase(sqoopConnectionService.getConnectionString()) && (sourceLoadStrategy == SqoopLoadStrategy.INCREMENTAL_LASTMODIFIED))) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For Teradata source system, INCREMENTAL_LASTMODIFIED mode of load is not supported. This is due to a known issue with Sqoop (SQOOP-2402).")
.build());
}
if ((targetColumnTypeMapping != null) && (!targetColumnTypeMapping.isEmpty())) {
if (!sqoopUtils.checkMappingInput(targetColumnTypeMapping)) {
results.add(new ValidationResult.Builder()
.subject(this.getClass().getSimpleName())
.valid(false)
.explanation("For Target Column Type Mapping, ensure that mappings are provided as COLUMN=Type pairs separated by comma. "
+ "Ensure no spaces in entry. "
+ "Example: PO_ID=Integer,PO_DETAILS=String")
.build());
}
}
return results;
}
}