package com.thinkbiganalytics.nifi.v2.sqoop.core;

/*-
 * #%L
 * thinkbig-nifi-hadoop-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.nifi.processor.AbstractNiFiProcessor;
import com.thinkbiganalytics.nifi.security.KerberosProperties;
import com.thinkbiganalytics.nifi.security.SpringSecurityContextLoader;
import com.thinkbiganalytics.nifi.v2.sqoop.SqoopConnectionService;
import com.thinkbiganalytics.nifi.v2.sqoop.enums.ExportNullInterpretationStrategy;
import com.thinkbiganalytics.nifi.v2.sqoop.process.SqoopExportProcessRunner;
import com.thinkbiganalytics.nifi.v2.sqoop.process.SqoopProcessResult;
import com.thinkbiganalytics.nifi.v2.sqoop.security.KerberosConfig;
import com.thinkbiganalytics.nifi.v2.sqoop.utils.SqoopExportBuilder;
import com.thinkbiganalytics.nifi.v2.sqoop.utils.SqoopUtils;

import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StopWatch;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nonnull;

/**
 * NiFi Processor to export data from HDFS to a relational system via Sqoop
 */
@Tags({"thinkbig", "export", "sqoop", "rdbms", "database", "table"})
@CapabilityDescription("Export data from HDFS to a relational system via Sqoop")
@WritesAttributes({
    @WritesAttribute(attribute = "sqoop.export.command.text", description = "The full Sqoop export command executed"),
    @WritesAttribute(attribute = "sqoop.export.result.code", description = "The exit code from Sqoop export command execution"),
    @WritesAttribute(attribute = "sqoop.export.run.seconds", description = "Total seconds taken to run the Sqoop export command"),
    @WritesAttribute(attribute = "sqoop.export.record.count", description = "Count of records exported"),
    @WritesAttribute(attribute = "sqoop.export.output.table", description = "Table name where data is written")
})
public class ExportSqoop extends AbstractNiFiProcessor {
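    /*
     * Typical usage (illustrative): an upstream processor supplies flow file attributes (for example
     * the source HDFS directory or target table name), and the expression-language-enabled properties
     * below are evaluated against that flow file before the Sqoop export command is built and run.
     * If no incoming flow file is available, a new one is created so the result attributes listed
     * above still have somewhere to be written.
     */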
    /**
     * Property to provide connection service for executing sqoop jobs.
     */
    public static final PropertyDescriptor SQOOP_CONNECTION_SERVICE = new PropertyDescriptor.Builder()
        .name("Sqoop Connection Service")
        .description("Connection service for executing sqoop jobs.")
        .required(true)
        .identifiesControllerService(SqoopConnectionService.class)
        .build();

    /**
     * Property to provide source HDFS directory to get the data from for export.
     */
    public static final PropertyDescriptor SOURCE_HDFS_DIRECTORY = new PropertyDescriptor.Builder()
        .name("Source HDFS Directory")
        .description("Source HDFS directory to get the data from for export.")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * Property to provide delimiter for source data on HDFS.
     */
    public static final PropertyDescriptor SOURCE_HDFS_FILE_DELIMITER = new PropertyDescriptor.Builder()
        .name("Source HDFS File Delimiter")
        .description("Delimiter for source data on HDFS.")
        .required(true)
        .expressionLanguageSupported(true)
        .defaultValue(",")
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .build();

    /**
     * Property to provide method for identifying nulls in source HDFS data.
     */
    public static final PropertyDescriptor SOURCE_NULL_INTERPRETATION_STRATEGY = new PropertyDescriptor.Builder()
        .name("Source Null Interpret Strategy")
        .description("Method for identifying nulls in source HDFS data. "
                     + "For SQOOP_DEFAULT [{String column values: null in HDFS data -> null in relational system} {Non-string column values: null or empty_string in HDFS data -> null in relational system}]. "
                     + "For HIVE_DEFAULT [{String column values: \\N in HDFS data -> null in relational system} {Non-string column values: \\N or empty_string in HDFS data -> null in relational system}]. "
                     + "For CUSTOM_VALUES: Custom-provided identifiers to identify null values in string and non-string columns in HDFS data.")
        .required(true)
        .allowableValues(ExportNullInterpretationStrategy.values())
        .defaultValue(ExportNullInterpretationStrategy.HIVE_DEFAULT.toString())
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(false)
        .build();

    /**
     * Property to provide custom string for identifying null strings in HDFS data.
     */
    public static final PropertyDescriptor SOURCE_NULL_CUSTOM_STRING_IDENTIFIER = new PropertyDescriptor.Builder()
        .name("Source Null String Identifier")
        .description("Custom string for identifying null strings in HDFS data.")
        .required(false)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * Property to provide custom string for identifying null non-strings in HDFS data.
     */
    public static final PropertyDescriptor SOURCE_NULL_CUSTOM_NON_STRING_IDENTIFIER = new PropertyDescriptor.Builder()
        .name("Source Null Non-String Identifier")
        .description("Custom string for identifying null non-strings in HDFS data.")
        .required(false)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * Property to provide the table to populate in the target relational system. NOTE: This table must already exist.
     */
    public static final PropertyDescriptor TARGET_TABLE_NAME = new PropertyDescriptor.Builder()
        .name("Target Table")
        .description("The table to populate in the target relational system. NOTE: This table must already exist.")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * Property to provide number of map tasks to export data in parallel.
     */
    public static final PropertyDescriptor CLUSTER_MAP_TASKS = new PropertyDescriptor.Builder()
        .name("Cluster Map Tasks")
        .description("Number of map tasks to export data in parallel. Valid values are from 1 to 25. "
                     + "Higher values put more load on the target relational system. "
                     + "Also, consider capacity of cluster when setting this property value.")
        .required(false)
        .expressionLanguageSupported(true)
        .defaultValue("4")
        .addValidator(StandardValidators.createLongValidator(1L, 25L, true))
        .build();
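    /*
     * For reference, the properties above are assembled by SqoopExportBuilder into a Sqoop export
     * invocation. An illustrative (not authoritative) shape of such a command, using standard
     * Sqoop export flags, is:
     *
     *   sqoop export
     *     --connect <jdbc-connection-string> --username <user> --password-file <hdfs-password-file>
     *     --table <target-table> --export-dir <source-hdfs-directory>
     *     --input-fields-terminated-by '<delimiter>'
     *     --input-null-string '<null-string-identifier>' --input-null-non-string '<null-non-string-identifier>'
     *     --num-mappers <cluster-map-tasks>
     *
     * The exact arguments depend on the builder and on the configured SqoopConnectionService
     * (for example password mode, connection manager, and driver).
     */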
    /**
     * Success relationship
     */
    public static final Relationship REL_SUCCESS = new Relationship.Builder()
        .name("success")
        .description("Sqoop export success")
        .build();

    /**
     * Failure relationship
     */
    public static final Relationship REL_FAILURE = new Relationship.Builder()
        .name("failure")
        .description("Sqoop export failure")
        .build();

    /*
     * Property for Kerberos service principal
     */
    private PropertyDescriptor KERBEROS_PRINCIPAL;

    /*
     * Property for Kerberos service keytab
     */
    private PropertyDescriptor KERBEROS_KEYTAB;

    private List<PropertyDescriptor> properties;
    private Set<Relationship> relationships;

    @Override
    protected void init(@Nonnull final ProcessorInitializationContext context) {
        super.init(context);

        /* Create Kerberos properties */
        final SpringSecurityContextLoader securityContextLoader = SpringSecurityContextLoader.create(context);
        final KerberosProperties kerberosProperties = securityContextLoader.getKerberosProperties();
        KERBEROS_KEYTAB = kerberosProperties.createKerberosKeytabProperty();
        KERBEROS_PRINCIPAL = kerberosProperties.createKerberosPrincipalProperty();

        /* Create list of properties */
        final List<PropertyDescriptor> properties = new ArrayList<>();
        properties.add(KERBEROS_PRINCIPAL);
        properties.add(KERBEROS_KEYTAB);
        properties.add(SQOOP_CONNECTION_SERVICE);
        properties.add(SOURCE_HDFS_DIRECTORY);
        properties.add(SOURCE_HDFS_FILE_DELIMITER);
        properties.add(SOURCE_NULL_INTERPRETATION_STRATEGY);
        properties.add(SOURCE_NULL_CUSTOM_STRING_IDENTIFIER);
        properties.add(SOURCE_NULL_CUSTOM_NON_STRING_IDENTIFIER);
        properties.add(TARGET_TABLE_NAME);
        properties.add(CLUSTER_MAP_TASKS);
        this.properties = Collections.unmodifiableList(properties);

        /* Create list of relationships */
        final Set<Relationship> relationships = new HashSet<>();
        relationships.add(REL_SUCCESS);
        relationships.add(REL_FAILURE);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return properties;
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }
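    /**
     * Builds the Sqoop export command from the configured properties, runs it in a shell process
     * (under the configured Kerberos identity, if any), writes result attributes to the flow file,
     * and routes the flow file to success or failure based on the command's exit code.
     */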
    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        final ComponentLog logger = getLog();
        FlowFile flowFile = session.get();

        if (flowFile == null) {
            flowFile = session.create();
            logger.info("Created a flow file having uuid: {}", new Object[]{flowFile.getAttribute(CoreAttributes.UUID.key())});
        } else {
            logger.info("Using an existing flow file having uuid: {}", new Object[]{flowFile.getAttribute(CoreAttributes.UUID.key())});
        }

        final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
        final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
        final SqoopConnectionService sqoopConnectionService = context.getProperty(SQOOP_CONNECTION_SERVICE).asControllerService(SqoopConnectionService.class);
        final String sourceHdfsDirectory = context.getProperty(SOURCE_HDFS_DIRECTORY).evaluateAttributeExpressions(flowFile).getValue();
        final String sourceHdfsFileDelimiter = context.getProperty(SOURCE_HDFS_FILE_DELIMITER).evaluateAttributeExpressions(flowFile).getValue();
        final ExportNullInterpretationStrategy sourceNullInterpretationStrategy =
            ExportNullInterpretationStrategy.valueOf(context.getProperty(SOURCE_NULL_INTERPRETATION_STRATEGY).getValue());
        final String sourceNullCustomStringIdentifier = context.getProperty(SOURCE_NULL_CUSTOM_STRING_IDENTIFIER).evaluateAttributeExpressions(flowFile).getValue();
        final String sourceNullCustomNonStringIdentifier = context.getProperty(SOURCE_NULL_CUSTOM_NON_STRING_IDENTIFIER).evaluateAttributeExpressions(flowFile).getValue();
        final String targetTableName = context.getProperty(TARGET_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
        final Integer clusterMapTasks = context.getProperty(CLUSTER_MAP_TASKS).evaluateAttributeExpressions(flowFile).asInteger();

        final String COMMAND_SHELL = "/bin/bash";
        final String COMMAND_SHELL_FLAGS = "-c";

        final StopWatch stopWatch = new StopWatch(false);

        KerberosConfig kerberosConfig = new KerberosConfig()
            .setLogger(logger)
            .setKerberosPrincipal(kerberosPrincipal)
            .setKerberosKeytab(kerberosKeyTab);

        SqoopExportBuilder sqoopExportBuilder = new SqoopExportBuilder();

        String sqoopExportCommand = sqoopExportBuilder
            .setLogger(logger)
            .setTargetConnectionString(sqoopConnectionService.getConnectionString())
            .setTargetUserName(sqoopConnectionService.getUserName())
            .setPasswordMode(sqoopConnectionService.getPasswordMode())
            .setTargetPasswordHdfsFile(sqoopConnectionService.getPasswordHdfsFile())
            .setTargetPasswordPassphrase(sqoopConnectionService.getPasswordPassphrase())
            .setTargetEnteredPassword(sqoopConnectionService.getEnteredPassword())
            .setTargetConnectionManager(sqoopConnectionService.getConnectionManager())
            .setTargetDriver(sqoopConnectionService.getDriver())
            .setTargetTableName(targetTableName)
            .setSourceHdfsDirectory(sourceHdfsDirectory)
            .setSourceHdfsFileDelimiter(sourceHdfsFileDelimiter)
            .setSourceNullInterpretationStrategy(sourceNullInterpretationStrategy)
            .setSourceNullInterpretationStrategyCustomNullString(sourceNullCustomStringIdentifier)
            .setSourceNullInterpretationStrategyCustomNullNonString(sourceNullCustomNonStringIdentifier)
            .setClusterMapTasks(clusterMapTasks)
            .build();

        List<String> sqoopExportExecutionCommand = new ArrayList<>();
        sqoopExportExecutionCommand.add(COMMAND_SHELL);
        sqoopExportExecutionCommand.add(COMMAND_SHELL_FLAGS);
        sqoopExportExecutionCommand.add(sqoopExportCommand);

        SqoopExportProcessRunner sqoopExportProcessRunner = new SqoopExportProcessRunner(kerberosConfig,
                                                                                         sqoopExportExecutionCommand,
                                                                                         logger);

        logger.info("Starting execution of Sqoop export command");
        stopWatch.start();
        SqoopProcessResult sqoopExportProcessResult = sqoopExportProcessRunner.execute();
        long jobDurationSeconds = stopWatch.getElapsed(TimeUnit.SECONDS);
        stopWatch.stop();
        logger.info("Finished execution of Sqoop export command");
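        // Collect the exit code and exported record count from the Sqoop run, and mask credentials
        // in the command text before it is written to a flow file attribute.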
        int resultExportStatus = sqoopExportProcessResult.getExitValue();
        SqoopUtils sqoopUtils = new SqoopUtils();
        long recordsExportCount = sqoopUtils.getSqoopExportRecordCount(sqoopExportProcessResult, logger);
        String sqoopExportCommandWithCredentialsMasked = sqoopUtils.maskCredentials(sqoopExportCommand, sqoopUtils.getCredentialsToMask());

        flowFile = session.putAttribute(flowFile, "sqoop.export.command.text", sqoopExportCommandWithCredentialsMasked);
        flowFile = session.putAttribute(flowFile, "sqoop.export.result.code", String.valueOf(resultExportStatus));
        flowFile = session.putAttribute(flowFile, "sqoop.export.run.seconds", String.valueOf(jobDurationSeconds));
        flowFile = session.putAttribute(flowFile, "sqoop.export.record.count", String.valueOf(recordsExportCount));
        flowFile = session.putAttribute(flowFile, "sqoop.export.output.table", targetTableName);
        logger.info("Wrote result attributes to flow file");

        if (resultExportStatus == 0) {
            logger.info("Sqoop Export OK [Code {}]", new Object[]{resultExportStatus});
            session.transfer(flowFile, REL_SUCCESS);
        } else {
            logger.error("Sqoop Export FAIL [Code {}]", new Object[]{resultExportStatus});
            session.transfer(flowFile, REL_FAILURE);
        }
    }

    /**
     * Called by the framework to perform additional validation on properties.
     *
     * @param validationContext used to retrieve the properties to check
     * @return a collection of {@link ValidationResult} which will be checked by the framework
     */
    @Override
    protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
        final List<ValidationResult> results = new ArrayList<>();

        final ExportNullInterpretationStrategy sourceNullInterpretationStrategy =
            ExportNullInterpretationStrategy.valueOf(validationContext.getProperty(SOURCE_NULL_INTERPRETATION_STRATEGY).getValue());
        final String sourceNullCustomStringIdentifier = validationContext.getProperty(SOURCE_NULL_CUSTOM_STRING_IDENTIFIER).evaluateAttributeExpressions().getValue();
        final String sourceNullCustomNonStringIdentifier = validationContext.getProperty(SOURCE_NULL_CUSTOM_NON_STRING_IDENTIFIER).evaluateAttributeExpressions().getValue();

        if (sourceNullInterpretationStrategy == ExportNullInterpretationStrategy.CUSTOM_VALUES) {
            if ((sourceNullCustomStringIdentifier == null) || (sourceNullCustomNonStringIdentifier == null)) {
                results.add(new ValidationResult.Builder()
                                .subject(this.getClass().getSimpleName())
                                .valid(false)
                                .explanation("For Custom Source Null Interpret Strategy, custom strings for identifying null strings and null non-strings in HDFS data must be provided.")
                                .build());
            }
        }

        return results;
    }
}