package com.thinkbiganalytics.nifi.v2.ingest;
/*-
* #%L
* thinkbig-nifi-core-processors
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.ingest.TableMergeSyncSupport;
import com.thinkbiganalytics.nifi.processor.AbstractNiFiProcessor;
import com.thinkbiganalytics.nifi.v2.thrift.ThriftService;
import com.thinkbiganalytics.util.ColumnSpec;
import com.thinkbiganalytics.util.PartitionSpec;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StopWatch;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FEED_PARTITION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FIELD_SPECIFICATION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.PARTITION_SPECIFICATION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.REL_FAILURE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.REL_SUCCESS;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.SOURCE_SCHEMA;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.SOURCE_TABLE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.TARGET_SCHEMA;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.TARGET_TABLE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.THRIFT_SERVICE;
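/**
 * Merges or fully synchronizes data from a source (feed) table partition into a target Hive table.
 * The merge strategy, table names, and partition specification are resolved per FlowFile, and the
 * actual Hive DDL/DML is delegated to {@link TableMergeSyncSupport}.
 */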
@EventDriven
@InputRequirement(InputRequirement.Requirement.INPUT_ALLOWED)
@Tags({"hive", "ddl", "merge", "sync", "thinkbig"})
@CapabilityDescription("Fully synchronize or merge values from a feed partition into the target table, optionally supporting de-duplication and partition overwrite. Sync will overwrite the entire "
+ "table to match the source."
)
public class MergeTable extends AbstractNiFiProcessor {
/**
* Merge using the primary key, updating existing rows and inserting new ones.
*/
public static final String STRATEGY_PK_MERGE = "PK_MERGE";
/**
* Merge while removing duplicate rows.
*/
public static final String STRATEGY_DEDUPE_MERGE = "DEDUPE_AND_MERGE";
/**
* Merge allowing duplicates.
*/
public static final String STRATEGY_MERGE = "MERGE";
/**
* Sync: replace everything in the target table with the source data.
*/
public static final String STRATEGY_SYNC = "SYNC";
/**
* Rolling sync: same as sync but at the partition level, overwriting only the partitions present in the source.
*/
public static final String STRATEGY_ROLLING_SYNC = "ROLLING_SYNC";
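// The strategy is typically supplied per feed through the ${metadata.table.targetMergeStrategy}
// attribute (the default below); a flow can also pin one of the literal values, e.g. "DEDUPE_AND_MERGE".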
public static final PropertyDescriptor MERGE_STRATEGY = new PropertyDescriptor.Builder()
.name("Merge Strategy")
.description(
"Specifies the algorithm used to merge. Valid values are SYNC, MERGE, PK_MERGE, DEDUPE_AND_MERGE, and ROLLING_SYNC. Sync will completely overwrite the target table with the source data. "
+ "Rolling Sync will overwrite target partitions only when they are present in the source. "
+ "Merge will append the data into the target partitions. "
+ "Dedupe will insert into the target partition while ensuring no duplicate rows remain. "
+ "PK Merge will insert new rows and update existing rows that match on the primary key.")
.required(true)
.expressionLanguageSupported(true)
.allowableValues(STRATEGY_MERGE, STRATEGY_DEDUPE_MERGE, STRATEGY_PK_MERGE, STRATEGY_SYNC, STRATEGY_ROLLING_SYNC, "${metadata.table.targetMergeStrategy}")
.defaultValue("${metadata.table.targetMergeStrategy}")
.build();
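// Illustrative (not prescriptive) value, assuming name=value entries:
// "hive.mapred.mode=nonstrict|hive.optimize.sort.dynamic.partition=false"
// Each pipe-delimited entry is applied to the Hive session before the merge runs (see onTrigger below).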
public static final PropertyDescriptor HIVE_CONFIGURATIONS = new PropertyDescriptor.Builder()
.name("Hive Configurations")
.description("Pipe separated list of Hive Configurations that you would like to set for Hive queries ")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(true)
.build();
private final Set<Relationship> relationships;
private final List<PropertyDescriptor> propDescriptors;
public MergeTable() {
final Set<Relationship> r = new HashSet<>();
r.add(REL_SUCCESS);
r.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(r);
final List<PropertyDescriptor> pds = new ArrayList<>();
pds.add(THRIFT_SERVICE);
pds.add(MERGE_STRATEGY);
pds.add(SOURCE_SCHEMA);
pds.add(SOURCE_TABLE);
pds.add(TARGET_SCHEMA);
pds.add(TARGET_TABLE);
pds.add(FEED_PARTITION);
pds.add(PARTITION_SPECIFICATION);
pds.add(FIELD_SPECIFICATION);
pds.add(HIVE_CONFIGURATIONS);
propDescriptors = Collections.unmodifiableList(pds);
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propDescriptors;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
final String provenanceExecutionStatusKey = context.getName() + " Execution Status";
ThriftService thriftService = context.getProperty(THRIFT_SERVICE).asControllerService(ThriftService.class);
String partitionSpecString = context.getProperty(PARTITION_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue();
String sourceSchema = context.getProperty(SOURCE_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
String sourceTable = context.getProperty(SOURCE_TABLE).evaluateAttributeExpressions(flowFile).getValue();
String targetSchema = context.getProperty(TARGET_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
String targetTable = context.getProperty(TARGET_TABLE).evaluateAttributeExpressions(flowFile).getValue();
String feedPartitionValue = context.getProperty(FEED_PARTITION).evaluateAttributeExpressions(flowFile).getValue();
String mergeStrategyValue = context.getProperty(MERGE_STRATEGY).evaluateAttributeExpressions(flowFile).getValue();
String hiveConfigurations = context.getProperty(HIVE_CONFIGURATIONS).evaluateAttributeExpressions(flowFile).getValue();
final ColumnSpec[] columnSpecs = Optional.ofNullable(context.getProperty(FIELD_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue())
.filter(StringUtils::isNotEmpty)
.map(ColumnSpec::createFromString)
.orElse(new ColumnSpec[0]);
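// The field specification is parsed into ColumnSpec objects; PK_MERGE relies on this metadata to
// identify primary key columns, so its absence is treated as a failure below.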
if (STRATEGY_PK_MERGE.equals(mergeStrategyValue) && columnSpecs.length == 0) {
logger.error("Missing required field specification for PK merge feature");
flowFile = session.putAttribute(flowFile, provenanceExecutionStatusKey, "Failed: Missing required field specification for PK merge feature");
session.transfer(flowFile, REL_FAILURE);
return;
}
// Maintain default for backward compatibility
if (StringUtils.isEmpty(mergeStrategyValue)) {
mergeStrategyValue = STRATEGY_DEDUPE_MERGE;
}
logger.info("Merge strategy: {} source: {} target: {} feed partition: {} partition spec: {}",
new Object[]{mergeStrategyValue, sourceTable, targetTable, feedPartitionValue, partitionSpecString});
final StopWatch stopWatch = new StopWatch(true);
try (final Connection conn = thriftService.getConnection()) {
TableMergeSyncSupport mergeSupport = new TableMergeSyncSupport(conn);
mergeSupport.enableDynamicPartitions();
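// Dynamic partitioning lets Hive derive the target partition values from the selected data
// rather than requiring them to be declared statically in the insert statement.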
if (StringUtils.isNotEmpty(hiveConfigurations)) {
mergeSupport.setHiveConf(hiveConfigurations.split("\\|"));
}
PartitionSpec partitionSpec = new PartitionSpec(partitionSpecString);
if (STRATEGY_DEDUPE_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, true);
} else if (STRATEGY_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, false);
} else if (STRATEGY_SYNC.equals(mergeStrategyValue)) {
mergeSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
} else if (STRATEGY_ROLLING_SYNC.equals(mergeStrategyValue)) {
mergeSupport.doRollingSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
} else if (STRATEGY_PK_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, columnSpecs);
} else {
throw new UnsupportedOperationException("Unsupported merge strategy: " + mergeStrategyValue);
}
stopWatch.stop();
session.getProvenanceReporter().modifyContent(flowFile, "Execution completed", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
flowFile = session.putAttribute(flowFile, provenanceExecutionStatusKey, "Successful");
session.transfer(flowFile, REL_SUCCESS);
} catch (final Exception e) {
logger.error("Unable to execute merge doMerge for {} due to {}; routing to failure", new Object[]{flowFile, e}, e);
flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: " + e.getMessage());
session.transfer(flowFile, REL_FAILURE);
}
}
}
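/*
 * Minimal usage sketch with NiFi's test framework. This is illustrative only and makes
 * assumptions beyond this class: "thriftService" stands in for a ThriftService implementation,
 * and the schema/table/partition values are placeholders.
 *
 *   TestRunner runner = TestRunners.newTestRunner(MergeTable.class);
 *   runner.addControllerService("thrift", thriftService);
 *   runner.enableControllerService(thriftService);
 *   runner.setProperty(IngestProperties.THRIFT_SERVICE, "thrift");
 *   runner.setProperty(MergeTable.MERGE_STRATEGY, MergeTable.STRATEGY_SYNC);
 *   runner.setProperty(IngestProperties.SOURCE_SCHEMA, "feeds");
 *   runner.setProperty(IngestProperties.SOURCE_TABLE, "my_feed");
 *   runner.setProperty(IngestProperties.TARGET_SCHEMA, "warehouse");
 *   runner.setProperty(IngestProperties.TARGET_TABLE, "my_feed");
 *   runner.setProperty(IngestProperties.FEED_PARTITION, "20170101000000");
 *   runner.enqueue(new byte[0]);
 *   runner.run();
 *   runner.assertAllFlowFilesTransferred(IngestProperties.REL_SUCCESS);
 */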