package com.thinkbiganalytics.nifi.v2.hdfs;
/*-
* #%L
* thinkbig-nifi-hadoop-processors
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.annotation.Nonnull;
/**
* Processor for performing a distributed copy between two HDFS clusters
*
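 * <p>Conceptually this performs the same work as the {@code hadoop distcp} command-line tool; an
 * illustrative invocation (the hostnames and paths below are hypothetical) would be:</p>
 * <pre>{@code
 * hadoop distcp hdfs://nn1:8020/source/a hdfs://nn1:8020/source/b hdfs://nn2:8020/dest
 * }</pre>
 *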
* @see <a href="http://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html">http://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html</a>
*/
@CapabilityDescription("Copies files from one HDFS location into another using DistCp MR job")
@EventDriven
@Tags({"hadoop", "HDFS", "filesystem", "thinkbig", "copy", "distributed copy", "distcp"})
public class DistCopyHDFS extends AbstractHadoopProcessor {
/**
* Relationship for failure
*/
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("At least one of the provided files not found")
.build();
/**
* Relationship for success
*/
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("Files copied to new location")
.build();
/**
     * Property to define the source path, which is used as the base path for all entries in {@link #FILES}
*/
public static final PropertyDescriptor SOURCE = new PropertyDescriptor.Builder()
.name("source.path")
.description("Absolute source path, if provided along with 'files' parameter will be treated as absolute " +
"path for relative paths provided in that parameter")
.required(false)
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Property that defines the destination path as an absolute path
*/
public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder()
.name("destination.path")
.description("Absolute destination path")
.required(true)
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
     * Property holding a JSON-encoded list of files of the form:
     * [{ "name" : "example" }]
*/
public static final PropertyDescriptor FILES = new PropertyDescriptor.Builder()
.name("files")
.description("JSON-encoded list of files, given like: " +
"[{\n" +
" \"name\": \"example\",\n" +
" }\n" +
"]")
.required(false)
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.expressionLanguageSupported(true)
.build();
/**
* Output paths to other NiFi processors
*/
private static final Set<Relationship> relationships = ImmutableSet.of(REL_FAILURE, REL_SUCCESS);
    /**
     * get the property descriptors supported by this processor, including those inherited from the base class
     *
     * @return a list of property descriptors
     */
    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return ImmutableList.<PropertyDescriptor>builder()
            .addAll(super.getSupportedPropertyDescriptors())
            .add(DESTINATION)
            .add(SOURCE)
            .add(FILES)
            .build();
    }
/**
* get the relationships required for the Processor
*
* @return a set of relationships
*/
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
/**
* onTrigger is called when the flow file proceeds through the processor
*
* @param context passed in by the framework and provides access to the data configured in the processor
* @param session passed in by the framework and provides access to the flow file
* @throws ProcessException if any framework actions fail
*/
@Override
public void onTrigger(@Nonnull final ProcessContext context, @Nonnull final ProcessSession session) throws ProcessException {
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
final FileSystem fs = getFileSystem(context);
if (fs == null) {
getLog().error("Couldn't initialize HDFS");
session.transfer(flowFile, REL_FAILURE);
return;
}
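        // evaluate each property with NiFi Expression Language against the incoming flow file's attributes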
String filesJSON = context.getProperty(FILES).evaluateAttributeExpressions(flowFile).getValue();
String source = context.getProperty(SOURCE).evaluateAttributeExpressions(flowFile).getValue();
String destination = context.getProperty(DESTINATION).evaluateAttributeExpressions(flowFile).getValue();
Gson jsonParser = new Gson();
File[] filesList;
ArrayList<Path> pathsList = new ArrayList<>();
try {
            if (filesJSON != null && !filesJSON.isEmpty()) {
filesList = jsonParser.fromJson(filesJSON, File[].class);
if (filesList == null) {
filesList = new File[0];
}
if (source != null && !source.isEmpty()) {
for (File f : filesList) {
pathsList.add(new Path(source, f.getName()));
}
} else {
for (File f : filesList) {
pathsList.add(new Path(f.getName()));
}
}
} else {
if (source == null || source.isEmpty()) {
getLog().error(String.format("At least one of attributes: %s or %s needs to be set",
SOURCE.getName(), FILES.getName()));
session.transfer(flowFile, REL_FAILURE);
return;
}
pathsList.add(new Path(source));
}
            DistCp distCp = getDistCp(pathsList, new Path(destination));
            Job job = distCp.execute();
            if (!job.waitForCompletion(false)) {
                getLog().error("DistCp job did not complete successfully");
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
} catch (JsonSyntaxException e) {
getLog().error("Files list attribute does not contain a proper JSON array");
session.transfer(flowFile, REL_FAILURE);
return;
} catch (Exception e) {
getLog().error("Exception during processor execution: " + e.getMessage());
session.transfer(flowFile, REL_FAILURE);
return;
}
session.transfer(flowFile, REL_SUCCESS);
}
/**
* method to construct a new DistCp object to perform the distcp
*
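     * <p>A minimal usage sketch (paths are illustrative):</p>
     * <pre>{@code
     * DistCp distCp = getDistCp(Arrays.asList(new Path("/src/a"), new Path("/src/b")), new Path("/dest"));
     * Job job = distCp.execute();
     * }</pre>
     *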
* @param pathsList A list of paths to be recursively copied from one cluster to another
* @param destination The root location on the target cluster
* @return a DistCp object
* @throws Exception if the construction of the {@link DistCp} object fails for any reason
*/
protected DistCp getDistCp(List<Path> pathsList, Path destination) throws Exception {
final Configuration conf = getConfiguration();
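        // the (List<Path>, Path) constructor sets up a copy of each source path into the destination;
        // DistCp copies directories in the source list recursively by default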
DistCpOptions opts = new DistCpOptions(pathsList, destination);
return new DistCp(conf, opts);
}
    /**
     * Minimal bean representing one entry of the {@code files} JSON list; declared static so that Gson
     * can instantiate it without a reference to an enclosing processor instance.
     */
    static class File {
private String name;
public File(String name) {
this.name = name;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
}