/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.giraph.hive;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.giraph.conf.GiraphConfiguration;
import org.apache.giraph.conf.GiraphConstants;
import org.apache.giraph.graph.Computation;
import org.apache.giraph.hive.common.HiveUtils;
import org.apache.giraph.hive.input.edge.HiveEdgeInputFormat;
import org.apache.giraph.hive.input.edge.HiveToEdge;
import org.apache.giraph.hive.input.vertex.HiveToVertex;
import org.apache.giraph.hive.input.vertex.HiveVertexInputFormat;
import org.apache.giraph.hive.output.HiveVertexOutputFormat;
import org.apache.giraph.hive.output.VertexToHive;
import org.apache.giraph.io.formats.multi.EdgeInputFormatDescription;
import org.apache.giraph.io.formats.multi.InputFormatDescription;
import org.apache.giraph.io.formats.multi.MultiEdgeInputFormat;
import org.apache.giraph.io.formats.multi.MultiVertexInputFormat;
import org.apache.giraph.io.formats.multi.VertexInputFormatDescription;
import org.apache.giraph.job.GiraphJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.util.Arrays;
import java.util.List;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_EDGE_INPUT;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_INPUT;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_OUTPUT_DATABASE;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_OUTPUT_PARTITION;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_OUTPUT_PROFILE_ID;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_OUTPUT_TABLE;
import static org.apache.giraph.hive.common.GiraphHiveConstants.VERTEX_TO_HIVE_CLASS;
/**
 * Command-line runner that configures and launches a Giraph job which reads
 * its vertices and/or edges from Hive tables and, unless output is skipped,
 * writes vertices back to a Hive table.
 *
 * <p>Inputs and output can be registered programmatically through
 * {@link #addVertexInput}, {@link #addEdgeInput} and
 * {@link #setVertexOutput}, or on the command line through the
 * {@code -vertexInput}, {@code -edgeInput} and {@code -output} options.
 * Subclasses can extend the behavior by overriding
 * {@link #addMoreOptions(Options)}, {@link #processMoreArguments(CommandLine)}
 * and {@link #initGiraphJob(GiraphJob)}.
 */
public class HiveGiraphRunner implements Tool {
  /** logger */
  private static final Logger LOG = Logger.getLogger(HiveGiraphRunner.class);
  /** Prefix for log statements */
  private static final String LOG_PREFIX = "\t";

  /** Number of workers, parsed from the -workers option */
  protected int workers;
  /** Whether the -verbose option was given */
  protected boolean isVerbose;

  /** Computation class to run */
  private Class<? extends Computation> computationClass;
  /** Descriptions of vertex input formats */
  private final List<VertexInputFormatDescription> vertexInputDescriptions =
      Lists.newArrayList();
  /** Descriptions of edge input formats */
  private final List<EdgeInputFormatDescription> edgeInputDescriptions =
      Lists.newArrayList();
  /** Hive Vertex writer */
  private Class<? extends VertexToHive> vertexToHiveClass;
  /** Skip output? (Useful for testing without writing) */
  private boolean skipOutput = false;
  /** Configuration */
  private Configuration conf;

  /** Create a new runner backed by a fresh {@link HiveConf} */
  public HiveGiraphRunner() {
    conf = new HiveConf(getClass());
  }

  /**
   * Get the computation class that will be run.
   *
   * @return Computation class, or null if none has been set yet
   */
  public Class<? extends Computation> getComputationClass() {
    return computationClass;
  }

  /**
   * Set the computation class to run. When a class is set before the command
   * line is parsed, the -computationClass option is not offered.
   *
   * @param computationClass Computation class to use
   */
  public void setComputationClass(
      Class<? extends Computation> computationClass) {
    this.computationClass = computationClass;
  }

  /**
   * Get the registered vertex input descriptions.
   *
   * @return the internal (live, mutable) list of vertex input descriptions
   */
  public List<VertexInputFormatDescription> getVertexInputDescriptions() {
    return vertexInputDescriptions;
  }

  /**
   * Whether to use vertex input.
   *
   * @return true if vertex input enabled (at least one HiveToVertex is set).
   */
  public boolean hasVertexInput() {
    return !vertexInputDescriptions.isEmpty();
  }

  /**
   * Add vertex input
   *
   * @param hiveToVertexClass HiveToVertex class to use
   * @param tableName Table name
   * @param partitionFilter Partition filter, or null/empty if no filter used
   * @param additionalOptions Additional options, in the form "option=value"
   * @throws IllegalStateException if an additional option is not in
   *                               'name=value' format
   */
  public void addVertexInput(Class<? extends HiveToVertex> hiveToVertexClass,
      String tableName, String partitionFilter, String ... additionalOptions) {
    VertexInputFormatDescription description =
        new VertexInputFormatDescription(HiveVertexInputFormat.class);
    description.addParameter(
        HIVE_VERTEX_INPUT.getClassOpt().getKey(), hiveToVertexClass.getName());
    // Profile id is made unique per input by appending the input's index
    description.addParameter(HIVE_VERTEX_INPUT.getProfileIdOpt().getKey(),
        "vertex_input_profile_" + vertexInputDescriptions.size());
    description.addParameter(
        HIVE_VERTEX_INPUT.getTableOpt().getKey(), tableName);
    if (partitionFilter != null && !partitionFilter.isEmpty()) {
      description.addParameter(
          HIVE_VERTEX_INPUT.getPartitionOpt().getKey(), partitionFilter);
    }
    addAdditionalOptions(description, additionalOptions);
    vertexInputDescriptions.add(description);
  }

  /**
   * Get the registered edge input descriptions.
   *
   * @return the internal (live, mutable) list of edge input descriptions
   */
  public List<EdgeInputFormatDescription> getEdgeInputDescriptions() {
    return edgeInputDescriptions;
  }

  /**
   * Whether to use edge input.
   *
   * @return true if edge input enabled (at least one HiveToEdge is set).
   */
  public boolean hasEdgeInput() {
    return !edgeInputDescriptions.isEmpty();
  }

  /**
   * Add edge input
   *
   * @param hiveToEdgeClass HiveToEdge class to use
   * @param tableName Table name
   * @param partitionFilter Partition filter, or null/empty if no filter used
   * @param additionalOptions Additional options, in the form "option=value"
   * @throws IllegalStateException if an additional option is not in
   *                               'name=value' format
   */
  public void addEdgeInput(Class<? extends HiveToEdge> hiveToEdgeClass,
      String tableName, String partitionFilter, String ... additionalOptions) {
    EdgeInputFormatDescription description =
        new EdgeInputFormatDescription(HiveEdgeInputFormat.class);
    description.addParameter(
        HIVE_EDGE_INPUT.getClassOpt().getKey(), hiveToEdgeClass.getName());
    // Profile id is made unique per input by appending the input's index
    description.addParameter(HIVE_EDGE_INPUT.getProfileIdOpt().getKey(),
        "edge_input_profile_" + edgeInputDescriptions.size());
    description.addParameter(
        HIVE_EDGE_INPUT.getTableOpt().getKey(), tableName);
    if (partitionFilter != null && !partitionFilter.isEmpty()) {
      description.addParameter(
          HIVE_EDGE_INPUT.getPartitionOpt().getKey(), partitionFilter);
    }
    addAdditionalOptions(description, additionalOptions);
    edgeInputDescriptions.add(description);
  }

  /**
   * Add additional options to InputFormatDescription
   *
   * @param description InputFormatDescription
   * @param additionalOptions Additional options, each as "name=value"
   * @throws IllegalStateException if an option does not split into exactly
   *                               a name and a value around '='
   */
  private static void addAdditionalOptions(InputFormatDescription description,
      String ... additionalOptions) {
    for (String additionalOption : additionalOptions) {
      String[] nameValue = split(additionalOption, "=");
      if (nameValue.length != 2) {
        throw new IllegalStateException("Invalid additional option format " +
            additionalOption + ", 'name=value' format expected");
      }
      description.addParameter(nameValue[0], nameValue[1]);
    }
  }

  /**
   * Get the class used for writing vertices to Hive.
   *
   * @return VertexToHive class, or null if no output has been set
   */
  public Class<? extends VertexToHive> getVertexToHiveClass() {
    return vertexToHiveClass;
  }

  /**
   * Whether we are writing vertices out.
   *
   * @return true if vertex output enabled
   */
  public boolean hasVertexOutput() {
    return !skipOutput && vertexToHiveClass != null;
  }

  /**
   * Set vertex output
   *
   * @param vertexToHiveClass class for writing vertices to Hive.
   * @param tableName Table name
   * @param partitionFilter Partition filter, or null if no filter used
   */
  public void setVertexOutput(
      Class<? extends VertexToHive> vertexToHiveClass, String tableName,
      String partitionFilter) {
    this.vertexToHiveClass = vertexToHiveClass;
    VERTEX_TO_HIVE_CLASS.set(conf, vertexToHiveClass);
    HIVE_VERTEX_OUTPUT_PROFILE_ID.set(conf, "vertex_output_profile");
    HIVE_VERTEX_OUTPUT_TABLE.set(conf, tableName);
    if (partitionFilter != null) {
      HIVE_VERTEX_OUTPUT_PARTITION.set(conf, partitionFilter);
    }
  }

  /**
   * main method
   * @param args system arguments
   * @throws Exception any errors from Hive Giraph Runner
   */
  public static void main(String[] args) throws Exception {
    HiveGiraphRunner runner = new HiveGiraphRunner();
    System.exit(ToolRunner.run(runner, args));
  }

  /**
   * Parse the command line, configure a {@link GiraphJob} and run it.
   *
   * @param args command-line arguments
   * @return 0 if help was printed or the job succeeded, -1 if an argument
   *         was rejected with an IllegalArgumentException or the job failed
   * @throws Exception any other error from argument parsing or the job
   */
  @Override
  public final int run(String[] args) throws Exception {
    // process args
    try {
      handleCommandLine(args);
    } catch (InterruptedException e) {
      // Not a real thread interruption: handleCommandLine throws this as a
      // signal that usage was printed, so there is nothing left to do.
      return 0;
    } catch (IllegalArgumentException e) {
      System.err.println(e.getMessage());
      return -1;
    }

    // additional configuration for Hive
    HiveUtils.addHadoopClasspathToTmpJars(conf);
    HiveUtils.addHiveSiteXmlToTmpFiles(conf);

    // setup GiraphJob
    GiraphJob job = new GiraphJob(getConf(), getClass().getName());
    GiraphConfiguration giraphConf = job.getConfiguration();
    giraphConf.setComputationClass(computationClass);
    // Same worker count is passed for both count arguments.
    // NOTE(review): 100.0f is presumably the minimum percentage of workers
    // that must respond - confirm against GiraphConfiguration.
    giraphConf.setWorkerConfiguration(workers, workers, 100.0f);
    initGiraphJob(job);

    logOptions(giraphConf);

    return job.run(isVerbose) ? 0 : -1;
  }

  /**
   * Prepare vertex input settings in Configuration. With exactly one
   * description its input format class and parameters are written directly;
   * otherwise {@link MultiVertexInputFormat} is used and all descriptions are
   * stored as a JSON string.
   */
  @SuppressWarnings("unchecked")
  public void prepareHiveVertexInputs() {
    if (vertexInputDescriptions.size() == 1) {
      GiraphConstants.VERTEX_INPUT_FORMAT_CLASS.set(conf,
          vertexInputDescriptions.get(0).getInputFormatClass());
      vertexInputDescriptions.get(0).putParametersToConfiguration(conf);
    } else {
      GiraphConstants.VERTEX_INPUT_FORMAT_CLASS.set(conf,
          MultiVertexInputFormat.class);
      VertexInputFormatDescription.VERTEX_INPUT_FORMAT_DESCRIPTIONS.set(conf,
          InputFormatDescription.toJsonString(vertexInputDescriptions));
    }
  }

  /**
   * Prepare edge input settings in Configuration. With exactly one
   * description its input format class and parameters are written directly;
   * otherwise {@link MultiEdgeInputFormat} is used and all descriptions are
   * stored as a JSON string.
   */
  @SuppressWarnings("unchecked")
  public void prepareHiveEdgeInputs() {
    if (edgeInputDescriptions.size() == 1) {
      GiraphConstants.EDGE_INPUT_FORMAT_CLASS.set(conf,
          edgeInputDescriptions.get(0).getInputFormatClass());
      edgeInputDescriptions.get(0).putParametersToConfiguration(conf);
    } else {
      GiraphConstants.EDGE_INPUT_FORMAT_CLASS.set(conf,
          MultiEdgeInputFormat.class);
      EdgeInputFormatDescription.EDGE_INPUT_FORMAT_DESCRIPTIONS.set(conf,
          InputFormatDescription.toJsonString(edgeInputDescriptions));
    }
  }

  /**
   * Prepare output settings in Configuration
   */
  public void prepareHiveOutput() {
    GiraphConstants.VERTEX_OUTPUT_FORMAT_CLASS.set(conf,
        HiveVertexOutputFormat.class);
  }

  /**
   * process arguments
   * @param args to process
   * @return CommandLine instance
   * @throws org.apache.commons.cli.ParseException error parsing arguments
   * @throws InterruptedException used as a signal that usage was printed
   *         (no arguments given, or -help requested)
   * @throws IllegalArgumentException if a required option is missing, a class
   *         cannot be resolved, or the worker count is not an integer
   * @throws IllegalStateException if an input or output description is
   *         malformed
   */
  private CommandLine handleCommandLine(String[] args) throws ParseException,
      InterruptedException {
    Options options = new Options();
    addOptions(options);
    addMoreOptions(options);

    CommandLineParser parser = new GnuParser();
    final CommandLine cmdln = parser.parse(options, args);
    if (args.length == 0 || cmdln.hasOption("help")) {
      new HelpFormatter().printHelp(getClass().getName(), options, true);
      throw new InterruptedException();
    }

    // Giraph classes
    String computationClassStr = cmdln.getOptionValue("computationClass");
    if (computationClassStr != null) {
      computationClass = findClass(computationClassStr, Computation.class);
    }
    if (computationClass == null) {
      throw new IllegalArgumentException(
          "Need the Giraph " + Computation.class.getSimpleName() +
              " class name (-computationClass) to use");
    }

    // Command-line vertex inputs replace any inputs added programmatically
    String[] vertexInputs = cmdln.getOptionValues("vertexInput");
    if (vertexInputs != null && vertexInputs.length != 0) {
      vertexInputDescriptions.clear();
      for (String vertexInput : vertexInputs) {
        String[] parameters = split(vertexInput, ",");
        if (parameters.length < 2) {
          throw new IllegalStateException("Illegal vertex input description " +
              vertexInput + " - HiveToVertex class and table name needed");
        }
        addVertexInput(findClass(parameters[0], HiveToVertex.class),
            parameters[1], elementOrNull(parameters, 2),
            copyOfArray(parameters, 3));
      }
    }

    // Command-line edge inputs replace any inputs added programmatically
    String[] edgeInputs = cmdln.getOptionValues("edgeInput");
    if (edgeInputs != null && edgeInputs.length != 0) {
      edgeInputDescriptions.clear();
      for (String edgeInput : edgeInputs) {
        String[] parameters = split(edgeInput, ",");
        if (parameters.length < 2) {
          throw new IllegalStateException("Illegal edge input description " +
              edgeInput + " - HiveToEdge class and table name needed");
        }
        addEdgeInput(findClass(parameters[0], HiveToEdge.class),
            parameters[1], elementOrNull(parameters, 2),
            copyOfArray(parameters, 3));
      }
    }

    String output = cmdln.getOptionValue("output");
    if (output != null) {
      // Partition filter can contain commas so we limit the number of times
      // we split
      String[] parameters = split(output, ",", 3);
      if (parameters.length < 2) {
        throw new IllegalStateException("Illegal output description " +
            output + " - VertexToHive class and table name needed");
      }
      setVertexOutput(findClass(parameters[0], VertexToHive.class),
          parameters[1], elementOrNull(parameters, 2));
    }

    if (cmdln.hasOption("skipOutput")) {
      skipOutput = true;
    }

    if (!hasVertexInput() && !hasEdgeInput()) {
      throw new IllegalArgumentException(
          "Need at least one of Giraph " +
              HiveToVertex.class.getSimpleName() +
              " (-vertexInput) and " +
              HiveToEdge.class.getSimpleName() +
              " (-edgeInput)");
    }
    if (vertexToHiveClass == null && !skipOutput) {
      throw new IllegalArgumentException(
          "Need the Giraph " + VertexToHive.class.getSimpleName() +
              " (-output) to use");
    }
    String workersStr = cmdln.getOptionValue("workers");
    if (workersStr == null) {
      throw new IllegalArgumentException(
          "Need to choose the number of workers (-w)");
    }

    // The same database name is applied to every configured input and output
    String dbName = cmdln.getOptionValue("dbName", "default");

    if (hasVertexInput()) {
      HIVE_VERTEX_INPUT.getDatabaseOpt().set(conf, dbName);
      prepareHiveVertexInputs();
    }

    if (hasEdgeInput()) {
      HIVE_EDGE_INPUT.getDatabaseOpt().set(conf, dbName);
      prepareHiveEdgeInputs();
    }

    if (!skipOutput) {
      HIVE_VERTEX_OUTPUT_DATABASE.set(conf, dbName);
      prepareHiveOutput();
    } else {
      LOG.warn("run: Warning - Output will be skipped!");
    }

    try {
      workers = Integer.parseInt(workersStr);
    } catch (NumberFormatException e) {
      // Rethrow with a message that names the option; still an
      // IllegalArgumentException so run() reports it and returns -1.
      throw new IllegalArgumentException(
          "Number of workers (-w) must be an integer, got '" + workersStr +
              "'", e);
    }

    isVerbose = cmdln.hasOption("verbose");

    // pick up -hiveconf arguments
    HiveUtils.processHiveconfOptions(cmdln.getOptionValues("hiveconf"), conf);

    processMoreArguments(cmdln);

    return cmdln;
  }

  /**
   * Add hive-related options to command line parser options
   *
   * @param options Options to use
   */
  private void addOptions(Options options) {
    options.addOption("h", "help", false, "Help");
    options.addOption("v", "verbose", false, "Verbose");
    options.addOption("D", "hiveconf", true,
        "property=value for Hive/Hadoop configuration");
    options.addOption("w", "workers", true, "Number of workers");
    // Only offer -computationClass when it was not already set in code
    if (computationClass == null) {
      options.addOption(null, "computationClass", true,
          "Giraph Computation class to use");
    }
    options.addOption("db", "dbName", true, "Hive database name");

    // Vertex input settings
    options.addOption("vi", "vertexInput", true, getInputOptionDescription(
        "vertex", HiveToVertex.class.getSimpleName()));

    // Edge input settings
    options.addOption("ei", "edgeInput", true, getInputOptionDescription(
        "edge", HiveToEdge.class.getSimpleName()));

    // Vertex output settings
    options.addOption("o", "output", true,
        "Giraph " + VertexToHive.class.getSimpleName() + " class to use," +
            " table name and partition filter (optional). Example:\n" +
            "\"MyVertexToHive, myTableName, a=1,b=two\"");
    options.addOption("s", "skipOutput", false, "Skip output?");
  }

  /**
   * Get description for the input format option (vertex or edge).
   *
   * @param inputType Type of input (vertex or edge)
   * @param hiveToObjectClassName HiveToVertex or HiveToEdge
   * @return Description for the input format option
   */
  private static String getInputOptionDescription(String inputType,
      String hiveToObjectClassName) {
    StringBuilder inputOption = new StringBuilder();
    inputOption.append("Giraph ").append(hiveToObjectClassName).append(
        " class to use, table name and partition filter (optional).");
    inputOption.append(" Additional options for the input format can be " +
        "specified as well.");
    inputOption.append(" You can set as many ").append(inputType).append(
        " inputs as you like.");
    inputOption.append(" Example:\n");
    inputOption.append("\"My").append(hiveToObjectClassName).append(
        ", myTableName, a<2 AND b='two', option1=value1, option2=value2\"");
    return inputOption.toString();
  }

  /**
   * Load a class by name and check that it is a subtype of the expected
   * base type.
   *
   * @param className to find
   * @param base base class
   * @param <T> class type found
   * @return the loaded class, typed as a subclass of base; never null
   * @throws IllegalArgumentException if the class cannot be loaded or is not
   *         assignable to base
   */
  private <T> Class<? extends T> findClass(String className, Class<T> base) {
    Class<?> cls;
    try {
      cls = Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException(
          className + ": Invalid class name", e);
    }
    // Fail here with a clear message instead of returning null, which
    // callers would dereference or report as a missing option.
    if (!base.isAssignableFrom(cls)) {
      throw new IllegalArgumentException(className +
          ": class is not a subtype of " + base.getName());
    }
    return cls.asSubclass(base);
  }

  /**
   * Get the configuration used by this runner.
   *
   * @return current Configuration
   */
  @Override
  public final Configuration getConf() {
    return conf;
  }

  /**
   * Replace the configuration with a new {@link GiraphConfiguration} built
   * from the given one.
   *
   * @param conf Configuration to wrap
   */
  @Override
  public final void setConf(Configuration conf) {
    this.conf = new GiraphConfiguration(conf);
  }

  /**
   * Override this method to add more command-line options. You can process
   * them by also overriding {@link #processMoreArguments(CommandLine)}.
   *
   * @param options Options
   */
  protected void addMoreOptions(Options options) {
  }

  /**
   * Override this method to process additional command-line arguments. You
   * may want to declare additional options by also overriding
   * {@link #addMoreOptions(org.apache.commons.cli.Options)}.
   *
   * @param cmd Command
   */
  protected void processMoreArguments(CommandLine cmd) {
  }

  /**
   * Override this method to do additional setup with the GiraphJob that will
   * run.
   *
   * @param job GiraphJob that is going to run
   */
  protected void initGiraphJob(GiraphJob job) { }

  /**
   * Log the options set by user
   *
   * @param giraphConf GiraphConfiguration
   */
  private void logOptions(GiraphConfiguration giraphConf) {
    LOG.info(getClass().getSimpleName() + " with");

    LOG.info(LOG_PREFIX + "-computationClass=" +
        computationClass.getCanonicalName());

    for (VertexInputFormatDescription description : vertexInputDescriptions) {
      LOG.info(LOG_PREFIX + "Vertex input: " + description);
    }

    for (EdgeInputFormatDescription description : edgeInputDescriptions) {
      LOG.info(LOG_PREFIX + "Edge input: " + description);
    }

    // vertexToHiveClass is null when output is skipped without -output, yet
    // the output format key may still be present in the job configuration;
    // guard so that logging cannot fail with a NullPointerException.
    if (vertexToHiveClass != null &&
        GiraphConstants.VERTEX_OUTPUT_FORMAT_CLASS.contains(giraphConf)) {
      LOG.info(LOG_PREFIX + "Output: VertexToHive=" +
          vertexToHiveClass.getCanonicalName() + ", table=" +
          HIVE_VERTEX_OUTPUT_TABLE.get(conf) + ", partition=\"" +
          HIVE_VERTEX_OUTPUT_PARTITION.get(conf) + "\"");
    }

    LOG.info(LOG_PREFIX + "-workers=" + workers);
  }

  /**
   * Split a string using separator and trim the results
   *
   * @param stringToSplit String to split
   * @param separator Separator
   * @return Separated strings, trimmed
   */
  private static String[] split(String stringToSplit, String separator) {
    return split(stringToSplit, separator, -1);
  }

  /**
   * Split a string using separator and trim the results
   *
   * @param stringToSplit String to split
   * @param separator Separator
   * @param limit Maximum number of pieces; values of zero or less mean
   *              no limit is applied
   * @return Separated strings, trimmed
   */
  private static String[] split(String stringToSplit, String separator,
      int limit) {
    Splitter splitter = Splitter.on(separator).trimResults();
    if (limit > 0) {
      splitter = splitter.limit(limit);
    }
    return Iterables.toArray(splitter.split(stringToSplit), String.class);
  }

  /**
   * Get the element in array at certain position, or null if the position is
   * out of array size
   *
   * @param array Array
   * @param position Position
   * @return Element at the position or null if the position is out of array
   */
  private static String elementOrNull(String[] array, int position) {
    return (position < array.length) ? array[position] : null;
  }

  /**
   * Return a copy of array from some position to the end,
   * or empty array if startIndex is out of array size
   *
   * @param array Array to take a copy from
   * @param startIndex Starting position
   * @return Copy of part of the array
   */
  private static String[] copyOfArray(String[] array, int startIndex) {
    if (array.length <= startIndex) {
      return new String[0];
    } else {
      return Arrays.copyOfRange(array, startIndex, array.length);
    }
  }
}