/*******************************************************************************
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.hadoop;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.bigdata.io.hadoop.BinCasWithTypeSystemWritable;
import org.dkpro.bigdata.io.hadoop.CASWritableSequenceFileWriter;
/**
* Base class for running UIMA Pipelines on the cluster, see also
* https://maggie/wiki/bin/view/DKPro/ExecutingDKProPipelinesOnHadoop
*
* @author zorn
*
*/
public abstract class DkproHadoopDriver
    extends Configured
    implements Tool, EngineFactory
{
    private Class<? extends DkproMapper> mapperClass = DkproMapper.class;
    private Class<? extends DkproReducer> reducerClass = DkproReducer.class;
    private JobConf job;

    /**
     * Input format used to read the job's input.
     *
     * @return the input format class, or {@code null} to fall back to the default
     *         ({@link SequenceFileInputFormat})
     */
    public abstract Class getInputFormatClass();

    /**
     * Get the mapper implementation
     */
    public Class<? extends DkproMapper> getMapperClass()
    {
        return this.mapperClass;
    }

    /**
     * Set a custom mapper implementation
     */
    public void setMapperClass(Class<? extends DkproMapper> mapperClass)
    {
        this.mapperClass = mapperClass;
    }

    /**
     * Get the reducer implementation
     */
    public Class<? extends DkproReducer> getReducerClass()
    {
        return this.reducerClass;
    }

    /**
     * Set a custom reducer implementation
     */
    public void setReducerClass(Class<? extends DkproReducer> reducerClass)
    {
        this.reducerClass = reducerClass;
    }

    /**
     * Implement this method to configure your job.
     *
     * @param job
     *            the job configuration; called after all defaults have been applied, so
     *            settings made here override them
     */
    @Override
    public abstract void configure(JobConf job);

    /**
     * Runs the UIMA pipeline.
     *
     * @return 0 if Hadoop job succeeded, 1 if job failed, 2 if it was killed, otherwise 3
     *
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args)
        throws Exception
    {
        if (args.length < 2) {
            System.out.println("Usage: " + this.getClass().getSimpleName()
                    + " [hadoop-params] input output [job-params]");
            System.exit(1);
        }
        this.job = new JobConf(getConf(), DkproHadoopDriver.class);
        final FileSystem fs = FileSystem.get(this.job);
        // record the factory class name so the mapper/reducer tasks can re-instantiate
        // the UIMA pipeline on the cluster nodes
        this.job.set("dkpro.uima.factory", this.getClass().getName());
        Path inputPath;
        if (args[0].contains(",")) {
            // multiple comma-separated input paths; the first one doubles as the target
            // directory when a collection reader imports data below
            final String[] inputPaths = args[0].split(",");
            inputPath = new Path(inputPaths[0]);
            for (final String path : inputPaths) {
                FileInputFormat.addInputPath(this.job, new Path(path));
            }
        }
        else {
            inputPath = new Path(args[0]); // input
            FileInputFormat.setInputPaths(this.job, inputPath);
        }
        String outDir = args[1];
        if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
            // do not clobber existing output; pick a fresh directory name instead
            outDir = getUniqueDirectoryName(outDir, fs);
        }
        final Path outputPath = new Path(outDir); // output
        final CollectionReader reader = buildCollectionReader();
        // if a collection reader was defined, import its data into HDFS first so the
        // MapReduce job can read it as a sequence file
        if (reader != null) {
            final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                    CASWritableSequenceFileWriter.class,
                    CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                    CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                    CASWritableSequenceFileWriter.PARAM_FS,
                    this.job.get("fs.default.name", "file:/"));
            runPipeline(reader, xcasWriter);
        }
        // clean up output of a previous run, otherwise the job fails on startup
        fs.delete(outputPath, true);
        FileOutputFormat.setOutputPath(this.job, outputPath);
        // compress by default, but let an explicit user setting win
        if (this.job.get("mapred.output.compress") == null) {
            this.job.setBoolean("mapred.output.compress", true);
        }
        // Just in case compression is on
        this.job.set("mapred.output.compression.type", "BLOCK");
        if (this.job.getBoolean("dkpro.output.writecas", true)) {
            if (this.job.getBoolean("dkpro.output.plaintext", false)) {
                this.job.setOutputFormat(TextOutputFormat.class);
            }
            else {
                this.job.setOutputFormat(SequenceFileOutputFormat.class);
            }
        }
        else {
            // CAS output suppressed entirely
            this.job.setOutputFormat(NullOutputFormat.class);
        }
        // setup some sensible defaults
        this.job.setMapperClass(this.mapperClass);
        this.job.setReducerClass(this.reducerClass);
        if (getInputFormatClass() != null) {
            this.job.setInputFormat(getInputFormatClass());
        }
        else {
            this.job.setInputFormat(SequenceFileInputFormat.class);
        }
        this.job.setMapOutputKeyClass(Text.class);
        this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setOutputKeyClass(Text.class);
        this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setJobName(this.getClass().getSimpleName());
        this.job.setNumReduceTasks(0);
        // subclass configuration runs last so it can override any default above
        configure(this.job);
        // create symlinks for distributed resources
        DistributedCache.createSymlink(this.job);
        final RunningJob runningJob = JobClient.runJob(this.job);
        runningJob.waitForCompletion();
        final int status = runningJob.getJobState();
        if (status == JobStatus.SUCCEEDED) {
            return 0;
        }
        else if (status == JobStatus.FAILED) {
            return 1;
        }
        else if (status == JobStatus.KILLED) {
            return 2;
        }
        else {
            return 3;
        }
    }

    /**
     * Appends an increasing numeric suffix to {@code dir} until a directory of that name does
     * not yet exist on the given file system.
     *
     * @param dir
     *            the requested directory name
     * @param fs
     *            the file system to probe
     * @return {@code dir} itself if unused, otherwise {@code dir} + first free suffix (2, 3, ...)
     * @throws IOException
     *             if the existence check fails
     */
    private String getUniqueDirectoryName(String dir, FileSystem fs)
        throws IllegalArgumentException, IOException
    {
        int outDirSuffix = 2;
        String uniqueDir = dir;
        while (fs.exists(new Path(uniqueDir))) {
            uniqueDir = dir + outDirSuffix;
            outDirSuffix++;
        }
        return uniqueDir;
    }

    /**
     * Register a data archive to be distributed to the distributed cache. The resource can then
     * be accessed from any UIMA component by specifying $name within the configuration.
     *
     * Archives that are bigger than 4 GB need to be .tar.gz, because Java6 zip implementation
     * does not support zip &gt; 4GB
     *
     * For External Resources, the ER has to be setup using job.getResource("name") in the
     * build*Engine method.
     *
     * @param name
     *            identifier for the archive
     * @param uri
     *            URI of the archive, can be file:/... or hdfs://...
     * @throws IllegalArgumentException
     *             if {@code uri} combined with {@code name} does not form a valid URI
     */
    public void registerDataArchive(String name, URI uri)
    {
        try {
            DistributedCache.addCacheArchive(new URI(uri.toString() + "#" + name), this.job);
            // maintain the comma-separated list of registered resources
            String resources = this.job.get("dkpro.resources", "");
            if (!resources.isEmpty()) {
                resources += ",";
            }
            resources += name;
            this.job.set("dkpro.resources", resources);
        }
        catch (final URISyntaxException e) {
            // silently ignoring this would only surface later as a missing resource on the
            // cluster — fail fast instead and preserve the cause
            throw new IllegalArgumentException(
                    "Cannot register data archive '" + name + "' from URI " + uri, e);
        }
    }

    /**
     * Overwrite this method if you need to import data using a UIMA collection reader
     *
     * @return a collection reader whose output is imported into HDFS before the job starts, or
     *         {@code null} (the default) if the input already resides on the cluster
     * @throws ResourceInitializationException
     *             if the reader cannot be created
     */
    public CollectionReader buildCollectionReader()
        throws ResourceInitializationException
    {
        return null;
    }

    /**
     * Default reducer engine: none. Overwrite to supply a UIMA pipeline for the reduce phase.
     *
     * @param job
     *            the job configuration
     * @return {@code null}; subclasses may return a reducer-side analysis engine description
     */
    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job)
        throws ResourceInitializationException
    {
        return null;
    }
}