/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.bigdata.io.hadoop.BinCasWithTypeSystemWritable;
import org.dkpro.bigdata.io.hadoop.CASWritableSequenceFileWriter;

/**
 * Base class for running UIMA pipelines on a Hadoop cluster; see also
 * https://maggie/wiki/bin/view/DKPro/ExecutingDKProPipelinesOnHadoop
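 * <p>
 * A minimal usage sketch. {@code MyPipeline} and {@code MyAnnotator} are illustrative names,
 * not part of this API, and the sketch assumes that {@code EngineFactory} also declares a
 * {@code buildMapperEngine(Configuration)} method, mirroring the {@code buildReducerEngine}
 * override at the bottom of this class:
 *
 * <pre>{@code
 * public class MyPipeline
 *     extends DkproHadoopDriver
 * {
 *     public Class<? extends InputFormat> getInputFormatClass() {
 *         return null; // fall back to SequenceFileInputFormat
 *     }
 *
 *     public void configure(JobConf job) {
 *         // job-specific settings, e.g. registerDataArchive(...)
 *     }
 *
 *     public AnalysisEngineDescription buildMapperEngine(Configuration job)
 *         throws ResourceInitializationException
 *     {
 *         // MyAnnotator stands in for a real UIMA component
 *         return AnalysisEngineFactory.createEngineDescription(MyAnnotator.class);
 *     }
 *
 *     public static void main(String[] args) throws Exception {
 *         System.exit(ToolRunner.run(new Configuration(), new MyPipeline(), args));
 *     }
 * }
 * }</pre>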
 *
 * @author zorn
 */
public abstract class DkproHadoopDriver
    extends Configured
    implements Tool, EngineFactory
{
    private Class<? extends DkproMapper> mapperClass = DkproMapper.class;
    private Class<? extends DkproReducer> reducerClass = DkproReducer.class;

    private JobConf job;

    /**
     * Get the input format class; return null to fall back to SequenceFileInputFormat.
     */
    public abstract Class<? extends InputFormat> getInputFormatClass();

    /**
     * Get the mapper implementation.
     */
    public Class<? extends DkproMapper> getMapperClass()
    {
        return this.mapperClass;
    }

    /**
     * Set a custom mapper implementation.
     */
    public void setMapperClass(Class<? extends DkproMapper> mapperClass)
    {
        this.mapperClass = mapperClass;
    }

    /**
     * Get the reducer implementation.
     */
    public Class<? extends DkproReducer> getReducerClass()
    {
        return this.reducerClass;
    }

    /**
     * Set a custom reducer implementation.
     */
    public void setReducerClass(Class<? extends DkproReducer> reducerClass)
    {
        this.reducerClass = reducerClass;
    }

    /**
     * Implement this method to configure your job.
     *
     * @param job
     *            the job configuration to customize
     */
    @Override
    public abstract void configure(JobConf job);

    /**
     * Runs the UIMA pipeline.
     *
     * @return 0 if the Hadoop job succeeded, 1 if it failed, 2 if it was killed, otherwise 3
     *
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args)
        throws Exception
    {
        if (args.length < 2) {
            System.out.println("Usage: " + this.getClass().getSimpleName()
                    + " [hadoop-params] input output [job-params]");
            System.exit(1);
        }

        this.job = new JobConf(getConf(), DkproHadoopDriver.class);
        final FileSystem fs = FileSystem.get(this.job);

        // set the factory class name
        this.job.set("dkpro.uima.factory", this.getClass().getName());

        Path inputPath;
        if (args[0].contains(",")) {
            // comma-separated list of input paths
            String[] inputPaths = args[0].split(",");
            inputPath = new Path(inputPaths[0]);
            for (String path : inputPaths) {
                FileInputFormat.addInputPath(this.job, new Path(path));
            }
        }
        else {
            inputPath = new Path(args[0]); // input
            FileInputFormat.setInputPaths(this.job, inputPath);
        }

        String outDir = args[1];
        if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
            outDir = getUniqueDirectoryName(outDir, fs);
        }
        final Path outputPath = new Path(outDir); // output

        final CollectionReader reader = buildCollectionReader();
        // if a collection reader was defined, import data into HDFS
        // try {
        //     final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
        //     FileOutputFormat.setOutputCompressorClass(this.job,
        //             (Class<? extends CompressionCodec>) c);
        // }
        // catch (final Exception e) {
        //
        // }
        if (reader != null) {
            final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                    CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                    CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                    CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                    CASWritableSequenceFileWriter.PARAM_FS,
                    this.job.get("fs.default.name", "file:/"));
            runPipeline(reader, xcasWriter);
        }

        // clean up previous output
        fs.delete(outputPath, true);

        // this is a sensible default for the UKP cluster
        // int numMappers = 256;
        // if (args.length > 2) {
        //     numMappers = Integer.parseInt(args[2]);
        // }

        FileOutputFormat.setOutputPath(this.job, outputPath);
        // SequenceFileOutputFormat.setCompressOutput(this.job, true);

        if (this.job.get("mapred.output.compress") == null) {
            this.job.setBoolean("mapred.output.compress", true);
        }
        // just in case compression is on
        this.job.set("mapred.output.compression.type", "BLOCK");

        if (this.job.getBoolean("dkpro.output.writecas", true)) {
            if (this.job.getBoolean("dkpro.output.plaintext", false)) {
                this.job.setOutputFormat(TextOutputFormat.class);
            }
            else {
                this.job.setOutputFormat(SequenceFileOutputFormat.class);
            }
        }
        else {
            this.job.setOutputFormat(NullOutputFormat.class);
        }
        // this.job.set("mapred.output.compression.codec",
        //         "org.apache.hadoop.io.compress.GzipCodec"); // use compression

        // set up some sensible defaults
        this.job.setMapperClass(this.mapperClass);
        this.job.setReducerClass(this.reducerClass);
        if (getInputFormatClass() != null) {
            this.job.setInputFormat(getInputFormatClass());
        }
        else {
            this.job.setInputFormat(SequenceFileInputFormat.class);
        }
        // this.job.setOutputFormat(TextOutputFormat.class);
        this.job.setMapOutputKeyClass(Text.class);
        this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setOutputKeyClass(Text.class);
        this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setJobName(this.getClass().getSimpleName());
        // this.job.set("mapred.child.java.opts", "-Xmx1g");
        // this.job.setInt("mapred.job.map.memory.mb", 1280);
        // this.job.setInt("mapred.job.reduce.memory.mb", 1280);
        // this.job.setNumMapTasks(numMappers);
        this.job.setNumReduceTasks(0);
        configure(this.job);

        // create symlinks for distributed resources
        DistributedCache.createSymlink(this.job);
        // sLogger.info("Running job " + job.getJobName());
        RunningJob runningJob = JobClient.runJob(this.job);
        runningJob.waitForCompletion();
        int status = runningJob.getJobState();
        if (status == JobStatus.SUCCEEDED) {
            return 0;
        }
        else if (status == JobStatus.FAILED) {
            return 1;
        }
        else if (status == JobStatus.KILLED) {
            return 2;
        }
        else {
            return 3;
        }
    }

    // appends a numeric suffix to the directory name until it no longer exists in HDFS
    private String getUniqueDirectoryName(String dir, FileSystem fs)
        throws IllegalArgumentException, IOException
    {
        int outDirSuffix = 2;
        String uniqueDir = dir;
        while (fs.exists(new Path(uniqueDir))) {
            uniqueDir = dir + outDirSuffix;
            outDirSuffix++;
        }
        return uniqueDir;
    }

    /**
     * Register a data archive to be distributed via the distributed cache. The resource can then
     * be accessed from any UIMA component by specifying $name within the configuration.
     *
     * Archives bigger than 4 GB need to be .tar.gz, because the Java 6 zip implementation does
     * not support zip archives larger than 4 GB.
     *
     * For external resources, the resource has to be set up using job.getResource("name") in the
     * build*Engine method.
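     *
     * A minimal sketch of a call from within configure(JobConf); the archive name and location
     * below are illustrative, not part of this API:
     *
     * <pre>{@code
     * // "models" becomes a symlink in the working directory of each task
     * registerDataArchive("models", URI.create("hdfs:///resources/models.tar.gz"));
     * }</pre>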
     *
     * @param name
     *            identifier for the archive
     * @param uri
     *            URI of the archive, can be file:/... or hdfs://...
     */
    public void registerDataArchive(String name, URI uri)
    {
        try {
            DistributedCache.addCacheArchive(new URI(uri.toString() + "#" + name), this.job);
            String resources = this.job.get("dkpro.resources", "");
            if (!resources.isEmpty()) {
                resources += ",";
            }
            resources += name;
            this.job.set("dkpro.resources", resources);
        }
        catch (final URISyntaxException e) {
            // the fragment-augmented URI could not be parsed; fail loudly rather than
            // silently skipping the archive
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Overwrite this method if you need to import data using a UIMA collection reader.
     *
     * @return a collection reader for the import step, or null if no import is required
     * @throws ResourceInitializationException
     *             if the reader could not be created
     */
    public CollectionReader buildCollectionReader()
        throws ResourceInitializationException
    {
        return null;
    }

    /**
     * Overwrite this method to provide an analysis engine for the reduce phase; by default,
     * none is configured.
     */
    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job)
        throws ResourceInitializationException
    {
        return null;
    }
}