FeatureCountHadoopDriver.java example

Explorer

dkpro-bigdata-master
- dkpro-bigdata-collocations
  - src
    - main
      - java
        org
        dkpro
        bigdata
        collocations
        AssocReducer.java
        AssociationMetrics.java
        CollocCombiner.java
        CollocDriver.java
        CollocMapper.java
        CollocReducer.java
        Gram.java
        GramKey.java
        GramKeyGroupComparator.java
        GramKeyPartitioner.java
- dkpro-bigdata-examples
  - src
    - main
      - java
        org
        dkpro
        bigdata
        examples
        CasConsumerExample.java
        ExternalDataExample.java
        Text2CASExample.java
        UimaPipelineOnHadoop.java
- dkpro-bigdata-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        hadoop
        AnalysisEngineUtil.java
        DkproHadoopDriver.java
        DkproMapper.java
        DkproReducer.java
        EngineFactory.java
        FeatureCountHadoopDriver.java
        UIMAMapReduceBase.java
        XMLDescriptorRunner.java
    - test
      - java
        org
        dkpro
        bigdata
        hadoop
        CasConsumerOutputTest.java
        DkproHadoopDriverTest.java
- dkpro-bigdata-io-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormat.java
        BinCasWithTypeSystemWritable.java
        BinCasWritable.java
        CASWritable.java
        CASWritableSequenceFileWriter.java
        CollectionReaderWrapper.java
        CrawlerRecord.java
        DummyEncodingDetector.java
        EncodingDetector.java
        FormatConverterMapper.java
        GenericKeyValueLineRecordReader.java
        GenericMultiLineRecordReader.java
        HdfsResourceLoaderLocator.java
        LeipzigInputFormat.java
        MultiLineText2CASInputFormat.java
        Text2CASInputFormat.java
        WARCInputFormat.java
        XCASSequenceFileWriter.java
        XmiSequenceFileWriter.java
    - test
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormatTest.java
        BinCasWithTypeSystemWritableTest.java
        BinCasWritableTest.java
        CASWritableTest.java
        HdfsResourceLoaderLocatorTest.java
        InputFormatTest.java
        LeipzigInputFormatTest.java
        WARCInputFormatTest.java

/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.dkpro.bigdata.io.hadoop.CASWritable;

/**
 * Base class for counting features (n-grams, cooccurrences, etc.) from serialized CAS instances.
 * <p>
 * Subclasses name a {@link CountableFeatureExtractor} implementation and may adjust the job in
 * {@link #configure(JobConf)}. The driver reads a sequence file of {@code Text}/{@code CASWritable}
 * pairs, emits every extracted feature with a count of one, and sums the counts per feature into
 * a text output.
 * 
 * @author Johannes Simon
 * 
 */
public abstract class FeatureCountHadoopDriver
    extends Configured
    implements Tool
{

    /**
     * Job property holding the fully qualified class name of the
     * {@link CountableFeatureExtractor}; written by the driver and read by the mapper.
     */
    private static final String EXTRACTOR_CLASS_KEY = "dkpro.uima.countablefeatureextractor";

    /** Default number of map and reduce tasks; a sensible default for the UKP cluster. */
    private static final int DEFAULT_NUM_TASKS = 76;

    /** Default value for the per-task memory properties set in {@link #run(String[])}. */
    private static final int DEFAULT_TASK_MEMORY_MB = 1280;

    /**
     * Used by FeatureCountHadoopDriver to map each CAS to a set of features, e.g. its n-grams or
     * cooccurrences.
     */
    public interface CountableFeatureExtractor
    {
        /**
         * Called once by the mapper, right after the extractor has been instantiated.
         * 
         * @param job
         *            the configuration of the running job
         */
        public void configure(JobConf job);

        /**
         * Extracts the features of one document. Every element of the returned collection is
         * counted once. The mapper iterates the result directly, so implementations should
         * return an empty collection rather than {@code null}.
         * 
         * @param aJCas
         *            the document to extract features from
         * @return the features found in the document
         */
        public Collection<Text> extract(JCas aJCas);
    }

    /**
     * Maps CAS instances to a set of features extracted using a custom CountableFeatureExtractor
     * implementation.
     */
    private static class CountableFeatureMapper
        extends MapReduceBase
        implements Mapper<Text, CASWritable, Text, LongWritable>
    {
        /** Count emitted for every single feature occurrence. */
        private static final LongWritable ONE = new LongWritable(1);

        private CountableFeatureExtractor featureExtractor;

        /**
         * Instantiates and configures the feature extractor named in the job configuration.
         * 
         * @param job
         *            the configuration of the running job
         * @throws IllegalStateException
         *             if the job does not name a feature extractor class
         * @throws RuntimeException
         *             if the named class cannot be loaded or instantiated
         */
        @Override
        public void configure(JobConf job)
        {
            String featureExtractorClass = job.get(EXTRACTOR_CLASS_KEY);
            if (featureExtractorClass == null) {
                // Fail here with a clear message instead of with a NullPointerException on the
                // first call to map().
                throw new IllegalStateException("Job property '" + EXTRACTOR_CLASS_KEY
                        + "' is not set, cannot create a feature extractor");
            }
            try {
                featureExtractor = (CountableFeatureExtractor) Class.forName(
                        featureExtractorClass).newInstance();
            }
            catch (InstantiationException e) {
                throw new RuntimeException("Cannot instantiate feature extractor "
                        + featureExtractorClass, e);
            }
            catch (IllegalAccessException e) {
                throw new RuntimeException("Cannot access feature extractor "
                        + featureExtractorClass, e);
            }
            catch (ClassNotFoundException e) {
                throw new RuntimeException("Cannot find feature extractor "
                        + featureExtractorClass, e);
            }
            featureExtractor.configure(job);
        }

        /**
         * Emits every feature extracted from the given CAS with a count of one. A CAS whose JCas
         * view cannot be obtained is skipped and recorded in the "uima" counter group.
         */
        @Override
        public void map(Text key, CASWritable value, OutputCollector<Text, LongWritable> output,
                Reporter reporter)
            throws IOException
        {
            final CAS aCAS = value.getCAS();
            try {
                Collection<Text> features = featureExtractor.extract(aCAS.getJCas());
                for (Text feature : features) {
                    output.collect(feature, ONE);
                }
            }
            catch (CASException e) {
                reporter.incrCounter("uima", e.toString(), 1);
            }
        }
    }

    /**
     * Reduces all occurrences of one feature to its frequency. Also used as combiner, which is
     * possible because it sums the incoming values rather than counting them.
     */
    private static class CountableFeatureReducer
        extends MapReduceBase
        implements Reducer<Text, LongWritable, Text, LongWritable>
    {
        /**
         * Sums all partial counts of {@code key} and emits the total.
         */
        @Override
        public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
            throws IOException
        {
            long count = 0;
            while (values.hasNext()) {
                count += values.next().get();
            }
            collector.collect(key, new LongWritable(count));
        }
    }

    /**
     * @return the extractor implementation that the mappers instantiate; it is created via its
     *         no-argument constructor
     */
    public abstract Class<? extends CountableFeatureExtractor> getCountableFeatureExtractorClass();

    /**
     * Implement this method to configure your job. It is called after all defaults have been
     * set, so anything set here overrides them.
     * 
     * @param job
     *            the job about to be submitted
     */
    public abstract void configure(JobConf job);

    /**
     * Configures and runs the counting job, blocking until it has finished.
     * 
     * @param args
     *            {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 after the job has run, 1 if fewer than two arguments were given
     * @throws Exception
     *             if submitting or running the job fails
     */
    @Override
    public int run(String[] args)
        throws Exception
    {
        if (args.length < 2) {
            System.err.println("Usage: " + this.getClass().getSimpleName()
                    + " [hadoop-params] input output [job-params]");
            // Report the error through the return value; terminating the JVM is up to the
            // code that launched this Tool.
            return 1;
        }

        // NOTE(review): the job is created from DkproHadoopDriver rather than from this class;
        // kept as is - confirm that this is intended.
        final JobConf job = new JobConf(getConf(), DkproHadoopDriver.class);

        // tell the mappers which extractor to instantiate
        job.set(EXTRACTOR_CLASS_KEY, getCountableFeatureExtractorClass().getName());

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // setup some sensible defaults
        job.setMapperClass(CountableFeatureMapper.class);
        job.setCombinerClass(CountableFeatureReducer.class);
        job.setReducerClass(CountableFeatureReducer.class);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setJobName(this.getClass().getSimpleName());
        job.setInt("mapred.job.map.memory.mb", DEFAULT_TASK_MEMORY_MB);
        job.setInt("mapred.job.reduce.memory.mb", DEFAULT_TASK_MEMORY_MB);
        job.setNumMapTasks(DEFAULT_NUM_TASKS);
        job.setNumReduceTasks(DEFAULT_NUM_TASKS);

        // give the subclass the last word on the configuration
        configure(job);

        // create symlinks for distributed resources
        DistributedCache.createSymlink(job);

        JobClient.runJob(job);

        return 0;
    }

}