/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.dkpro.bigdata.io.hadoop.CASWritable;

/**
 * Base class for counting features (n-grams, cooccurrences, etc.) from serialized CAS instances.
 *
 * <p>
 * The job reads {@code Text -> CASWritable} records through {@link SequenceFileInputFormat}, lets
 * a {@link CountableFeatureExtractor} turn every CAS into a collection of features, sums the
 * occurrences of each feature and writes the {@code feature -> count} pairs through
 * {@link TextOutputFormat}.
 * </p>
 *
 * @author Johannes Simon
 */
public abstract class FeatureCountHadoopDriver extends Configured implements Tool {

    /** Job property that carries the class name of the feature extractor to the map tasks. */
    private static final String EXTRACTOR_CLASS_KEY = "dkpro.uima.countablefeatureextractor";

    /** Default number of map tasks; this is a sensible default for the UKP cluster. */
    private static final int DEFAULT_NUM_MAP_TASKS = 76;

    /** Default number of reduce tasks; this is a sensible default for the UKP cluster. */
    private static final int DEFAULT_NUM_REDUCE_TASKS = 76;

    /** Default value written to the map and reduce memory properties. */
    private static final int DEFAULT_TASK_MEMORY_MB = 1280;

    /**
     * Used by FeatureCountHadoopDriver to map each CAS to a set of features, e.g. its n-grams or
     * cooccurrences.
     *
     * <p>
     * Implementations are created reflectively via {@code Class.newInstance()} inside the map
     * task, so they need an accessible no-argument constructor.
     * </p>
     */
    public interface CountableFeatureExtractor {

        /**
         * Called once per map task, right after instantiation and before any call to
         * {@link #extract(JCas)}.
         *
         * @param job
         *            the configuration of the running job
         */
        public void configure(JobConf job);

        /**
         * Extracts the features of one document. Every element of the returned collection is
         * counted once, so a feature that occurs several times has to be returned several times.
         *
         * @param aJCas
         *            the document to extract features from
         * @return the extracted features; the mapper iterates over the result directly, so it
         *         must not be {@code null}
         */
        public Collection<Text> extract(JCas aJCas);
    }

    /**
     * Maps CAS instances to a set of features extracted using a custom CountableFeatureExtractor
     * implementation. Each extracted feature is emitted with a count of one.
     */
    private static class CountableFeatureMapper extends MapReduceBase implements
            Mapper<Text, CASWritable, Text, LongWritable> {

        /**
         * Shared count emitted for every feature. The mapper only ever reads this instance, so a
         * single object replaces one allocation per emitted record.
         */
        private static final LongWritable ONE = new LongWritable(1);

        private CountableFeatureExtractor featureExtractor;

        /**
         * Instantiates and configures the feature extractor named by the job configuration.
         *
         * @param job
         *            the configuration of the running job
         * @throws IllegalStateException
         *             if the job does not name a feature extractor class
         * @throws RuntimeException
         *             if the named class cannot be loaded or instantiated
         */
        @Override
        public void configure(JobConf job) {
            final String extractorClassName = job.get(EXTRACTOR_CLASS_KEY);
            if (extractorClassName == null) {
                // Fail here with a clear message instead of with a NullPointerException on the
                // first record passed to map().
                throw new IllegalStateException("Job property '" + EXTRACTOR_CLASS_KEY
                        + "' is not set; no CountableFeatureExtractor can be created");
            }
            try {
                featureExtractor = (CountableFeatureExtractor) Class.forName(extractorClassName)
                        .newInstance();
                featureExtractor.configure(job);
            }
            catch (InstantiationException e) {
                throw instantiationFailure(extractorClassName, e);
            }
            catch (IllegalAccessException e) {
                throw instantiationFailure(extractorClassName, e);
            }
            catch (ClassNotFoundException e) {
                throw instantiationFailure(extractorClassName, e);
            }
        }

        /** Wraps a reflection failure, keeping the cause and naming the offending class. */
        private static RuntimeException instantiationFailure(String className, Exception cause) {
            return new RuntimeException("Cannot instantiate CountableFeatureExtractor '"
                    + className + "'", cause);
        }

        /**
         * Emits every feature of the given CAS with a count of one.
         *
         * <p>
         * A {@link CASException} does not fail the task: the document is skipped and a counter
         * in group {@code "uima"}, named after the exception, is incremented instead.
         * </p>
         *
         * @param key
         *            the record key (not used)
         * @param value
         *            the serialized CAS
         * @param output
         *            receives one {@code (feature, 1)} pair per extracted feature
         * @param reporter
         *            used to count documents that could not be processed
         * @throws IOException
         *             if the output collector fails
         */
        @Override
        public void map(Text key, CASWritable value, OutputCollector<Text, LongWritable> output,
                Reporter reporter) throws IOException {
            final CAS cas = value.getCAS();
            try {
                for (Text feature : featureExtractor.extract(cas.getJCas())) {
                    output.collect(feature, ONE);
                }
            }
            catch (CASException e) {
                reporter.incrCounter("uima", e.toString(), 1);
            }
        }
    }

    /**
     * Reduces all occurrences of one feature to its frequency. Because it only sums its input
     * values, the same class is also registered as the combiner.
     */
    private static class CountableFeatureReducer extends MapReduceBase implements
            Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * Sums all counts of one feature and emits the total.
         *
         * @param key
         *            the feature
         * @param values
         *            the partial counts of the feature
         * @param collector
         *            receives the single {@code (feature, total)} pair
         * @param reporter
         *            not used
         * @throws IOException
         *             if the output collector fails
         */
        @Override
        public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long total = 0;
            while (values.hasNext()) {
                total += values.next().get();
            }
            collector.collect(key, new LongWritable(total));
        }
    }

    /** Configuration of the most recently started job; assigned in {@link #run(String[])}. */
    private JobConf job;

    /**
     * Returns the extractor implementation to use. Its class name is stored in the job
     * configuration and the class is instantiated reflectively in every map task.
     *
     * @return the feature extractor class
     */
    public abstract Class<? extends CountableFeatureExtractor> getCountableFeatureExtractorClass();

    /**
     * Implement this method to configure your job. It is called after all defaults have been
     * applied, so any of them can be overridden here.
     *
     * @param job
     *            the job configuration to adjust
     */
    public abstract void configure(JobConf job);

    /**
     * Sets up the feature counting job, applies the subclass configuration and runs the job,
     * blocking until it has finished.
     *
     * @param args
     *            {@code args[0]} is the input path and {@code args[1]} the output path; further
     *            arguments are ignored by this method
     * @return {@code 0} if the job ran, {@code 1} if fewer than two arguments were given. The
     *         usage error is reported through the return value rather than by terminating the
     *         JVM, so the caller decides the process exit status.
     * @throws Exception
     *             if setting up or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("Usage: " + this.getClass().getSimpleName()
                    + " [hadoop-params] input output [job-params]");
            return 1;
        }

        // NOTE(review): the job jar is looked up through DkproHadoopDriver; confirm this also
        // works for subclasses that are packaged in a different jar.
        this.job = new JobConf(getConf(), DkproHadoopDriver.class);

        // tell the map tasks which extractor to instantiate
        this.job.set(EXTRACTOR_CLASS_KEY, getCountableFeatureExtractorClass().getName());

        final Path inputPath = new Path(args[0]);
        final Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(this.job, inputPath);
        FileOutputFormat.setOutputPath(this.job, outputPath);

        // setup some sensible defaults
        this.job.setMapperClass(CountableFeatureMapper.class);
        this.job.setCombinerClass(CountableFeatureReducer.class);
        this.job.setReducerClass(CountableFeatureReducer.class);
        this.job.setInputFormat(SequenceFileInputFormat.class);
        this.job.setOutputFormat(TextOutputFormat.class);
        this.job.setMapOutputKeyClass(Text.class);
        this.job.setMapOutputValueClass(LongWritable.class);
        this.job.setOutputKeyClass(Text.class);
        this.job.setOutputValueClass(LongWritable.class);
        this.job.setJobName(this.getClass().getSimpleName());
        this.job.setInt("mapred.job.map.memory.mb", DEFAULT_TASK_MEMORY_MB);
        this.job.setInt("mapred.job.reduce.memory.mb", DEFAULT_TASK_MEMORY_MB);
        this.job.setNumMapTasks(DEFAULT_NUM_MAP_TASKS);
        this.job.setNumReduceTasks(DEFAULT_NUM_REDUCE_TASKS);

        // subclass settings come last so that they win over the defaults above
        configure(this.job);

        // create symlinks for distributed resources
        DistributedCache.createSymlink(this.job);

        JobClient.runJob(this.job);
        return 0;
    }
}