/*******************************************************************************
 * Copyright 2012
 * TU Darmstadt, UKP Lab
 * with FG Sprachtechnologie
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.ProcessTrace;
import org.apache.uima.util.ProcessTraceEvent;
import org.dkpro.bigdata.io.hadoop.CASWritable;

/**
 * This class expects a UIMA consumer as its engine; it will not collect the output itself but
 * will copy everything from the local directory to HDFS after completion. Useful e.g. for a
 * LuceneIndexWriter.
 *
 * @author zorn
 */
public class DkproReducer
    extends UIMAMapReduceBase
    implements Reducer<Writable, CASWritable, Writable, CASWritable>
{
    private final Log logger = LogFactory.getLog("DkproReducer");

    @Override
    public void reduce(Writable key, Iterator<CASWritable> values,
            OutputCollector<Writable, CASWritable> output, Reporter reporter)
        throws IOException
    {
        while (values.hasNext()) {
            final CAS aCAS = values.next().getCAS();
            try {
                // let the UIMA engine process the CAS
                final ProcessTrace result = this.engine.process(aCAS);
                // record one counter increment per trace event type
                for (final ProcessTraceEvent event : result.getEvents()) {
                    reporter.incrCounter("uima", "reduce event " + event.getType(), 1);
                }
                outValue.setCAS(aCAS);
                // track the accumulated document text size across all processed CASes
                reporter.incrCounter("uima", "overall doc size",
                        outValue.getCAS().getDocumentText().length());
                output.collect(key, outValue);
            }
            catch (final AnalysisEngineProcessException e) {
                reporter.incrCounter("uima", e.toString(), 1);
                // tolerate up to maxFailures failed CASes before aborting the task
                if (failures++ > maxFailures) {
                    throw new IOException(e);
                }
            }
        }
    }

    @Override
    AnalysisEngineDescription getEngineDescription(EngineFactory factory, JobConf job)
        throws ResourceInitializationException
    {
        return factory.buildReducerEngine(job);
    }

    @Override
    public void configure(JobConf job)
    {
        super.configure(job);
        try {
            // create an output writable of the appropriate type
            outValue = (CASWritable) job.getOutputValueClass().newInstance();
        }
        catch (final Exception e) {
            throw new RuntimeException(e);
        }
    }
}
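
/*
 * A minimal driver sketch (illustrative only, not part of the original class): it shows how a
 * JobConf could wire DkproReducer together with the CASWritable output value class that
 * configure() instantiates via job.getOutputValueClass(). Everything apart from DkproReducer and
 * CASWritable (job name, paths, mapper, choice of output value class) is an assumption, and any
 * DKPro-specific engine configuration read by UIMAMapReduceBase is omitted here.
 */
class DkproReducerDriverSketch
{
    public static void main(String[] args) throws IOException
    {
        final JobConf job = new JobConf(DkproReducerDriverSketch.class);
        job.setJobName("dkpro-uima-reduce-sketch"); // hypothetical job name

        // the reducer instantiates job.getOutputValueClass() as its CASWritable in configure()
        job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
        job.setOutputValueClass(CASWritable.class); // or a concrete CASWritable subclass

        // a mapper emitting <Writable, CASWritable> pairs is assumed to exist
        // job.setMapperClass(SomeDkproMapper.class);
        job.setReducerClass(DkproReducer.class);

        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(job,
                new org.apache.hadoop.fs.Path(args[0]));
        org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job,
                new org.apache.hadoop.fs.Path(args[1]));

        org.apache.hadoop.mapred.JobClient.runJob(job);
    }
}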