/*******************************************************************************
* Copyright 2012-13
* TU Darmstadt, UKP Lab and FG Sprachtechnologie
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.hadoop;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.ProcessTrace;
import org.apache.uima.util.ProcessTraceEvent;
import org.dkpro.bigdata.io.hadoop.CASWritable;
/**
 * A mapper for building pipelines with M/R. The engine is <em>not</em> supposed
 * to be a CasConsumer; the resulting CAS is written to HDFS and can be used as
 * input to another Mapper process.
 *
 * @author zorn
 */
public class DkproMapper extends UIMAMapReduceBase implements
        Mapper<Text, CASWritable, Text, CASWritable> {

    /** Input formats a pipeline may consume. */
    public enum INPUT_FORMAT {
        CAS, TEXT, WEBARCHIVE
    }

    /** Randomness source used for corpus sampling in {@link #map}. */
    private final Random random;

    /**
     * Language code forced onto every CAS before processing, taken from the
     * "dkpro.document.language" job setting; null leaves the CAS language
     * unchanged.
     */
    private String docLanguage;

    public DkproMapper() {
        super();
        this.random = new Random();
    }

    /**
     * Runs the configured UIMA engine on a single CAS and emits the processed
     * CAS, subject to sampling and the "dkpro.output.writecas" switch.
     *
     * @param key      input key, passed to {@link #getOutputKey(Text, CAS)}
     * @param value    writable carrying the CAS to process
     * @param output   collector receiving (key, processed CAS) pairs
     * @param reporter used for per-event and document-size counters
     * @throws IOException when the engine has failed on more than maxFailures
     *                     documents (the UIMA exception is set as the cause)
     */
    @Override
    public void map(Text key, CASWritable value,
            OutputCollector<Text, CASWritable> output, Reporter reporter)
            throws IOException {
        final CAS aCAS = value.getCAS();
        /*
         * SAMPLING: Process and emit only a sample of the corpus.
         * (The field name "samplingPropability" is inherited from the base
         * class; presumably a percentage in [0, 100] — 100 disables sampling.)
         */
        if (samplingPropability != 100) {
            if (random.nextInt(100) >= samplingPropability) {
                reporter.incrCounter("uima", "sampling: SKIPPED", 1);
                return;
            }
        }
        reporter.incrCounter("uima", "sampling: NOT SKIPPED", 1);
        try {
            if (docLanguage != null) {
                aCAS.setDocumentLanguage(docLanguage);
            }
            // let uima process the cas
            final ProcessTrace result = this.engine.process(aCAS);
            for (final ProcessTraceEvent event : result.getEvents()) {
                reporter.incrCounter("uima", "map event " + event.getType(), 1);
            }
            final Text outkey = getOutputKey(key, aCAS);
            // update counters (hoist the document text; it may be null for
            // CASes without a set text)
            final String documentText = aCAS.getDocumentText();
            if (documentText != null) {
                reporter.incrCounter("uima", "overall doc size",
                        documentText.length());
            }
            // emitting the CAS can be disabled, e.g. when the engine itself
            // writes its results elsewhere
            if (this.job.getBoolean("dkpro.output.writecas", true)) {
                outValue.setCAS(aCAS);
                output.collect(outkey, outValue);
            }
        } catch (final AnalysisEngineProcessException e) {
            // tolerate up to maxFailures failing documents before aborting
            // the whole task; keep the UIMA exception as the cause
            reporter.incrCounter("uima", e.toString(), 1);
            if (failures++ > maxFailures) {
                throw new IOException(e);
            }
        }
    }

    /**
     * Overwrite this method to generate keys for the map-outputs. With the
     * default implementation all CASes will be passed through a single
     * reducer, which disables parallelization but has the advantage of one
     * single output file for the whole collection.
     *
     * @param key  the incoming map key
     * @param aCAS the processed CAS
     * @return the key under which the CAS is emitted (default: the input key)
     */
    protected Text getOutputKey(Text key, CAS aCAS) {
        return key;
    }

    /**
     * Supplies the mapper-side engine description to the base class.
     */
    @Override
    AnalysisEngineDescription getEngineDescription(EngineFactory factory,
            JobConf job) throws ResourceInitializationException {
        return factory.buildMapperEngine(job);
    }

    /**
     * Reads job configuration: instantiates the output writable of the
     * configured map-output value class and picks up the optional document
     * language.
     *
     * @throws RuntimeException if the writable cannot be instantiated
     */
    @Override
    public void configure(JobConf job) {
        super.configure(job);
        try {
            // create an output writable of the appropriate type;
            // getDeclaredConstructor().newInstance() replaces the deprecated
            // Class.newInstance() (requires a public no-arg constructor, as
            // Writable implementations must have anyway)
            outValue = (CASWritable) job.getMapOutputValueClass()
                    .getDeclaredConstructor().newInstance();
            docLanguage = job.get("dkpro.document.language");
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}