/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.dkpro.bigdata.io.hadoop.CASWritable;
import org.dkpro.bigdata.io.hadoop.MultiLineText2CASInputFormat;
import org.dkpro.bigdata.io.hadoop.MultiLineText2CASInputFormat.DocumentTextExtractor;

/**
 * Annotates the output of SentenceExtractCompactJob with UIMA annotators defined in an
 * XML analysis engine descriptor stored in HDFS.
 *
 * @author Johannes Simon
 */
public class XMLDescriptorRunner extends DkproHadoopDriver {

    public static void main(String[] args) throws Exception {
        XMLDescriptorRunner job = new XMLDescriptorRunner();
        Configuration conf = new Configuration();
        // Uncomment to run against the local file system instead of HDFS:
        // conf.set("fs.default.name", "file:///");
        int res = ToolRunner.run(conf, job, args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println(
                    "Usage: XMLDescriptorRunner [hadoop-params] <input> <output> <xml-descriptor-file>");
            return 1;
        }

        String descriptorPath = args[2];
        Path p = new Path(descriptorPath);
        FileSystem fs = FileSystem.get(getConf());
        if (!fs.exists(p)) {
            System.out.println("Error: Specified XML descriptor file does not exist in HDFS: "
                    + descriptorPath);
            return 1;
        }

        // Pass the descriptor location to the mappers via the job configuration
        getConf().set("org.jobimtext.hadoop.uima.descriptor", descriptorPath);
        return super.run(args);
    }

    @Override
    public void configure(JobConf job) {
        MultiLineText2CASInputFormat.setDocumentTextExtractorClass(job,
                SimpleLineInputFormat.class);
    }

    @Override
    public AnalysisEngineDescription buildMapperEngine(Configuration conf)
            throws ResourceInitializationException {
        String descriptorPath = conf.get("org.jobimtext.hadoop.uima.descriptor");
        System.err.println("Loading Analysis Engine: " + descriptorPath);
        Path p = new Path(descriptorPath);
        try {
            // Read and parse the analysis engine descriptor directly from HDFS
            FileSystem fs = FileSystem.get(conf);
            XMLInputSource input = new XMLInputSource(fs.open(p), null);
            return UIMAFramework.getXMLParser().parseAnalysisEngineDescription(input);
        }
        catch (InvalidXMLException e) {
            throw new ResourceInitializationException(e);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job)
            throws ResourceInitializationException {
        // This job is map-only; no reducer engine is required.
        return null;
    }

    @Override
    public Class<? extends InputFormat<Text, CASWritable>> getInputFormatClass() {
        return MultiLineText2CASInputFormat.class;
    }

    /**
     * Uses each input line (the value) verbatim as the document text of the created CAS.
     */
    public static class SimpleLineInputFormat implements DocumentTextExtractor {
        @Override
        public Text extractDocumentText(Text key, Text value) {
            return value;
        }
    }
}
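
/*
 * Example invocation (a sketch with hypothetical jar and path names; assumes the
 * descriptor XML has already been copied into HDFS, e.g. with "hdfs dfs -put"):
 *
 *   hadoop jar dkpro-bigdata-examples.jar org.dkpro.bigdata.hadoop.XMLDescriptorRunner \
 *       input/sentences output/annotated descriptors/MyEngine.xml
 *
 * The third argument must point to the descriptor inside HDFS: run() checks its
 * existence via FileSystem.exists(), and buildMapperEngine() reads it from there
 * when the mappers are initialized.
 */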