/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.hadoop;

import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.bigdata.io.hadoop.BinCasWithTypeSystemWritable;
import org.dkpro.bigdata.io.hadoop.CASWritableSequenceFileWriter;

/**
 * Base class for running UIMA pipelines on a Hadoop cluster; see also
 * https://maggie/wiki/bin/view/DKPro/ExecutingDKProPipelinesOnHadoop
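 * <p>
 * A minimal usage sketch. {@code MyPipeline} and {@code MyAnnotator} are illustrative names,
 * not part of this API, and the sketch assumes that {@code EngineFactory} also declares a
 * {@code buildMapperEngine(Configuration)} method, mirroring the {@code buildReducerEngine}
 * override at the bottom of this class:
 *
 * <pre>{@code
 * public class MyPipeline
 *     extends DkproHadoopDriver
 * {
 *     public Class<? extends InputFormat> getInputFormatClass() {
 *         return null; // fall back to SequenceFileInputFormat
 *     }
 *
 *     public void configure(JobConf job) {
 *         // job-specific settings, e.g. registerDataArchive(...)
 *     }
 *
 *     public AnalysisEngineDescription buildMapperEngine(Configuration job)
 *         throws ResourceInitializationException
 *     {
 *         // MyAnnotator stands in for a real UIMA component
 *         return AnalysisEngineFactory.createEngineDescription(MyAnnotator.class);
 *     }
 *
 *     public static void main(String[] args) throws Exception {
 *         System.exit(ToolRunner.run(new Configuration(), new MyPipeline(), args));
 *     }
 * }
 * }</pre>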
 *
 * @author zorn
 */
public abstract class DkproHadoopDriver
    extends Configured
    implements Tool, EngineFactory
{
    private Class<? extends DkproMapper> mapperClass = DkproMapper.class;
    private Class<? extends DkproReducer> reducerClass = DkproReducer.class;

    private JobConf job;

    /**
     * Get the input format class; return null to fall back to SequenceFileInputFormat.
     */
    public abstract Class<? extends InputFormat> getInputFormatClass();

    /**
     * Get the mapper implementation.
     */
    public Class<? extends DkproMapper> getMapperClass()
    {
        return this.mapperClass;
    }

    /**
     * Set a custom mapper implementation.
     */
    public void setMapperClass(Class<? extends DkproMapper> mapperClass)
    {
        this.mapperClass = mapperClass;
    }

    /**
     * Get the reducer implementation.
     */
    public Class<? extends DkproReducer> getReducerClass()
    {
        return this.reducerClass;
    }

    /**
     * Set a custom reducer implementation.
     */
    public void setReducerClass(Class<? extends DkproReducer> reducerClass)
    {
        this.reducerClass = reducerClass;
    }

    /**
     * Implement this method to configure your job.
     *
     * @param job
     *            the job configuration to customize
     */
    @Override
    public abstract void configure(JobConf job);

    /**
     * Runs the UIMA pipeline.
     *
     * @return 0 if the Hadoop job succeeded, 1 if it failed, 2 if it was killed, otherwise 3
     *
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args)
        throws Exception
    {
        if (args.length < 2) {
            System.out.println("Usage: " + this.getClass().getSimpleName()
                    + " [hadoop-params] input output [job-params]");
            System.exit(1);
        }

        this.job = new JobConf(getConf(), DkproHadoopDriver.class);
        final FileSystem fs = FileSystem.get(this.job);

        // set the factory class name
        this.job.set("dkpro.uima.factory", this.getClass().getName());

        Path inputPath;
        if (args[0].contains(",")) {
            // comma-separated list of input paths
            String[] inputPaths = args[0].split(",");
            inputPath = new Path(inputPaths[0]);
            for (String path : inputPaths) {
                FileInputFormat.addInputPath(this.job, new Path(path));
            }
        }
        else {
            inputPath = new Path(args[0]); // input
            FileInputFormat.setInputPaths(this.job, inputPath);
        }

        String outDir = args[1];
        if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
            outDir = getUniqueDirectoryName(outDir, fs);
        }
        final Path outputPath = new Path(outDir); // output

        final CollectionReader reader = buildCollectionReader();
        // if a collection reader was defined, import data into HDFS
        // try {
        //     final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
        //     FileOutputFormat.setOutputCompressorClass(this.job,
        //             (Class<? extends CompressionCodec>) c);
        // }
        // catch (final Exception e) {
        //
        // }
        if (reader != null) {
            final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                    CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                    CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                    CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                    CASWritableSequenceFileWriter.PARAM_FS,
                    this.job.get("fs.default.name", "file:/"));
            runPipeline(reader, xcasWriter);
        }

        // clean up previous output
        fs.delete(outputPath, true);

        // this is a sensible default for the UKP cluster
        // int numMappers = 256;
        // if (args.length > 2) {
        //     numMappers = Integer.parseInt(args[2]);
        // }

        FileOutputFormat.setOutputPath(this.job, outputPath);
        // SequenceFileOutputFormat.setCompressOutput(this.job, true);

        if (this.job.get("mapred.output.compress") == null) {
            this.job.setBoolean("mapred.output.compress", true);
        }
        // just in case compression is on
        this.job.set("mapred.output.compression.type", "BLOCK");

        if (this.job.getBoolean("dkpro.output.writecas", true)) {
            if (this.job.getBoolean("dkpro.output.plaintext", false)) {
                this.job.setOutputFormat(TextOutputFormat.class);
            }
            else {
                this.job.setOutputFormat(SequenceFileOutputFormat.class);
            }
        }
        else {
            this.job.setOutputFormat(NullOutputFormat.class);
        }
        // this.job.set("mapred.output.compression.codec",
        //         "org.apache.hadoop.io.compress.GzipCodec"); // use compression

        // set up some sensible defaults
        this.job.setMapperClass(this.mapperClass);
        this.job.setReducerClass(this.reducerClass);
        if (getInputFormatClass() != null) {
            this.job.setInputFormat(getInputFormatClass());
        }
        else {
            this.job.setInputFormat(SequenceFileInputFormat.class);
        }
        // this.job.setOutputFormat(TextOutputFormat.class);
        this.job.setMapOutputKeyClass(Text.class);
        this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setOutputKeyClass(Text.class);
        this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
        this.job.setJobName(this.getClass().getSimpleName());
        // this.job.set("mapred.child.java.opts", "-Xmx1g");
        // this.job.setInt("mapred.job.map.memory.mb", 1280);
        // this.job.setInt("mapred.job.reduce.memory.mb", 1280);
        // this.job.setNumMapTasks(numMappers);
        this.job.setNumReduceTasks(0);
        configure(this.job);

        // create symlinks for distributed resources
        DistributedCache.createSymlink(this.job);
        // sLogger.info("Running job " + job.getJobName());
        RunningJob runningJob = JobClient.runJob(this.job);
        runningJob.waitForCompletion();
        int status = runningJob.getJobState();
        if (status == JobStatus.SUCCEEDED) {
            return 0;
        }
        else if (status == JobStatus.FAILED) {
            return 1;
        }
        else if (status == JobStatus.KILLED) {
            return 2;
        }
        else {
            return 3;
        }
    }

    // appends a numeric suffix to the directory name until it no longer exists in HDFS
    private String getUniqueDirectoryName(String dir, FileSystem fs)
        throws IllegalArgumentException, IOException
    {
        int outDirSuffix = 2;
        String uniqueDir = dir;
        while (fs.exists(new Path(uniqueDir))) {
            uniqueDir = dir + outDirSuffix;
            outDirSuffix++;
        }
        return uniqueDir;
    }

    /**
     * Register a data archive to be distributed via the distributed cache. The resource can then
     * be accessed from any UIMA component by specifying $name within the configuration.
     *
     * Archives bigger than 4 GB need to be .tar.gz, because the Java 6 zip implementation does
     * not support zip archives larger than 4 GB.
     *
     * For external resources, the resource has to be set up using job.getResource("name") in the
     * build*Engine method.
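     *
     * A minimal sketch of a call from within configure(JobConf); the archive name and location
     * below are illustrative, not part of this API:
     *
     * <pre>{@code
     * // "models" becomes a symlink in the working directory of each task
     * registerDataArchive("models", URI.create("hdfs:///resources/models.tar.gz"));
     * }</pre>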
     *
     * @param name
     *            identifier for the archive
     * @param uri
     *            URI of the archive, can be file:/... or hdfs://...
     */
    public void registerDataArchive(String name, URI uri)
    {
        try {
            DistributedCache.addCacheArchive(new URI(uri.toString() + "#" + name), this.job);
            String resources = this.job.get("dkpro.resources", "");
            if (!resources.isEmpty()) {
                resources += ",";
            }
            resources += name;
            this.job.set("dkpro.resources", resources);
        }
        catch (final URISyntaxException e) {
            // the fragment-augmented URI could not be parsed; fail loudly rather than
            // silently skipping the archive
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Overwrite this method if you need to import data using a UIMA collection reader.
     *
     * @return a collection reader for the import step, or null if no import is required
     * @throws ResourceInitializationException
     *             if the reader could not be created
     */
    public CollectionReader buildCollectionReader()
        throws ResourceInitializationException
    {
        return null;
    }

    /**
     * Overwrite this method to provide an analysis engine for the reduce phase; by default,
     * none is configured.
     */
    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job)
        throws ResourceInitializationException
    {
        return null;
    }
}