/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph.driver;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListWritable;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.webgraph.DriverUtil;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.IndexableAnchorText;

/**
 * Creates an indexable collection of anchors.
 *
 * <p>The job reads a sequence file of (docno, anchor list) records, turns every
 * record into an {@link IndexableAnchorText} and writes the result as a
 * block-compressed sequence file, passing records through an
 * {@link IdentityReducer}.
 *
 * @author Nima Asadi
 *
 */
public class BuildIndexableAnchorCollection extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(BuildIndexableAnchorCollection.class);

  /** Job property holding the optional maximum content length (0 when unset). */
  private static final String MAX_CONTENT_LENGTH_KEY = "Cloud9.maxContentLength";

  /** Job property holding the class name of the {@link DocnoMapping} to instantiate. */
  private static final String DOCNO_MAPPING_CLASS_KEY = "Cloud9.DocnoMappingClass";

  /** Docno-mapping class used by the mapper when the job does not name one. */
  private static final String DEFAULT_DOCNO_MAPPING_CLASS =
      "edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping";

  /**
   * Converts each (docno, anchor list) record into an {@link IndexableAnchorText}
   * keyed by the same docno.
   */
  public static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, ArrayListWritable<AnchorText>, IntWritable, IndexableAnchorText> {

    // Reused for every record; cleared at the start of each map() call.
    private final IndexableAnchorText outputValue = new IndexableAnchorText();

    private DocnoMapping docnoMapping;
    private int maxContentLength;

    /**
     * Reads the job settings, instantiates the configured {@link DocnoMapping}
     * and loads its data from the first file in the distributed cache.
     *
     * @param job the job configuration
     * @throws RuntimeException if the mapping class cannot be instantiated, the
     *         cache files cannot be listed or are absent, or the mapping fails to load
     */
    @Override
    public void configure(JobConf job) {
      maxContentLength = job.getInt(MAX_CONTENT_LENGTH_KEY, 0);

      String docnoMappingClass = job.get(DOCNO_MAPPING_CLASS_KEY, DEFAULT_DOCNO_MAPPING_CLASS);
      try {
        docnoMapping = (DocnoMapping) Class.forName(docnoMappingClass)
            .getDeclaredConstructor().newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Class " + docnoMappingClass + " not found!", e);
      }

      Path[] localFiles;
      try {
        localFiles = DistributedCache.getLocalCacheFiles(job);
      } catch (IOException e) {
        throw new RuntimeException("Local cache files not read properly.", e);
      }

      // Fail with a descriptive message instead of a bare NPE or
      // ArrayIndexOutOfBoundsException when no cache file is available.
      if (localFiles == null || localFiles.length == 0) {
        throw new RuntimeException("No docno-mapping file found in the distributed cache.");
      }

      try {
        docnoMapping.loadMapping(localFiles[0], FileSystem.getLocal(job));
      } catch (Exception e) {
        throw new RuntimeException("Error initializing DocnoMapping!", e);
      }
    }

    /**
     * Emits one {@link IndexableAnchorText} per input record.
     *
     * @param key the docno of the document
     * @param value the anchors associated with the document
     * @param output collector receiving (docno, indexable anchor text) pairs
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(IntWritable key, ArrayListWritable<AnchorText> value,
        OutputCollector<IntWritable, IndexableAnchorText> output, Reporter reporter)
        throws IOException {
      outputValue.clear();
      outputValue.setDocid(docnoMapping.getDocid(key.get()));

      // A positive limit selects the two-argument overload; presumably it caps the
      // length of the concatenated content -- confirm against IndexableAnchorText.
      if (maxContentLength > 0) {
        outputValue.concatenateAnchors(value, maxContentLength);
      } else {
        outputValue.concatenateAnchors(value);
      }

      output.collect(key, outputValue);
    }
  }

  public BuildIndexableAnchorCollection() {
  }

  /**
   * Prints the command-line usage followed by the generic Hadoop options.
   *
   * @return always -1, so callers can return it directly as an exit status
   */
  private static int printUsage() {
    System.out.println("usage: [-input collection-path] [-output output-path]"
        + " [-docnoClass docno-mapping-class] [-docno docno-mapping-file]"
        + " [-numReducers num-reducers] [optional:-maxLength maximum content length]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Runs this tool.
   *
   * @param args command-line arguments, as described by the usage message
   * @return 0 when the job has been run, -1 when too few arguments were given
   * @throws Exception if argument parsing, job setup or job execution fails
   */
  @Override
  public int run(String[] args) throws Exception {
    // NOTE(review): five mandatory flag/value pairs would be ten tokens, so this
    // lower bound looks lenient; left unchanged pending a check of DriverUtil.
    if (args.length < 5) {
      return printUsage();
    }

    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = DriverUtil.argValue(args, DriverUtil.CL_INPUT);
    String outputPath = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT);
    String docnoMappingClass = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING_CLASS);
    String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING);
    int numReducers = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_NUMBER_OF_REDUCERS));

    boolean hasMaxLength = DriverUtil.argExists(args, DriverUtil.CL_MAX_LENGTH);
    if (hasMaxLength) {
      conf.setInt(MAX_CONTENT_LENGTH_KEY,
          Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_MAX_LENGTH)));
    }
    conf.set(DOCNO_MAPPING_CLASS_KEY, docnoMappingClass);

    LOG.info("Tool name: BuildIndexableAnchorCollection");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno-mapping class: " + docnoMappingClass);
    LOG.info(" - docno-mapping file: " + docnoMapping);
    // Log the limit whenever the option was supplied, using the same condition
    // that decided whether it was written into the configuration.
    if (hasMaxLength) {
      LOG.info(" - maximum content length: " + conf.getInt(MAX_CONTENT_LENGTH_KEY, 0));
    }

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setJobName("BuildIndexableAnchorCollection");
    conf.setJarByClass(BuildIndexableAnchorCollection.class);

    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(numReducers);
    DistributedCache.addCacheFile(new URI(docnoMapping), conf);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, new Path(collectionPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IndexableAnchorText.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the
   * <code>ToolRunner</code>.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new BuildIndexableAnchorCollection(), args);
    System.exit(res);
  }
}