/*
* Cloud9: A Hadoop toolkit for working with big data
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.collection.clue;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
/**
* <p>
* Program to uncompress the ClueWeb09 collection from the original distribution WARC files and
* repack as <code>SequenceFiles</code>.
* </p>
*
* <p>
* The program takes the following command-line arguments:
* </p>
*
* <ul>
* <li>[base-path] base path of the ClueWeb09 distribution</li>
* <li>[output-path] output path</li>
* <li>[segment-num] segment number (1 through 10)</li>
* <li>[docno-mapping-data-file] docno mapping data file</li>
* <li>(block|record|none) to indicate block-compression, record-compression, or no compression</li>
* </ul>
*
* <p>
* Here's a sample invocation:
* </p>
*
* <pre>
* hadoop jar dist/cloud9-X.X.X-fatjar.jar edu.umd.cloud9.collection.clue.RepackClueWarcRecords \
* /collections/ClueWeb09/data /collections/ClueWeb09/data.block/en.01 1 \
* /collections/ClueWeb09/docno-mapping.dat block
* </pre>
*
* @author Jimmy Lin
*/
public class RepackClueWarcRecords extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(RepackClueWarcRecords.class);
private static enum Records { TOTAL, PAGES };
private static class MyMapper extends MapReduceBase implements
Mapper<LongWritable, ClueWarcRecord, IntWritable, ClueWarcRecord> {
private static final IntWritable DOCNO = new IntWritable();
private ClueWarcDocnoMapping docnoMapping = new ClueWarcDocnoMapping();
public void configure(JobConf job) {
try {
docnoMapping.loadMapping(new Path(job.get("DocnoMappingDataFile")), FileSystem.get(job));
} catch (Exception e) {
throw new RuntimeException("Error loading docno mapping data file!");
}
}
public void map(LongWritable key, ClueWarcRecord doc,
OutputCollector<IntWritable, ClueWarcRecord> output, Reporter reporter) throws IOException {
reporter.incrCounter(Records.TOTAL, 1);
String id = doc.getHeaderMetadataItem("WARC-TREC-ID");
if (id != null) {
reporter.incrCounter(Records.PAGES, 1);
DOCNO.set(docnoMapping.getDocno(id));
output.collect(DOCNO, doc);
}
}
}
/**
* Creates an instance of this tool.
*/
public RepackClueWarcRecords() {
}
private static int printUsage() {
System.out.println("usage: [base-path] [output-path] [segment-num] [docno-mapping-data-file] (block|record|none)");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
/**
* Runs this tool.
*/
public int run(String[] args) throws Exception {
if (args.length != 5) {
printUsage();
return -1;
}
String basePath = args[0];
String outputPath = args[1];
int segment = Integer.parseInt(args[2]);
String data = args[3];
String compressionType = args[4];
if (!compressionType.equals("block") && !compressionType.equals("record")
&& !compressionType.equals("none")) {
System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
System.exit(-1);
}
// Default block size.
int blocksize = 1000000;
JobConf conf = new JobConf(RepackClueWarcRecords.class);
conf.setJobName("RepackClueWarcRecords:segment" + segment);
conf.set("DocnoMappingDataFile", data);
LOG.info("Tool name: RepackClueWarcRecords");
LOG.info(" - base path: " + basePath);
LOG.info(" - output path: " + outputPath);
LOG.info(" - segment number: " + segment);
LOG.info(" - docno mapping data file: " + data);
LOG.info(" - compression type: " + compressionType);
if (compressionType.equals("block")) {
LOG.info(" - block size: " + blocksize);
}
int mapTasks = 10;
conf.setNumMapTasks(mapTasks);
conf.setNumReduceTasks(0);
ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);
SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));
if (compressionType.equals("none")) {
SequenceFileOutputFormat.setCompressOutput(conf, false);
} else {
SequenceFileOutputFormat.setCompressOutput(conf, true);
if (compressionType.equals("record")) {
SequenceFileOutputFormat
.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
} else {
SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
conf.setInt("io.seqfile.compress.blocksize", blocksize);
}
}
conf.setInputFormat(ClueWarcInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(ClueWarcRecord.class);
conf.setMapperClass(MyMapper.class);
// Delete the output directory if it exists already.
FileSystem.get(conf).delete(new Path(outputPath), true);
JobClient.runJob(conf);
return 0;
}
/**
* Dispatches command-line arguments to the tool via the {@code ToolRunner}.
*/
public static void main(String[] args) throws Exception {
LOG.info("Running " + RepackClueWarcRecords.class.getCanonicalName() + " with args "
+ Arrays.toString(args));
ToolRunner.run(new RepackClueWarcRecords(), args);
}
}