/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.collection.clue;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * <p>
 * Program to uncompress the ClueWeb09 collection from the original distribution WARC files and
 * repack as <code>SequenceFiles</code>.
 * </p>
 *
 * <p>
 * The program takes the following command-line arguments:
 * </p>
 *
 * <ul>
 * <li>[base-path] base path of the ClueWeb09 distribution</li>
 * <li>[output-path] output path</li>
 * <li>[segment-num] segment number (1 through 10)</li>
 * <li>[docno-mapping-data-file] docno mapping data file</li>
 * <li>(block|record|none) to indicate block-compression, record-compression, or no compression</li>
 * </ul>
 *
 * <p>
 * Here's a sample invocation:
 * </p>
 *
 * <pre>
 * hadoop jar dist/cloud9-X.X.X-fatjar.jar edu.umd.cloud9.collection.clue.RepackClueWarcRecords \
 *   /collections/ClueWeb09/data /collections/ClueWeb09/data.block/en.01 1 \
 *   /collections/ClueWeb09/docno-mapping.dat block
 * </pre>
 *
 * @author Jimmy Lin
 */
public class RepackClueWarcRecords extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(RepackClueWarcRecords.class);

  private static enum Records { TOTAL, PAGES }

  private static class MyMapper extends MapReduceBase implements
      Mapper<LongWritable, ClueWarcRecord, IntWritable, ClueWarcRecord> {
    private static final IntWritable DOCNO = new IntWritable();
    private ClueWarcDocnoMapping docnoMapping = new ClueWarcDocnoMapping();

    public void configure(JobConf job) {
      try {
        docnoMapping.loadMapping(new Path(job.get("DocnoMappingDataFile")), FileSystem.get(job));
      } catch (Exception e) {
        // Chain the cause so the underlying failure isn't lost.
        throw new RuntimeException("Error loading docno mapping data file!", e);
      }
    }

    public void map(LongWritable key, ClueWarcRecord doc,
        OutputCollector<IntWritable, ClueWarcRecord> output, Reporter reporter)
        throws IOException {
      reporter.incrCounter(Records.TOTAL, 1);

      String id = doc.getHeaderMetadataItem("WARC-TREC-ID");

      // Only records carrying a WARC-TREC-ID are actual pages; everything else
      // (e.g., WARC metadata records) is counted but not emitted.
      if (id != null) {
        reporter.incrCounter(Records.PAGES, 1);
        DOCNO.set(docnoMapping.getDocno(id));
        output.collect(DOCNO, doc);
      }
    }
  }

  /**
   * Creates an instance of this tool.
   */
  public RepackClueWarcRecords() {}

  private static int printUsage() {
    System.out.println("usage: [base-path] [output-path] [segment-num] "
        + "[docno-mapping-data-file] (block|record|none)");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Runs this tool.
   */
  public int run(String[] args) throws Exception {
    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    int segment = Integer.parseInt(args[2]);
    String data = args[3];
    String compressionType = args[4];

    if (!compressionType.equals("block") && !compressionType.equals("record")
        && !compressionType.equals("none")) {
      System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
      System.exit(-1);
    }

    // Default block size.
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackClueWarcRecords.class);
    conf.setJobName("RepackClueWarcRecords:segment" + segment);

    conf.set("DocnoMappingDataFile", data);

    LOG.info("Tool name: RepackClueWarcRecords");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - segment number: " + segment);
    LOG.info(" - docno mapping data file: " + data);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
      LOG.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
      SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
      SequenceFileOutputFormat.setCompressOutput(conf, true);

      if (compressionType.equals("record")) {
        SequenceFileOutputFormat.setOutputCompressionType(conf,
            SequenceFile.CompressionType.RECORD);
      } else {
        SequenceFileOutputFormat.setOutputCompressionType(conf,
            SequenceFile.CompressionType.BLOCK);
        conf.setInt("io.seqfile.compress.blocksize", blocksize);
      }
    }

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ClueWarcRecord.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
   */
  public static void main(String[] args) throws Exception {
    LOG.info("Running " + RepackClueWarcRecords.class.getCanonicalName() + " with args "
        + Arrays.toString(args));
    ToolRunner.run(new RepackClueWarcRecords(), args);
  }
}