package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.KeyBasedSequenceFileIndex;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;

@SuppressWarnings("static-access")
public class CrawlDBIndexWriter {

  static final Log LOG = LogFactory.getLog(CrawlDBIndexWriter.class);

  static Options options = new Options();

  static {
    options.addOption(OptionBuilder.withArgName("input").hasArg(true).isRequired().withDescription("Input Path").create("input"));
    options.addOption(OptionBuilder.withArgName("output").hasArg(true).isRequired().withDescription("Output Path").create("output"));
    options.addOption(OptionBuilder.withArgName("shards").hasArg(true).withDescription("Shard Count (test only)").create("shards"));
  }

  public static void main(String[] args) throws Exception {
    CommandLineParser parser = new GnuParser();
    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);
      // build the index...
      buildIndex(cmdLine);
    } catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CrawlDBIndexWriter", options);
      throw e;
    }
  }

  static void buildIndex(CommandLine commandLine) throws IOException {
    Path inputPath = new Path(commandLine.getOptionValue("input"));
    Path outputPath = new Path(commandLine.getOptionValue("output"));

    int shardCount = CrawlDBCommon.NUM_SHARDS;
    if (commandLine.hasOption("shards")) {
      shardCount = Integer.parseInt(commandLine.getOptionValue("shards"));
    }

    Configuration conf = new Configuration();

    // construct input paths ...
    ArrayList<Path> inputPaths = new ArrayList<Path>();
    inputPaths.add(inputPath);

    JobConf jobConf = new JobBuilder("Index Builder", conf)
        .inputs(inputPaths)
        .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(NullOutputFormat.class)
        .reducer(CrawlDBIndexWriterReducer.class, false)
        .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
        .numReducers(shardCount)
        .speculativeExecution(true)
        .output(outputPath)
        .compressMapOutput(true)
        .compressor(CompressionType.BLOCK, GzipCodec.class)
        .maxMapAttempts(10)
        .maxReduceAttempts(3)
        .maxReduceTaskFailures(5)
        .reuseJVM(1)
        .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (Exception e) {
      LOG.info("JOB Exec Failed for:" + jobConf);
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  public static class CrawlDBIndexWriterReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    JobConf _conf;

    @Override
    public void configure(JobConf job) {
      LOG.info("Configuring");
      _conf = job;
    }

    @Override
    public void close() throws IOException {
      // no per-task state to release
    }

    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      LOG.info("Shard:" + key.get());

      // collect all incoming paths first
      Vector<Path> incomingPaths = new Vector<Path>();
      Set<String> fsType = new HashSet<String>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
        // convert to uri ...
        URI uri = new Path(path).toUri();
        // get scheme if present ...
        String scheme = uri.getScheme();
        if (scheme == null || scheme.length() == 0) {
          fsType.add("default");
        } else {
          fsType.add(scheme);
        }
      }

      if (fsType.size() != 1) {
        throw new IOException("Only One Input Scheme at a time supported!");
      }

      // pick filesystem based on path ...
      FileSystem fs = FileSystem.get(incomingPaths.get(0).toUri(), _conf);
      LOG.info("FileSystem is:" + fs.toString());

      // create output path...
      Path indexFilePath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()));

      // create the key-based index writer for this shard ...
      KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes> indexWriter =
          new KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes>(_conf, indexFilePath);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        // read the single sharded file ...
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, incomingPaths.get(0), _conf);
        try {
          DataOutputBuffer keyBuffer = new DataOutputBuffer();
          // walk raw keys, recording the pre-read offset whenever the position advances ...
          long preReadPos = reader.getPosition();
          while (reader.nextRawKey(keyBuffer) != -1) {
            long postReadPos = reader.getPosition();
            if (postReadPos != preReadPos) {
              indexWriter.indexItem(keyBuffer.getData(), 0, keyBuffer.getLength(), null, 0, 0, preReadPos);
            }
            preReadPos = postReadPos;
            reporter.progress();
            keyBuffer.reset();
          }
        } finally {
          reader.close();
        }
      } finally {
        indexWriter.close();
      }
    }
  }
}
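
/*
 * Usage sketch (not part of the original source). The class and option names
 * are real; the jar name and S3 paths are hypothetical placeholders. On a
 * Hadoop cluster the tool would typically be launched along the lines of:
 *
 *   hadoop jar commoncrawl-tools.jar \
 *     org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBIndexWriter \
 *     -input  s3n://example-bucket/crawldb/merged \
 *     -output s3n://example-bucket/crawldb/index \
 *     -shards 10
 *
 * The equivalent programmatic invocation simply forwards the same flags to
 * main(), as the minimal driver below shows.
 */
class CrawlDBIndexWriterDriverExample {
  public static void main(String[] args) throws Exception {
    CrawlDBIndexWriter.main(new String[] {
        "-input",  "s3n://example-bucket/crawldb/merged", // hypothetical input path
        "-output", "s3n://example-bucket/crawldb/index",  // hypothetical output path
        "-shards", "10"                                   // optional, test-only shard override
    });
  }
}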
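
/*
 * Lookup sketch (an assumption, not part of the original source). The reducer
 * above records a key's pre-read byte offset only when getPosition() has
 * advanced, i.e. at seekable boundaries of the (possibly block-compressed)
 * sequence file. A reader can therefore seek to an offset obtained from the
 * index and scan forward to the wanted key, assuming the merged crawl-db
 * shards carry TextBytes keys and values as the job's output schema suggests.
 * The shard path, offset, and target key are hypothetical parameters.
 */
class CrawlDBIndexSeekExample {
  static TextBytes findRecord(FileSystem fs, Path shardPath, Configuration conf,
                              long indexedOffset, String targetKey) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, shardPath, conf);
    try {
      // jump to the boundary offset recorded by CrawlDBIndexWriterReducer ...
      reader.seek(indexedOffset);
      TextBytes keyOut = new TextBytes();
      TextBytes valueOut = new TextBytes();
      // ... then scan forward until we hit the record we actually want.
      while (reader.next(keyOut, valueOut)) {
        if (keyOut.toString().equals(targetKey)) {
          return valueOut;
        }
      }
      return null; // key not found past this offset
    } finally {
      reader.close();
    }
  }
}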