CrawlDBIndexSearch.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nullable;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.commoncrawl.hadoop.mergeutils.TextFileSpillWriter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeSortReducer.RawValueIterator;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.KeyBasedSequenceFileIndex;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLUtils;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

@SuppressWarnings("static-access")
public class CrawlDBIndexSearch {
  
  static Options options = new Options();
  
  static final Log LOG = LogFactory.getLog(CrawlDBIndexSearch.class);

  static { 
    options.addOption(OptionBuilder.withArgName("indexpath").hasArg(true).isRequired().withDescription("Index Path").create("indexpath"));
    options.addOption(OptionBuilder.withArgName("dbpath").hasArg(true).withDescription("Database Path").create("dbpath"));
    options.addOption(OptionBuilder.withArgName("dbpaths").hasArg(true).withDescription("Database Paths").create("dbpaths"));
    options.addOption(OptionBuilder.withArgName("domain").hasArg(true).isRequired().withDescription("Domain Name").create("domain"));
    options.addOption(OptionBuilder.withArgName("outputpath").hasArg(true).isRequired().withDescription("Output Path").create("outputpath"));
    options.addOption(OptionBuilder.withArgName("fileprefix").hasArg(true).withDescription("Index File Prefix").create("fileprefix"));
  }
  
  public static void main(String[] args) throws Exception {
    CommandLineParser parser = new GnuParser();
    
    try { 
      // parse the command line arguments
      CommandLine cmdLine = parser.parse( options, args );
      
      runQuery(cmdLine);
    }
    catch (Exception e) { 
      LOG.error(CCStringUtils.stringifyException(e));
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp( "CrawlDBBlekkoMerge", options );
      
      throw e;
    }
    finally {
      System.exit(0);
    }
  }
  
  private static FileSystem getFileSystemForPath(Path path,Configuration conf)throws IOException { 
    // override S3N 
    conf.setClass("fs.s3n.impl", S3NFileSystem.class,FileSystem.class);
    return FileSystem.get(path.toUri(),conf);
  }
  
  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }    

  private static Function<TextBytes,String> createStandardTransformer() { 
    final JsonParser parser = new JsonParser();
    return new Function<TextBytes, String>() {

      @Override
      public String apply(final TextBytes arg0) {
        JsonObject jsonObj = parser.parse(arg0.toString()).getAsJsonObject();
        
        ArrayList<String> fields = Lists.newArrayList();
        
        if (jsonObj != null) {
          JsonObject blekkoStatus = jsonObj.getAsJsonObject(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY);
          if (blekkoStatus != null) {
            // blekko url 
            fields.add("1");
            // blekko crawl status 
            if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_STATUS) 
                && blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_STATUS).getAsString().equalsIgnoreCase("crawled")) { 
              fields.add("1");
            }
            else { 
              fields.add("0");
            }
          }
          else { 
            // is blekko url 
            fields.add("0");
            // blekko crawled 
            fields.add("0");
          }
          JsonObject crawlStatus = jsonObj.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);
          JsonObject linkStatus = jsonObj.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY);
          if (crawlStatus != null || linkStatus != null) { 
            // is cc url 
            fields.add("1");
            if (crawlStatus != null
                && crawlStatus.get(CrawlDBCommon.SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt() != 0) { 
                // crawled ... 
                fields.add("1");
            }
            else { 
              // crawled ... 
              fields.add("0");
            }
            if (linkStatus != null 
                && linkStatus.has(CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY)
                && linkStatus.get(CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY).getAsInt() != 0) { 
              // ext hrefs
              fields.add("1");
            }
            else { 
              // ext hrefs 
              fields.add("0");
            }
          }
          else { 
            // is cc url ... 
            fields.add("0");
            // crawled (not) 
            fields.add("0");
            // ext hrefs 
            fields.add("0");
          }
        }
        else { 
          for (int i=0;i<5;++i)
            fields.add("0");
        }
        return Joiner.on("\t").join(fields).toString();
      }
    };
  }
  
  private static void runQuery(CommandLine commandLine) throws IOException {
    
    final Configuration conf = new Configuration();
    
    URLFPV2 fp = URLUtils.getURLFPV2FromHost(commandLine.getOptionValue("domain"));
    if (fp == null) { 
      throw new IOException("Invalid Domain:" + commandLine.getOptionValue("domain"));
    }

    // construct min/max keys
    final Pair<TextBytes,TextBytes> minMaxKeys 
      = CrawlDBKey.generateMinMaxKeysForDomain(
          fp.getRootDomainHash(),
          (fp.getRootDomainHash() == fp.getDomainHash()) ? -1L:fp.getDomainHash());
    
    Path dbPath = null;
    final Path indexPath = new Path(commandLine.getOptionValue("indexpath"));

    
    Map<Integer,Pair<Path,Path>> shardToPathMap = Maps.newTreeMap();
    
    if (commandLine.hasOption("dbpath")) { 
      dbPath = new Path(commandLine.getOptionValue("dbpath"));
      FileSystem dbFS      = getFileSystemForPath(dbPath,conf);
      FileStatus parts[] = dbFS.globStatus(new Path(indexPath,"part-*"));
      for (FileStatus part : parts) { 
        Path itemPath = part.getPath();
        int shardIndex = Integer.parseInt(itemPath.getName().substring("part-".length()));
        Pair<Path,Path> tuple = new Pair<Path,Path>(itemPath,new Path(indexPath,"index-" + NUMBER_FORMAT.format(shardIndex)));
        shardToPathMap.put(shardIndex, tuple);
      }
    }
    else if (commandLine.hasOption("dbpaths")) {
      
      Path partsFilePath = new Path(commandLine.getOptionValue("dbpaths"));
      LOG.info("Parts File:" + partsFilePath);
      
      FSDataInputStream inputStream = FileSystem.get(partsFilePath.toUri(),conf).open(partsFilePath);
      try { 
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        String line = null;
        while ((line = reader.readLine()) != null) { 
          Path itemPath = new Path(line);
          int shardIndex = Integer.parseInt(itemPath.getName().substring("part-".length()));
          Pair<Path,Path> tuple = new Pair<Path,Path>(itemPath,new Path(indexPath,"index-" + NUMBER_FORMAT.format(shardIndex)));
          shardToPathMap.put(shardIndex, tuple);
        }
      }
      finally { 
        inputStream.close();
      }
    }
     
    if (shardToPathMap.size() == 0) { 
      throw new IOException("No Valid Shards Specified!");
    }
    
    final Path outputPath = new Path(commandLine.getOptionValue("outputpath"));
    final FileSystem outputFS  = getFileSystemForPath(outputPath,conf);

    // for each part invoke a parallel query 
    List<Callable<String>> callables = Lists.newArrayList();
    for (Map.Entry<Integer,Pair<Path,Path>> entry : shardToPathMap.entrySet()) { 
      final int threadIndex = entry.getKey();

      final FileSystem indexFS   = getFileSystemForPath(entry.getValue().e1,conf);
      final FileSystem dbFS      = getFileSystemForPath(entry.getValue().e0,conf);

      final Path indexFilePath = entry.getValue().e1;
      final Path crawlDBFilePath = entry.getValue().e0;
      final Path outputFilePath = new Path(outputPath,"part-" + NUMBER_FORMAT.format(threadIndex));
      final DataOutputBuffer minKeyOutputBuffer = new DataOutputBuffer();
      final DataOutputBuffer maxKeyOutputBuffer = new DataOutputBuffer();

      minMaxKeys.e0.write(minKeyOutputBuffer);
      minMaxKeys.e1.write(maxKeyOutputBuffer);
      
      callables.add(new Callable<String>() {

        @Override
        public String call() throws Exception {
          try { 
            // load index file 
            LOG.info("Loading Index:" + threadIndex);
            KeyBasedSequenceFileIndex<TextBytes> index = new KeyBasedSequenceFileIndex<TextBytes>(conf,indexFilePath,new CrawlDBKey.LinkKeyComparator());
            KeyBasedSequenceFileIndex.IndexReader<TextBytes> reader = new KeyBasedSequenceFileIndex.IndexReader<TextBytes>(index);
            
            // find best position 
            LOG.info("Searching Index:" + threadIndex);
            long seqFilePos = reader.findBestPositionForKey(minKeyOutputBuffer.getData(),0,minKeyOutputBuffer.getLength());
            LOG.info("Search of Index:" + threadIndex + " Returned Pos:" + seqFilePos);
  
            
            SequenceFile.Writer writer = SequenceFile.createWriter(outputFS, conf,outputFilePath, Text.class, Text.class,CompressionType.BLOCK,new SnappyCodec());
            try { 
              // open a reader 
              LOG.info("Opening CrawlDB File at Path:" + crawlDBFilePath);
              SequenceFile.Reader seqReader = new SequenceFile.Reader(dbFS, crawlDBFilePath, conf);
              RawComparator<TextBytes> comparator = new CrawlDBKey.LinkKeyComparator();
              
              try { 
                LOG.info("Seeking Index File:" + seqFilePos);
                // seqReader.seek(seqFilePos - 16);
                seqReader.sync(seqFilePos - 16);
                LOG.info("Position Now:" + seqReader.getPosition());
                
                boolean enteredRange = false;
                boolean exitedRange = false;
                
                DataOutputBuffer keyOutputBuffer = new DataOutputBuffer();
                DataOutputBuffer valueOutputBuffer = new DataOutputBuffer();
                DataInputBuffer valueInputBuffer = new DataInputBuffer();
                TextBytes        valueText = new TextBytes();
                Text             urlText = new Text();
                ValueBytesWrapper valBytesWrap = new ValueBytesWrapper();
                valBytesWrap.sourceData = valueOutputBuffer; 
                
                DataInputBuffer keyInputBuffer = new DataInputBuffer();
                JsonParser parser = new JsonParser();
                
                ValueBytes valueBytes = seqReader.createValueBytes();
                
                TextBytes keyBytes = new TextBytes();
                long emittedKeyCount=0;
                while (!exitedRange) { 
                  if (seqReader.nextRawKey(keyOutputBuffer)== -1) { 
                    break;
                  }
                  else {
                    keyInputBuffer.reset(keyOutputBuffer.getData(),0,keyOutputBuffer.getLength());
                    keyBytes.setFromRawTextBytes(keyInputBuffer);
                    //LOG.info("SeqFileKey:" + debugStr + " TargetKey:" + minMaxKeys.e0);
                    
                    if (!enteredRange) { 
                      if (comparator.compare(keyOutputBuffer.getData(),0, keyOutputBuffer.getLength(),
                          minKeyOutputBuffer.getData(), 0, minKeyOutputBuffer.getLength()) >= 0) { 
                        LOG.info("Entered Range");
                        enteredRange = true;
                      }
                    }
                    if (enteredRange) { 
                      if (comparator.compare(keyOutputBuffer.getData(),0, keyOutputBuffer.getLength(),
                          maxKeyOutputBuffer.getData(), 0, maxKeyOutputBuffer.getLength()) > 0) {
                        LOG.info("Exited Range - Emitted Keys:" + emittedKeyCount);
                        exitedRange = true;
                      }
                    }
                    if (enteredRange && !exitedRange) {
                      long keyType = CrawlDBKey.getLongComponentFromKey(keyBytes, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
                      if (keyType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) { 
                        seqReader.nextRawValue(valueBytes);
                        // valid data found
                        valueBytes.writeUncompressedBytes(valueOutputBuffer);
                        valueInputBuffer.reset(valueOutputBuffer.getData(), valueOutputBuffer.getLength());
                        valueText.setFromRawTextBytes(valueInputBuffer);
                        // inefficient ... but in the interest of getting shit done ...
                        try { 
                          JsonObject object = parser.parse(valueText.toString()).getAsJsonObject();
                          urlText.set(object.get("source_url").getAsString());
                          // write the key out raw ... 
                          keyOutputBuffer.reset();
                          urlText.write(keyOutputBuffer);
                          // write out key/val raw 
                          writer.appendRaw(
                              keyOutputBuffer.getData(), 
                              0, 
                              keyOutputBuffer.getLength(),
                              valBytesWrap);
                              
                              
                          emittedKeyCount++;
                        }
                        catch (Exception e) { 
                          LOG.error(CCStringUtils.stringifyException(e));
                          throw new IOException("Invalid JSON!:" + valueText.toString());
                        }
                      }
                    }
                  }
                  keyOutputBuffer.reset();
                  valueOutputBuffer.reset();
                }
              }
              finally { 
                seqReader.close();
              }
            }
            finally { 
              writer.close();
            }
            return outputFilePath.toString();
          }
          catch (Exception e) { 
            LOG.error(CCStringUtils.stringifyException(e));
            throw e;
          }
        } 
      });
    }
    // create the exector
    ExecutorService executor = Executors.newFixedThreadPool(500);
    LOG.info("Queueing " + callables.size() + " Work Items");
    // for now, just standard transformer ... 
    Function<TextBytes,String> valueTransformer = createStandardTransformer();
    
    // execute queued items 
    try {
      List<Future<String>> futures = executor.invokeAll(callables);
      mergeResults(conf,outputFS,outputPath,valueTransformer);
    } catch (InterruptedException e1) {
      // TODO Auto-generated catch block
      e1.printStackTrace();
    }
    // shutdown executor service gracefully  
    executor.shutdown();
    LOG.info("Waiting for shutdown");
    // wait for completion 
    try {
      executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
      LOG.info("Execution Completed");
    } catch (InterruptedException e) {
      LOG.error("Execution Interrupted!");
    }
  }
  
  static void mergeResults(Configuration conf,FileSystem fs,Path outputPath,Function<TextBytes,String> valueTransformer)throws IOException { 
    // collect parts 
    FileStatus parts[] = fs.globStatus(new Path(outputPath,"part-*"));
    List<Path> inputs = ImmutableList.copyOf(
        Iterators.transform(Iterators.forArray(parts),new Function<FileStatus,Path>() {

      @Override
      @Nullable
      public Path apply(@Nullable FileStatus arg0) {
        return arg0.getPath();
      } 
      
    }));
    // feed to merger ... 
    // set up merge attributes
    Configuration localMergeConfig = new Configuration(conf);
    // we don't want to use a grouping comparator because the we are using the reducer code from the intermediate 
    // merge 
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,TextBytes.Comparator.class, RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS,TextBytes.class, WritableComparable.class);
    

    Writer outputWriter = new OutputStreamWriter(fs.create(
        new Path(outputPath,"results.txt"),
        true,
        1000000
        ),Charset.forName("UTF-8"));

    try { 
      // initialize reader ... 
      LOG.info("FileSystem is:" + fs.toString());
      MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(fs, inputs, localMergeConfig);

      try { 
        RawValueIterator rawValueIterator = new RawValueIterator();
        
        Pair<KeyAndValueData<TextBytes>,Iterable<RawRecordValue>> nextItem = null;
        
        TextBytes valueText = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();
        
        // walk tuples and feed them to the actual reducer ...  
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          for (RawRecordValue rawValue : nextItem.e1) { 
            
            inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            valueText.setFromRawTextBytes(inputBuffer);
            outputWriter.write(nextItem.e0._keyObject.toString()+"\t"+valueTransformer.apply(valueText)+"\n");
          }
        }
      }
      finally { 
        multiFileInputReader.close();
      }
    }
    finally {
      try {
        outputWriter.flush();
      }
      finally { 
        IOUtils.closeStream(outputWriter);
      }
    }
  }
  
  /** 
   * We need this to avoid double buffer copies 
   * @author rana
   *
   */
  private static class ValueBytesWrapper implements ValueBytes {
    public DataOutputBuffer sourceData;
    @Override
    public void writeUncompressedBytes(DataOutputStream outStream)
        throws IOException {
      outStream.write(sourceData.getData(),0,sourceData.getLength());
    }

    @Override
    public void writeCompressedBytes(DataOutputStream outStream)
        throws IllegalArgumentException, IOException {
      outStream.write(sourceData.getData(),0,sourceData.getLength());
    }

    @Override
    public int getSize() {
      return sourceData.getLength();
    } 
    
  }

}