S3GetMetdataJob.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.hadoop.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.ArcFileReader;
import org.commoncrawl.util.IPAddressUtils;

/**
 * 
 * @author rana
 * 
 */
public class S3GetMetdataJob implements MapRunnable<Text, ArcFileItem, Text, CrawlURLMetadata> {

  /** logging **/
  private static final Log LOG = LogFactory.getLog(S3GetMetdataJob.class);

  /** the task's attempt id **/
  private TaskAttemptID _attemptID = null;
  private int _maxAttemptsPerTask = -1;
  private String _splitDetails = null;

  public static final String ARCFileHeader_ParseSegmentId = "x_commoncrawl_ParseSegmentId";
  public static final String ARCFileHeader_OriginalURL = "x_commoncrawl_OriginalURL";
  public static final String ARCFileHeader_URLFP = "x_commoncrawl_URLFP";
  public static final String ARCFileHeader_HostFP = "x_commoncrawl_HostFP";
  public static final String ARCFileHeader_Signature = "x_commoncrawl_Signature";
  public static final String ARCFileHeader_CrawlNumber = "x_commoncrawl_CrawlNo";
  public static final String ARCFileHeader_CrawlerId = "x_commoncrawl_CrawlerId";
  public static final String ARCFileHeader_FetchTimeStamp = "x_commoncrawl_FetchTimestamp";

  public static void main(String[] args) {

    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
    // "2008/06",
    // "2008/07",
    // "2008/08",
    // "2008/09",
    // "2008/10",
    // "2008/11",
    "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

      LOG.info("Processing Path:" + paths[pathIndex]);

      JobConf job = new JobConf(S3GetMetdataJob.class);

      Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

      LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
      System.out.println("Output Path is:" + tempDir);

      job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

      // setup s3 properties
      JetS3tARCSource.setMaxRetries(job, 1);
      // set up S3 credentials ...
      JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
      JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
      ARCSplitCalculator.setFilesPerSplit(job, 25);
      // set up arc reader properties
      ArcFileReader.setIOTimeoutValue(30000);
      // set input prefixes ...
      JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
      // and S3 bucket name ...
      JetS3tARCSource.setBucketName(job, "commoncrawl");
      // and setup arc source for ArcInputFormat
      ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
      // and set up input format ...
      job.setInputFormat(ARCInputFormat.class);
      // set mapper ...
      job.setMapRunnerClass(S3GetMetdataJob.class);
      // setup reducer (identity in this case ... )
      job.setReducerClass(IdentityReducer.class);
      // standard output format ...
      job.setOutputFormat(SequenceFileOutputFormat.class);
      // set output path
      FileOutputFormat.setOutputPath(job,tempDir);
      // map output types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(CrawlURLMetadata.class);
      // reduce output types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlURLMetadata.class);
      // double the number of reducers ...
      // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

      // run the job ...
      try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());

        Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
        LOG.info("Copying Job Output to:" + finalPath);
        FileSystem fs = FileSystem.get(job);

        try {
          fs.mkdirs(finalPath.getParent());
          fs.rename(tempDir, finalPath);
          LOG.info("Copied Job Output to:" + finalPath);
        } finally {
          // fs.close();
        }

      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
      }
    }
  }

  public void close() throws IOException {

  }

  public void configure(JobConf job) {

    _attemptID = TaskAttemptID.forName(job.get("mapred.task.id"));
    _maxAttemptsPerTask = job.getInt("mapred.max.tracker.failures", 4);
    _splitDetails = job.get(ARCSplitReader.SPLIT_DETAILS, "Spit Details Unknown");
  }

  public void map(Text key, ArcFileItem value, OutputCollector<Text, CrawlURLMetadata> output, Reporter reporter)
      throws IOException {

    try {
      // create a url metadata
      CrawlURLMetadata urlMetadataOut = new CrawlURLMetadata();

      // set direct fields ...

      // set arc file metadata fields ...
      urlMetadataOut.setArcFileName(value.getArcFileName());
      urlMetadataOut.setArcFileOffset(value.getArcFilePos());
      // set ip field ..
      InetAddress address = InetAddress.getByName(value.getHostIP());
      urlMetadataOut.setServerIP(IPAddressUtils.IPV4AddressToInteger(address.getAddress()));
      // set fetch length
      urlMetadataOut.setLastFetchSize(value.getContent().getCount());

      // walk headers ...
      for (ArcFileHeaderItem headerItem : value.getHeaderItems()) {
        if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_ParseSegmentId)) {
          urlMetadataOut.setParseDataSegNo(Integer.parseInt(headerItem.getItemValue()));
        } else if (headerItem.getItemKey().equalsIgnoreCase("Content-Type")) {
          urlMetadataOut.setContentType(headerItem.getItemValue());
        } else if (headerItem.getItemKey().equalsIgnoreCase("Content-Length")) {
          urlMetadataOut.setContentLength(Integer.parseInt((headerItem.getItemValue())));
        } else if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_URLFP)) {
          urlMetadataOut.setUrlFP(Long.parseLong(headerItem.getItemValue()));
        } else if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_HostFP)) {
          urlMetadataOut.setHostFP(Long.parseLong(headerItem.getItemValue()));
        } else if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_Signature)) {
          urlMetadataOut.setSignature(headerItem.getItemValue());
        } else if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_CrawlNumber)) {
          urlMetadataOut.setCrawlNumber(Integer.parseInt(headerItem.getItemValue()));
        } else if (headerItem.getItemKey().equalsIgnoreCase(ARCFileHeader_FetchTimeStamp)) {
          urlMetadataOut.setLastFetchTimestamp(Long.parseLong(headerItem.getItemValue()));
        }
      }

      if (output != null) {
        output.collect(key, urlMetadataOut);
      }
    }
    // catch any type of exception and log it ONLY for now
    catch (Exception e) {

    }
  }

  public void run(RecordReader<Text, ArcFileItem> input, OutputCollector<Text, CrawlURLMetadata> output,
      Reporter reporter) throws IOException {

    int lastValidPos = 0;
    try {
      // allocate key & value instances that are re-used for all entries
      Text key = input.createKey();
      ArcFileItem value = input.createValue();

      while (input.next(key, value)) {

        lastValidPos = value.getArcFilePos();

        // map pair to output
        map(key, value, output, reporter);
      }
    } catch (IOException e) {

      String errorMessage = "Exception processing Split:" + _splitDetails + " Exception:"
          + StringUtils.stringifyException(e);
      LOG.error(errorMessage);

      if (_attemptID.getId() == 0 || (lastValidPos == 0 && _attemptID.getId() != _maxAttemptsPerTask - 1)) {
        throw new IOException(errorMessage);
      }

      // and just ignore the message
    } catch (Throwable e) {
      String errorMessage = "Unknown Exception processing Split:" + _splitDetails + " Exception:"
          + StringUtils.stringifyException(e);
      LOG.error(errorMessage);
      // if attempt number is not max attempt number configured...
      if (_attemptID.getId() != _maxAttemptsPerTask - 1) {
        // then bubble up exception
        throw new IOException(errorMessage);
      }

    } finally {
      close();
    }

  }

  @org.junit.Test
  public void testMapper() throws Exception {

    final ArcFileReader reader = new ArcFileReader();

    Thread thread = new Thread(new Runnable() {

      public void run() {
        try {

          while (reader.hasMoreItems()) {
            ArcFileItem item = new ArcFileItem();

            reader.getNextItem(item);

            map(new Text(item.getUri()), item, null, null);

          }
          LOG.info("NO MORE ITEMS... BYE");
        } catch (IOException e) {
          LOG.error(StringUtils.stringifyException(e));
        }
      }

    });

    // run the thread ...
    thread.start();

    File file = new File("/Users/rana/Downloads/1213886083018_0.arc.gz");
    ReadableByteChannel channel = Channels.newChannel(new FileInputStream(file));

    try {

      int totalBytesRead = 0;
      for (;;) {

        ByteBuffer buffer = ByteBuffer.allocate(ArcFileReader.DEFAULT_BLOCK_SIZE);

        int bytesRead = channel.read(buffer);
        LOG.info("Read " + bytesRead + " From File");

        if (bytesRead == -1) {
          reader.finished();
          break;
        } else {
          buffer.flip();
          totalBytesRead += buffer.remaining();
          reader.available(buffer);
        }
      }
    } finally {
      channel.close();
    }

    // now wait for thread to die ...
    LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
    thread.join();
    LOG.info("Done Reading File.... ArcFileThread to DIED");

  }
}