CrawlPipelineStep.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * A single map-reduce step in a sequence of steps encapsulated by a Task
 * 
 * @author rana
 * 
 */
public abstract class CrawlPipelineStep {

  /**
   * Builds the fully qualified output directory path for the given database
   * id, i.e. {@code basePath/<databaseId>} qualified against the file system
   * that owns {@code basePath}.
   * <p>
   * The path is qualified syntactically, so the directory does not have to
   * exist yet (output directories are normally resolved before the step that
   * creates them has run).
   * 
   * @param conf       configuration used to look up the file system
   * @param basePath   parent directory of the output directory
   * @param databaseId id used as the name of the output directory
   * @return the qualified path {@code basePath/<databaseId>}
   * @throws IOException if the file system for {@code basePath} cannot be
   *                     obtained
   */
  public static Path makeUniqueFullyQualifiedOutputDirPath(Configuration conf,
      Path basePath, long databaseId) throws IOException {
    FileSystem fs = FileSystem.get(basePath.toUri(), conf);
    Path uniquePath = makeUniqueOutputDirPath(basePath, databaseId);
    // do NOT stat the path here: getFileStatus fails with a
    // FileNotFoundException when the output directory has not been created
    // yet, which is the common case for a step that has not run.
    return fs.makeQualified(uniquePath);
  }

  /**
   * Builds the (unqualified) output directory path for the given database id.
   * 
   * @param basePath   parent directory of the output directory
   * @param databaseId id used as the name of the output directory
   * @return {@code basePath/<databaseId>}
   * @throws IOException never thrown by this implementation; declared for
   *                     compatibility with existing callers
   */
  public static Path makeUniqueOutputDirPath(Path basePath, long databaseId)
      throws IOException {
    return new Path(basePath, Long.toString(databaseId));
  }

  /** the task that owns this step (may be null, see getDescription) */
  public CrawlPipelineTask _task;

  /** the name of this step, as returned by getName */
  public String            _name;

  /** the name of this step's output directory, as returned by getOutputDirName */
  public String            _outputDirName;

  private static final Log LOG = LogFactory.getLog(CrawlPipelineStep.class);

  /**
   * @param task          the task that owns this step
   * @param name          the name of this step
   * @param outputDirName the name of this step's output directory
   */
  public CrawlPipelineStep(CrawlPipelineTask task, String name,
      String outputDirName) {
    _task = task;
    _name = name;
    _outputDirName = outputDirName;
  }

  /**
   * Searches for a task whose runtime class is exactly {@code classType}
   * (subclasses do not match). Starting at this step's task, each task on the
   * chain of parents is checked, followed by its dependencies (and,
   * recursively, whatever the dependency's own search finds).
   * 
   * @param classType the exact task class to look for
   * @return the first matching task, or null if none is found. The result is
   *         cast unchecked to the caller's expected type, so the caller must
   *         ask for a type that is compatible with {@code classType}.
   */
  @SuppressWarnings("unchecked")
  public <Type extends CrawlPipelineTask> Type findTaskOfType(
      Class<? extends CrawlPipelineTask> classType) {
    CrawlPipelineTask current = getTask();
    while (current != null) {
      if (current.getClass() == classType) {
        return (Type) current;
      }
      // no match at this level ... check this task's dependencies
      for (CrawlPipelineTask dependency : current.getTaskDependencies()) {
        if (dependency.getClass() == classType) {
          return (Type) dependency;
        }
        CrawlPipelineTask searchResult = dependency.findTaskOfType(classType);
        if (searchResult != null) {
          return (Type) searchResult;
        }
      }
      // walk up to the parent task
      current = current.getTask();
    }
    return null;
  }

  /**
   * Resolves the directory under which this step's output directory is
   * created.
   * 
   * @return the task's per-step output dir, or the task's own base output dir
   *         when this step is the final step of a task that promotes its
   *         final step output
   * @throws IOException if the task fails to resolve the path
   */
  public Path getBaseOutputDirForStep() throws IOException {
    // if the task does not promote its output or this is not the last step in the workflow 
    // for this task ... 
    if (!_task.promoteFinalStepOutput() || this != _task.getFinalStep()) {
      // then return a path that is composed of the task's base dir, and the step output dir 
      return _task.getOutputDirForStep(getOutputDirName());
    }
    else { 
      // otherwise, if this task does promote the output of the final step in the task... 
      // return the task's base dir as the output path 
      // (this is to accommodate the fact that renames in S3N are not lightweight like they
      //  are on HDFS. As a matter of fact, renames/relocation of files > 5GB in size are
      //  not even possible :-( 
      return _task.getTaskOutputBaseDir();
    }
  }

  /**
   * @return the configuration of the owning task
   * @throws IOException if the task fails to provide it
   */
  public Configuration getConf() throws IOException {
    return _task.getConf();
  }

  /**
   * 
   * @return list of dependencies (paths) required for this step to run. The
   *         default implementation returns an empty array.
   * @throws IOException
   */
  public Path[] getDependencies() throws IOException {
    return new Path[0];
  }

  /**
   * @return a description of this step for log messages: the task's
   *         description followed by the step name, or just the step name if
   *         this step has no task
   */
  public String getDescription() {
    if (_task != null) {
      return _task.getDescription() + " - Step(" + getName() + "):";
    } else {
      return getName();
    }
  }

  /**
   * @return the file system of the owning task
   * @throws IOException if the task fails to provide it
   */
  public FileSystem getFileSystem() throws IOException {
    return _task.getFileSystem();
  }

  /**
   * @return the logger that this step's status messages are written to
   */
  public abstract Log getLogger();

  /**
   * @return the name of this step
   */
  public String getName() {
    return _name;
  }

  /**
   * @return this step's output directory: the base output dir for the step
   *         combined with the task's latest database timestamp
   * @throws IOException if the task fails to resolve the path or timestamp
   */
  public Path getOutputDir() throws IOException {
    return makeUniqueOutputDirPath(getBaseOutputDirForStep(), _task
        .getLatestDatabaseTimestamp());
  }

  /**
   * get output dir for a given class type by walking the node graph from the
   * top
   * 
   * @param targetClass the step class whose output dir is requested
   * @return whatever the root task resolves for {@code targetClass}
   * @throws IOException
   */
  public Path getOutputDirForStep(Class<? extends CrawlPipelineStep> targetClass)
      throws IOException {
    CrawlPipelineTask rootTask = getRootTask();
    return rootTask.getOutputDirForStep(targetClass);
  }

  /**
   * @return the name of this step's output directory
   */
  public String getOutputDirName() {
    return _outputDirName;
  }

  /**
   * @return the name of this step (same value as getName)
   */
  public String getPipelineStepName() {
    return _name;
  }

  /**
   * Walks up the chain of owning tasks until a task without a parent is
   * reached.
   * <p>
   * NOTE: requires getTask() to be non-null for this step; a step without a
   * task fails here with a NullPointerException.
   * 
   * @return the top-most task above this step
   */
  public CrawlPipelineTask getRootTask() {
    CrawlPipelineTask current = getTask();
    while (current.getTask() != null)
      current = current.getTask();
    return current;
  }

  /**
   * @return the task that owns this step
   */
  public CrawlPipelineTask getTask() {
    return _task;
  }

  /**
   * @return the identity id of the owning task
   * @throws IOException if the task fails to provide it
   */
  public long getTaskIdentityId() throws IOException {
    return _task.getTaskIdentityId();
  }

  /**
   * @return the identity path of the owning task
   * @throws IOException if the task fails to provide it
   */
  public Path getTaskIdentityPath() throws IOException {
    return _task.getTaskIdentityPath();
  }

  /*
  public Path getTempDir() throws IOException {
    return _task.getTempDirForStep(this);
  }
  */

  /**
   * A step counts as complete when its output directory exists. The outcome
   * is also written to the step's logger.
   * 
   * @return true if the output directory exists
   * @throws IOException if the file system lookup fails
   */
  public boolean isComplete() throws IOException {
    FileSystem fs = getFileSystem();
    Path outputDir = getOutputDir();
    if (!fs.exists(outputDir)) {
      getLogger().info(
          "Output Dir at: " + outputDir + " not found for:" + getDescription());
      return false;
    }
    getLogger().info(
        "Output Dir found at: " + outputDir + ". Step:" + getDescription()
            + " is complete.");
    return true;
  }

  /**
   * A step is runnable when every path returned by getDependencies exists.
   * The check stops at the first missing path.
   * 
   * @return true if all dependencies exist
   * @throws IOException if the file system lookup fails
   */
  public boolean isRunnable() throws IOException {

    getLogger().info("Checking dependencies for:" + getDescription());

    FileSystem fs = getFileSystem();

    for (Path path : getDependencies()) {
      if (!fs.exists(path)) {
        getLogger().info(
            "File:" + path + " does NOT exist." + getDescription()
                + " is not Runnable");

        return false;
      }
    }
    getLogger().info("All dependencies for " + getDescription() + " met.");

    return true;
  }

  /**
   * @return false in this base implementation
   */
  protected boolean isTask() {
    return false;
  }

  /**
   * Performs the actual work of this step.
   * 
   * @param outputPathLocation the directory the step's output is destined
   *                           for (doStep passes getOutputDir())
   * @throws IOException if the step fails
   */
  public abstract void runStep(Path outputPathLocation) throws IOException;

  /**
   * Resolves the output directory, runs the step against it and logs the
   * start and the end of the run.
   * 
   * @throws IOException if resolving the output dir or running the step fails
   */
  void doStep() throws IOException {

    Path outputDir = getOutputDir();

    getLogger().info(
        "Running " + getDescription() + " finaloutput:"
            + outputDir);

    runStep(outputDir);

    getLogger().info(
        "Finished running: " + getDescription());

  }
}