/** * Copyright 2012 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.mapred.pipelineV3; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; /** * A single map-reduce step in a sequence of steps encapsulated by a Task * * @author rana * */ public abstract class CrawlPipelineStep { public static Path makeUniqueFullyQualifiedOutputDirPath(Configuration conf,Path basePath, long databaseId) throws IOException { FileSystem fs = FileSystem.get(basePath.toUri(),conf); Path uniquePath = new Path(basePath, Long.toString(databaseId)); return fs.getFileStatus(uniquePath).getPath(); } public static Path makeUniqueOutputDirPath(Path basePath, long databaseId) throws IOException { return new Path(basePath, Long.toString(databaseId)); } public CrawlPipelineTask _task; public String _name; public String _outputDirName; private static final Log LOG = LogFactory.getLog(CrawlPipelineStep.class); public CrawlPipelineStep(CrawlPipelineTask task, String name, String outputDirName) { _task = task; _name = name; _outputDirName = outputDirName; } public <Type extends CrawlPipelineTask> Type findTaskOfType( Class<? extends CrawlPipelineTask> classType) { CrawlPipelineTask current = getTask(); while (current != null) { if (current.getClass() == classType) return (Type) current; else for (CrawlPipelineTask dependency : current.getTaskDependencies()) { if (dependency.getClass() == classType) { return (Type) dependency; } else { CrawlPipelineTask searchResult = dependency .findTaskOfType(classType); if (searchResult != null) return (Type) searchResult; } } current = current.getTask(); } return null; } public Path getBaseOutputDirForStep() throws IOException { // if the task does not promote its output or this is not the last step in the workflow // for this task ... if (!_task.promoteFinalStepOutput() || this != _task.getFinalStep()) { // then return a path that is composed of the tasks's base dir, and the step output dir return _task.getOutputDirForStep(getOutputDirName()); } else { // otherwise, if this task does promote the output of the final step in the task... // return the task's base dir as the output path // (this is accomodate the fact that renames in S3N are not lightweight like they // are on HDFS. As a matter of fact, renames/relocation of files > 5GB in size are // even possible :-( return _task.getTaskOutputBaseDir(); } } public Configuration getConf() throws IOException { return _task.getConf(); } /** * * @return list of dependcies requires for this task to run * @throws IOException */ public Path[] getDependencies() throws IOException { return new Path[0]; } public String getDescription() { if (_task != null) { return _task.getDescription() + " - Step(" + getName() + "):"; } else { return getName(); } } public FileSystem getFileSystem() throws IOException { return _task.getFileSystem(); } public abstract Log getLogger(); public String getName() { return _name; } public Path getOutputDir() throws IOException { return makeUniqueOutputDirPath(getBaseOutputDirForStep(), _task .getLatestDatabaseTimestamp()); } /** * get output dir for a given class type by walking the node graph from the * top * * @param targetClass * @return * @throws IOException */ public Path getOutputDirForStep(Class<? extends CrawlPipelineStep> targetClass) throws IOException { CrawlPipelineTask rootTask = getRootTask(); return rootTask.getOutputDirForStep(targetClass); } public String getOutputDirName() { return _outputDirName; } public String getPipelineStepName() { return _name; } public CrawlPipelineTask getRootTask() { CrawlPipelineTask current = getTask(); while (current.getTask() != null) current = current.getTask(); return current; } public CrawlPipelineTask getTask() { return _task; } public long getTaskIdentityId() throws IOException { return _task.getTaskIdentityId(); } public Path getTaskIdentityPath() throws IOException { return _task.getTaskIdentityPath(); } /* public Path getTempDir() throws IOException { return _task.getTempDirForStep(this); } */ public boolean isComplete() throws IOException { FileSystem fs = getFileSystem(); Path outputDir = getOutputDir(); if (!fs.exists(outputDir)) { getLogger().info( "Output Dir at: " + outputDir + " not found for:" + getDescription()); return false; } getLogger().info( "Output Dir found at: " + outputDir + ". Step:" + getDescription() + " is complete."); return true; } public boolean isRunnable() throws IOException { getLogger().info("Checking dependencies for:" + getDescription()); FileSystem fs = getFileSystem(); for (Path path : getDependencies()) { if (!fs.exists(path)) { getLogger().info( "File:" + path + " does NOT exist." + getDescription() + " is not Runnable"); return false; } } getLogger().info("All dependencies for " + getDescription() + " met."); return true; } protected boolean isTask() { return false; } public abstract void runStep(Path outputPathLocation) throws IOException; void doStep() throws IOException { Path outputDir = getOutputDir(); getLogger().info( "Running " + getDescription() + " finaloutput:" + outputDir); runStep(outputDir); getLogger().info( "Finished running: " + getDescription()); } }