/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import com.google.common.collect.ImmutableList;
/**
* A Task, consisting of a set of map-reduce steps that are run in sequence
*
* @author rana
*
*/
public abstract class CrawlPipelineTask extends CrawlPipelineStep implements
    Tool {

  // Fixed: was LogFactory.getLog(CrawlPipelineStep.class) (copy-paste), which
  // misattributed this class's log output to its superclass.
  private static final Log LOG = LogFactory.getLog(CrawlPipelineTask.class);

  Configuration _conf;

  // Steps run in insertion order; the last step added is the "final" step
  // whose output may be promoted to this task's own output dir.
  ArrayList<CrawlPipelineStep> _steps = new ArrayList<CrawlPipelineStep>();

  // Tasks that must complete successfully (via run(_args)) before this
  // task's own steps are executed.
  ArrayList<CrawlPipelineTask> _dependencyList = new ArrayList<CrawlPipelineTask>();

  // raw command line args captured by initTask and forwarded to dependencies
  protected String _args[];

  protected Path _identityBasePath;

  protected Path _rootOutputDir;

  /**
   * constructor for top level task
   *
   * @param conf hadoop configuration; also installed as the global
   *          CrawlEnvironment config via setConf
   * @param taskDescription human readable description of the task
   * @param outputDirName name of this task's output directory
   * @throws IOException
   */
  public CrawlPipelineTask(Configuration conf, String taskDescription, String outputDirName)
      throws IOException {
    super(null, taskDescription, outputDirName);
    setConf(conf);
  }

  /**
   * constructor for a task running as a step in another task; the configuration
   * and task identity base path are inherited from the parent
   *
   * @param parentTask the enclosing task
   * @param taskDescription human readable description of the task
   * @param outputDirName name of this task's output directory
   * @throws IOException
   */
  public CrawlPipelineTask(CrawlPipelineTask parentTask, String taskDescription,
      String outputDirName) throws IOException {
    super(parentTask, taskDescription, outputDirName);
    setConf(parentTask.getConf());
    _identityBasePath = parentTask.getTaskIdentityBasePath();
  }

  /**
   * append a step to this task's pipeline (steps execute in insertion order)
   *
   * @return this, so addStep calls can be chained
   */
  public CrawlPipelineTask addStep(CrawlPipelineStep step) throws IOException {
    _steps.add(step);
    return this;
  }

  /**
   * register a task that must run before this task's own steps; the class must
   * expose a public no-arg constructor.
   *
   * NOTE(review): the dependency is built via newInstance(), so it never
   * receives this task's Configuration or args at construction time - it only
   * gets args later through run(_args) in runTask. Verify the concrete
   * subclasses establish their own Configuration in their no-arg constructor.
   *
   * @throws IOException if the dependency cannot be instantiated
   */
  public void addTaskDependency(Class<? extends CrawlPipelineTask> taskClass)
      throws IOException {
    try {
      _dependencyList.add(taskClass.newInstance());
    } catch (Exception e) {
      LOG.error("Failed to create dependent task of type:" + taskClass
          + " Error:" + StringUtils.stringifyException(e));
      throw new IOException(e);
    }
  }

  /**
   * callback invoked by runTask when the final step has completed. If
   * promoteFinalStepOutput() returns true, every file under the final step's
   * output dir is moved (renamed) into this task's own output dir, replacing
   * any previously promoted contents.
   *
   * @param finalStep the last step of this task's pipeline
   * @param finalStepOutputDir that step's output directory
   */
  protected void finalStepComplete(CrawlPipelineStep finalStep,
      Path finalStepOutputDir) throws IOException {
    if (promoteFinalStepOutput()) {
      FileSystem fs = getFileSystem();
      Path taskAsStepOutputDir = getOutputDir();
      fs.mkdirs(taskAsStepOutputDir);
      getLogger().info(
          "finalStepComplete callback triggered - promoting output from:"
              + finalStep.getDescription() + " to output dir:"
              + taskAsStepOutputDir);
      // copy everything from final step into task output ...
      // globStatus returns null (not an empty array) when nothing matches the
      // pattern - the original code would NPE on files.length in that case.
      FileStatus files[] = fs.globStatus(new Path(finalStepOutputDir, "*"));
      if (files != null && files.length != 0) {
        // wipe previously promoted output before moving the new files in
        fs.delete(taskAsStepOutputDir, true);
        fs.mkdirs(taskAsStepOutputDir);
        for (FileStatus file : files) {
          fs.rename(file.getPath(),
              new Path(taskAsStepOutputDir, file.getPath().getName()));
        }
      }
    }
  }

  @Override
  public Configuration getConf() {
    return _conf;
  }

  /** file system hosting the task identity base path */
  @Override
  public FileSystem getFileSystem() throws IOException {
    return FileSystem.get(getTaskIdentityBasePath().toUri(), _conf);
  }

  /**
   * scan the task identity base path for candidate database directories
   * (children whose names parse as long timestamps) and return the largest
   * timestamp found
   *
   * @return the latest timestamp, or -1 if no numeric candidates exist
   */
  public long getLatestDatabaseTimestamp() throws IOException {
    FileSystem fs = FileSystem.get(getTaskIdentityBasePath().toUri(), _conf);
    LOG.info("Scanning for Database Candidates in:" + getTaskIdentityBasePath());
    // globStatus returns null when nothing matches - guard against NPE
    FileStatus candidates[] = fs.globStatus(new Path(getTaskIdentityBasePath(),
        "*"));
    long candidateTimestamp = -1L;
    if (candidates != null) {
      for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
          long timestamp = Long.parseLong(candidate.getPath().getName());
          if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
            candidateTimestamp = timestamp;
          }
        } catch (Exception e) {
          // non-numeric child name - not a database candidate, skip it
          LOG.error("Skipping Path:" + candidate.getPath());
        }
      }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
  }

  /**
   * Search Graph and return context based output path for given step. Searches
   * this task's own steps first (recursing into nested tasks), then registered
   * task dependencies.
   *
   * @param targetClass step class to locate
   * @return the step's output dir, or null if the class is not in the graph
   */
  @Override
  public Path getOutputDirForStep(Class<? extends CrawlPipelineStep> targetClass)
      throws IOException {
    Path pathOut = null;
    for (CrawlPipelineStep step : _steps) {
      if (step.getClass() == targetClass) {
        pathOut = step.getOutputDir();
      } else if (step instanceof CrawlPipelineTask) {
        // recurse into nested tasks
        pathOut = ((CrawlPipelineTask) step).getOutputDirForStep(targetClass);
      }
      if (pathOut != null) {
        break;
      }
    }
    if (pathOut == null) {
      // not found locally - consult dependency tasks
      for (CrawlPipelineTask dependency : _dependencyList) {
        pathOut = dependency.getOutputDirForStep(targetClass);
        if (pathOut != null)
          break;
      }
    }
    return pathOut;
  }

  /** output dir for a step given its directory name (no graph search) */
  public Path getOutputDirForStep(String stepName) throws IOException {
    return new Path(getTaskOutputBaseDir(), stepName);
  }

  @Override
  public String getPipelineStepName() {
    return "Task:" + _name;
  }

  /** immutable snapshot of this task's steps, in execution order */
  public List<CrawlPipelineStep> getSteps() {
    return ImmutableList.copyOf(_steps);
  }

  public List<CrawlPipelineTask> getTaskDependencies() {
    return _dependencyList;
  }

  public Path getTaskIdentityBasePath() {
    return _identityBasePath;
  }

  public void setTaskIdentityBasePath(Path path) {
    _identityBasePath = path;
  }

  public Path getRootOutputDir() {
    return _rootOutputDir;
  }

  public void setRootOutputDir(Path rootOutputDir) {
    _rootOutputDir = rootOutputDir;
  }

  /**
   * base dir for this task's step outputs: the parent task's base dir plus this
   * task's output dir name, or rooted at _rootOutputDir for a top level task
   */
  public Path getTaskOutputBaseDir() throws IOException {
    if (getTask() != null) {
      return new Path(getTask().getTaskOutputBaseDir(), getOutputDirName());
    } else {
      return new Path(_rootOutputDir, getOutputDirName());
    }
  }

  @Override
  public long getTaskIdentityId() throws IOException {
    return getLatestDatabaseTimestamp();
  }

  @Override
  public Path getTaskIdentityPath() throws IOException {
    return new Path(getTaskIdentityBasePath(), Long
        .toString(getLatestDatabaseTimestamp()));
  }

  /**
   * a unique scratch dir for the given step under mapred.temp.dir
   * (timestamp-suffixed so repeated runs don't collide)
   */
  public Path getTempDirForStep(CrawlPipelineStep step) throws IOException {
    Path tempOutputDir = new Path(CrawlEnvironment.getHadoopConfig().get(
        "mapred.temp.dir", ".")
        + "/" + step.getOutputDirName() + "-" + System.currentTimeMillis());
    return tempOutputDir;
  }

  @Override
  protected boolean isTask() {
    return true;
  }

  /** overload to parse arguments **/
  protected void parseArgs() throws IOException {
  }

  /**
   * overload to initialize task after argument parsing ...
   *
   * @param args raw command line args, retained for dependency tasks
   * @throws IOException
   */
  public void initTask(String[] args) throws IOException {
    // save args ...
    _args = args;
    // parse args ..
    parseArgs();
  }

  /**
   * whether the final step's output should be moved into this task's output
   * dir on completion; subclasses override to opt out
   */
  protected boolean promoteFinalStepOutput() {
    return true;
  }

  @Override
  public int run(String[] args) throws Exception {
    // init task
    initTask(args);
    // run task next
    return runTask();
  }

  /** adapter so a task can run as a step of an enclosing task */
  @Override
  public void runStep(Path unused) throws IOException {
    try {
      int result = runTask();
      if (result != 0) {
        throw new IOException(getDescription() + " Failed With ErrorCode:"
            + result);
      }
    } catch (Exception e) {
      throw new IOException(getDescription() + " Failed With Exception:"
          + StringUtils.stringifyException(e));
    }
  }

  /** the last step added to this task, or null if the task has no steps */
  public CrawlPipelineStep getFinalStep() {
    if (!_steps.isEmpty()) {
      return _steps.get(_steps.size() - 1);
    }
    return null;
  }

  /**
   * run dependency tasks, then each incomplete step in order; invokes the
   * finalStepComplete callback once the final step is reached
   *
   * @return 0 on success, non-zero on failure
   */
  public int runTask() throws Exception {
    // dependencies run first - any failure aborts the whole task
    for (CrawlPipelineTask dependency : _dependencyList) {
      getLogger().info(
          getDescription() + " - Running Dependency:"
              + dependency.getDescription());
      int result = dependency.run(_args);
      if (result != 0) {
        getLogger().error(
            "Dependency: " + dependency.getDescription()
                + " failed to complete successfully!");
        return result;
      }
    }
    try {
      getLogger().info(getDescription() + " - Iterating Steps");
      if (!_steps.isEmpty()) {
        CrawlPipelineStep finalStep = getFinalStep();
        for (CrawlPipelineStep step : _steps) {
          if (!step.isComplete()) {
            if (!step.isRunnable()) {
              getLogger().info(
                  getDescription() + " - Step:" + step.getName()
                      + " is not runnable!");
              return 1;
            } else {
              // nested tasks need args propagated before they execute
              if (step.isTask()) {
                ((CrawlPipelineTask) step).initTask(_args);
              }
              getLogger().info(
                  getDescription() + " - Running Step:" + step.getName());
              step.doStep();
              getLogger().info(
                  getDescription() + " - Finished Running Step:"
                      + step.getName());
            }
          }
          // NOTE: fires even when the final step was already complete, so a
          // re-run still (re)promotes output into the task's output dir
          if (step == finalStep) {
            getLogger().info(
                getDescription() + " Final Step Complete - Calling Finalize");
            finalStepComplete(step, step.getOutputDir());
          }
        }
      }
    } catch (IOException e) {
      getLogger().error(
          getDescription() + " threw Exception:"
              + StringUtils.stringifyException(e));
      return 1;
    }
    return 0;
  }

  /** stores the config and installs it as the global CrawlEnvironment config */
  @Override
  public void setConf(Configuration conf) {
    _conf = conf;
    CrawlEnvironment.setHadoopConfig(_conf);
  }

  /**
   * get any passed in args
   *
   * @return the args captured by initTask, or null if initTask never ran
   */
  public String[] getArgs() {
    return _args;
  }
}