/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.pagerank.slave;
import java.io.File;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.CallbackWithResult;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.service.crawler.filters.SuperDomainFilter;
import org.commoncrawl.service.pagerank.Constants;
import org.commoncrawl.service.pagerank.IterationInfo;
import org.commoncrawl.service.pagerank.PRMasterState;
import org.commoncrawl.service.pagerank.PageRankJobConfig;
import org.commoncrawl.service.pagerank.slave.PageRankUtils.PRValueMap;
import org.commoncrawl.util.CCStringUtils;
public class BeginPageRankTask extends PageRankTask<BeginPageRankTask.BeginPageRankTaskResult> {
private static final Log LOG = LogFactory.getLog(BeginPageRankTask.class);
private PageRankJobConfig _config;
private int _prMasterStatus;
private boolean _isCancelled = false;
public BeginPageRankTask(PageRankJobConfig jobConfig,int serverStatus, PageRankSlaveServer server,CallbackWithResult<BeginPageRankTaskResult> completionCallback) {
super(server,BeginPageRankTask.BeginPageRankTaskResult.class, completionCallback);
_config = jobConfig;
_prMasterStatus = serverStatus;
}
public static class BeginPageRankTaskResult extends PageRankTask.PageRankTaskResult {
public PRValueMap _valueMap = null;
}
@Override
protected void cancelTask() {
_isCancelled = true;
}
@Override
protected BeginPageRankTaskResult runTask() throws IOException {
BeginPageRankTaskResult result = new BeginPageRankTaskResult();
try {
// create job local directory if necessary
_server.getActiveJobLocalPath().mkdirs();
FileSystem fileSystem = _server.getFileSystem();
// figure out if we are going to load values from base location or job config (based on iteration number)
Path rangeRemotePath = new Path(_config.getInputValuesPath(),PageRankUtils.makeUniqueFileName(Constants.PR_RANGE_FILE_PREFIX,0,_server.getNodeIndex()));
Path rangeLocalPath = PageRankUtils.makeRangeFilePath(_server.getActiveJobLocalPath(), _server.getNodeIndex());
Path idsRemotePath = new Path(_config.getInputValuesPath(),PageRankUtils.makeUniqueFileName(Constants.PR_IDS_FILE_PREFIX,0,_server.getNodeIndex()));
Path idsLocalPath = new Path(PageRankUtils.makeIdsFilePath(_server.getActiveJobLocalPath(), _server.getNodeIndex()).getAbsolutePath());
Path outlinksFileRemotePath = new Path(_config.getOutlinksDataPath(),PageRankUtils.makeUniqueFileName(Constants.PR_OUTLINKS_FILE_PREFIX,0,_server.getNodeIndex()));
Path outlinksFileLocalPath = new Path(new File(_server.getActiveJobLocalPath(),PageRankUtils.makeUniqueFileName(Constants.PR_OUTLINKS_FILE_PREFIX,0,_server.getNodeIndex())).getAbsolutePath());
Path valuesRemotePath = null;
if (_config.getIterationNumber() == 0) {
// fetch values from base values path
valuesRemotePath = new Path(_config.getInputValuesPath(),PageRankUtils.makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX,0,_server.getNodeIndex()));
LOG.info("Iteration Number is 0. Using Values File:" + valuesRemotePath);
}
else {
// fetch latest values from job path (hdfs) based on last iteration number ...
valuesRemotePath = new Path(_config.getJobWorkPath(),PageRankUtils.makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX,_config.getIterationNumber() - 1,_server.getNodeIndex()));
LOG.info("Iteration Number is:" + _config.getIterationNumber() + ". Using Values File:" + valuesRemotePath);
}
/*
Path localValuesFilePath = new Path(new File(_server.getActiveJobLocalPath(),PageRankUtils.makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX,_config.getIterationNumber(),_server.getNodeIndex())).getAbsolutePath());
// copy the files to the local directory ...
FileStatus rangeFileStatus = fileSystem.getFileStatus(rangeRemotePath);
File rangeLocalFile = new File(rangeLocalPath.toString());
if (rangeLocalFile.exists() == false || rangeLocalFile.length() != rangeFileStatus.getLen()) {
rangeLocalFile.delete();
LOG.info("Copying Range File:" + rangeRemotePath + " to " + rangeLocalPath);
fileSystem.copyToLocalFile(rangeRemotePath, rangeLocalPath);
}
else {
LOG.info("Skipping Copy of Range File:" + rangeRemotePath + " to " + rangeLocalPath);
}
FileStatus idFileStatus = fileSystem.getFileStatus(idsRemotePath);
File idLocalFile = new File(idsLocalPath.toString());
if (idLocalFile.exists() == false || idLocalFile.length() != idFileStatus.getLen()) {
LOG.info("Copying Ids File:" + idsRemotePath + " to " + idsLocalPath);
fileSystem.copyToLocalFile(idsRemotePath, idsLocalPath);
}
else {
LOG.info("Skipping Copying Ids File:" + idsRemotePath + " to " + idsLocalPath);
}
*/
FileStatus outlinksFileStatus = fileSystem.getFileStatus(outlinksFileRemotePath);
File outlinksLocalFile = new File(outlinksFileLocalPath.toString());
if (outlinksLocalFile.exists() == false || outlinksLocalFile.length() != outlinksFileStatus.getLen()) {
LOG.info("Copying outlinks File:" + outlinksFileRemotePath + " to " + outlinksLocalFile);
fileSystem.copyToLocalFile(outlinksFileRemotePath,outlinksFileLocalPath);
}
else {
LOG.info("Skipping Copying outlinks File:" + outlinksFileRemotePath + " to " + outlinksLocalFile);
}
/*
FileStatus valuesFileStatus = fileSystem.getFileStatus(valuesRemotePath);
File valuesLocalFile = new File(localValuesFilePath.toString());
if (valuesLocalFile.exists() == false || valuesLocalFile.length() != valuesFileStatus.getLen()) {
LOG.info("Copying values File:" + valuesRemotePath + " to " + valuesLocalFile);
fileSystem.copyToLocalFile(valuesRemotePath,localValuesFilePath);
}
else {
LOG.info("Skipping Copying values File:" + valuesRemotePath + " to " + valuesLocalFile);
}
*/
// now load the values map ...
result._valueMap = new PageRankUtils.PRValueMap();
//result._valueMap.open(fileSystem,valuesRemotePath, PageRankUtils.makeRangeFilePath(_server.getActiveJobLocalPath(), _server.getNodeIndex()));
boolean valuesFileMissing = false;
if (_server.getActiveJobConfig().getIterationNumber() != 0 && !_server.getFileSystem().exists(valuesRemotePath)) {
LOG.error("Values File Missing for Iteration:" + _server.getActiveJobConfig().getIterationNumber());
valuesFileMissing = true;
// revert to iteration zero values file ...
valuesRemotePath = new Path(_config.getInputValuesPath(),PageRankUtils.makeUniqueFileName(Constants.PR_VALUE_FILE_PREFIX,0,_server.getNodeIndex()));
}
result._valueMap.open(fileSystem,valuesRemotePath, rangeRemotePath);
// ok now if iteration number is non-zero,
// recalculate rank from previous iteration's data ...
if (_config.getIterationNumber() != 0 && valuesFileMissing) {
// load data from previous iteration ...
int iterationNumberToLoadFrom = _config.getIterationNumber() - 1;
// ok figure out what state master is in
if (_prMasterStatus == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
// use current iteration number data
iterationNumberToLoadFrom = 0;
LOG.info("Master is in CALCULATION PHASE. SKIP LOAD OF VALUEMAP");
}
// in the distribution case ... check to see if checkpoint file is present ...
else if (_prMasterStatus == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
Path checkpointFilePath = PageRankUtils.getCheckpointFilePath(new Path(_server.getActiveJobConfig().getJobWorkPath()),
IterationInfo.Phase.DISTRIBUTE,
_server.getActiveJobConfig().getIterationNumber(),
_server.getNodeIndex());
// ok checkpoint file exists, use current iteration number to load data
if (_server.getFileSystem().exists(checkpointFilePath)) {
LOG.info("Checkpoint file exists. SKIP LOAD OF VALUEMAP");
iterationNumberToLoadFrom = 0;
}
}
if (iterationNumberToLoadFrom != 0) {
// load super domain filter
LOG.info("Initializing SuperDomain Filter");
SuperDomainFilter superDomainFilter = new SuperDomainFilter();
superDomainFilter.loadFromPath(_server.getDirectoryServiceAddress(), CrawlEnvironment.ROOT_SUPER_DOMAIN_PATH, false);
LOG.info("Starting Calculate Task to load value map - Using Iteration Number:" + iterationNumberToLoadFrom);
// first zero value map values ...
result._valueMap.zeroValues();
PageRankUtils.calculateRank(
_server.getConfig(),
_server.getFileSystem(),
result._valueMap,
_server.getActiveJobLocalPath(),
_server.getActiveJobConfig().getJobWorkPath(),
_server.getNodeIndex(),
_server.getBaseConfig().getSlaveCount(),
iterationNumberToLoadFrom,
superDomainFilter,
new PageRankUtils.ProgressAndCancelCheckCallback() {
@Override
public boolean updateProgress(final float percentComplete) {
_percentComplete = percentComplete;
return BeginPageRankTask.this.isCancelled();
}
});
}
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
throw e;
}
return result;
}
@Override
public String getDescription() {
return "Begin PageRank Task";
}
@Override
public synchronized boolean isCancelled() {
return _isCancelled;
}
}