/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.crawlhistory;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.BulkItemHistoryQuery;
import org.commoncrawl.protocol.BulkItemHistoryQueryResponse;
import org.commoncrawl.protocol.BulkUpdateData;
import org.commoncrawl.protocol.CrawlHistoryStatus;
import org.commoncrawl.protocol.CrawlMaster;
import org.commoncrawl.protocol.CrawlerHistoryService;
import org.commoncrawl.protocol.SingleItemHistoryQueryResponse;
import org.commoncrawl.protocol.SlaveHello;
import org.commoncrawl.protocol.SlaveRegistration;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.protocol.CrawlHistoryStatus.CheckpointState;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncContext;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.AsyncServerChannel;
import org.commoncrawl.rpc.base.internal.NullMessage;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.crawler.CrawlSegmentLog;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.ImmutableBuffer;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.BitUtils.BitStream;
import org.commoncrawl.util.time.Hour;
/**
 * Crawl history server: maintains a bloom filter of previously crawled URL
 * fingerprints, answers single-item and bulk history queries from crawlers,
 * and periodically checkpoints the filter (together with replayed crawl
 * segment logs) to the checkpoint file system.
 *
 * @author rana
 *
 */
public class CrawlHistoryServer extends CommonCrawlServer
implements CrawlerHistoryService, AsyncServerChannel.ConnectionCallback, AsyncClientChannel.ConnectionCallback,Timer.Callback {
InetAddress _masterIP;
int _masterPort = -1;
private int _numElements = -1;
private int _numHashFunctions = -1;
private int _bitsPerElement = -1;
private int _crawlNumber = 0;
private URLFPBloomFilter _bloomFilter = null;
/** server state record key **/
String HistoryServerStateKey = "HistoryServerState";
/** server state object **/
HistoryServerState _state;
/** checkpoint file system **/
FileSystem _checkpointFS;
/** segment log fs **/
FileSystem _segmentLogFS;
/** base storage path **/
Path _baseStoragePath = new Path(CrawlEnvironment.HDFS_HistoryServerBase);
/** segment storage dir **/
Path _segmentLogsStoragePath = new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory());
/** timers **/
Timer _handshakeTimer;
private Thread _checkpointThread = null;
private Semaphore _checkpointThreadSemaphore = new Semaphore(1);
private boolean _shutdownFlag = false;
/** checkpoint paths **/
private TreeSet<Path> _processedPaths = new TreeSet<Path>();
/** last checkpoint time **/
private long _lastCheckpointScanTime = -1;
private long _lastCheckpointFlushTime = -1;
private static final int CHECKPOINT_MUTEX_ACQUISITION_DELAY = 60000 * 2;
/** urls processed since last checkpoint **/
private AtomicInteger _urlsProcessedSinceCheckpoint = new AtomicInteger();
/** default checkpoint scan interval **/
private static final int DEFAULT_CHECKPOINT_SCAN_INTERVAL = 60000; // every minute
/** default checkpoint flush interval **/
private static final int DEFAULT_CHECKPOINT_FLUSH_INTERVAL = 10 * 60 * 1000; // 10 minutes
private int _checkpointScanInterval = DEFAULT_CHECKPOINT_SCAN_INTERVAL;
private int _checkpointFlushInterval = DEFAULT_CHECKPOINT_FLUSH_INTERVAL;
enum HandshakeState {
NOT_INITIATED,
INITIATING,
IDLE,
RENEWING
}
HandshakeState _handshakeState = HandshakeState.NOT_INITIATED;
boolean _connectedToMaster = false;
SlaveRegistration _registration = null;
private static final Log LOG = LogFactory.getLog(CrawlHistoryServer.class);
@Override
protected String getDefaultDataDir() {
return CrawlEnvironment.DEFAULT_DATA_DIR;
}
@Override
protected String getDefaultHttpInterface() {
return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
}
@Override
protected int getDefaultHttpPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_HTTP_PORT;
}
@Override
protected String getDefaultLogFileName() {
return "historyserver.log";
}
@Override
protected String getDefaultRPCInterface() {
return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
}
@Override
protected int getDefaultRPCPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_RPC_PORT;
}
@Override
protected String getWebAppName() {
return CrawlEnvironment.CRAWLER_HISTORY_WEBAPP_NAME;
}
/**
*
* @return the instance id based name for this host
* @throws IOException
*/
String getHostId() throws IOException {
if (_registration != null) {
return CrawlEnvironment.getCrawlerNameGivenId(_registration.getInstanceId());
}
throw new IOException("Invalid State. No Established Instance Id!");
}
AsyncClientChannel _masterChannel = null;
CrawlMaster.AsyncStub _masterRPCStub;
AsyncServerChannel _serverChannel = null;
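/** initialize the RPC channels to the master and for incoming clients, and schedule the handshake timer **/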
@Override
protected boolean initServer() {
try {
InetSocketAddress masterAddress = new InetSocketAddress(_masterIP,_masterPort);
InetSocketAddress serverAddress = new InetSocketAddress(_serverAddress.getAddress(),0);
_masterChannel = new AsyncClientChannel(getEventLoop(),serverAddress,masterAddress,this);
_masterRPCStub = new CrawlMaster.AsyncStub(_masterChannel);
_masterChannel.open();
// create server channel ...
_serverChannel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(),this);
// register RPC services it supports ...
registerService(_serverChannel,CrawlerHistoryService.spec);
_handshakeTimer = new Timer(5000,true,this);
getEventLoop().setTimer(_handshakeTimer);
return true;
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
return false;
}
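/** bring services online: reset server state, load the bloom filter, open the server channel, and start the checkpoint thread **/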
void startServices() {
_shutdownFlag = false;
// now initialize the server state record ...
try {
_state = new HistoryServerState();
_state.setCurrentCheckpointState(CheckpointState.ACTIVE);
_state.setCurrentCrawlNumber(_crawlNumber);
updateState();
// load bloom filter from disk if possible
loadBloomFilter();
_serverChannel.open();
// start the checkpoint thread ...
startCheckpointThread();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
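/** take services offline and wait for the checkpoint thread to exit before releasing the bloom filter **/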
void shutdownServices() {
_serverChannel.close();
_registration = null;
_handshakeState = HandshakeState.NOT_INITIATED;
// ok, wait to grab the checkpoint thread semaphore
LOG.info("Server Shutdown Detected. Waiting on checkpoint thread");
_shutdownFlag = true;
try {
_checkpointThreadSemaphore.acquireUninterruptibly();
LOG.info("Checkpoint thread semaphore acquired. Joining checkpoint thread ... ");
if (_checkpointThread != null) {
try {
_checkpointThread.join();
} catch (Exception e) {
LOG.error("Exception while waiting for Checkpoint Thread shutdown:" + CCStringUtils.stringifyException(e));
}
}
}
finally {
_checkpointThreadSemaphore.release();
_bloomFilter = null;
}
}
/** do a clean shutdown (if possible) **/
@Override
public void stop() {
shutdownServices();
super.stop();
}
/** update and persist state data structure **/
private void updateState(){
//NOOP now
}
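/**
 * parse command line arguments: --numElements, --numHashFunctions,
 * --numBitsPerElement, --crawlNumber, --checkpointFS, --masterIP,
 * --masterPort, --storageBase, --segmentLogsDir, --checkpointScanInterval,
 * --checkpointFlushInterval
 **/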
@Override
protected boolean parseArguements(String[] argv) {
for(int i=0; i < argv.length;++i) {
if (argv[i].equalsIgnoreCase("--numElements")) {
if (i+1 < argv.length) {
_numElements = Integer.parseInt(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--numHashFunctions")) {
if (i+1 < argv.length) {
_numHashFunctions = Integer.parseInt(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--numBitsPerElement")) {
if (i+1 < argv.length) {
_bitsPerElement = Integer.parseInt(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--crawlNumber")) {
_crawlNumber = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--checkpointFS")) {
try {
_checkpointFS = FileSystem.get(new URI(argv[++i]),getConfig());
} catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
else if (argv[i].equalsIgnoreCase("--masterIP")) {
try {
_masterIP = InetAddress.getByName(argv[++i]);
} catch (UnknownHostException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
else if (argv[i].equalsIgnoreCase("--masterPort")) {
_masterPort = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--storageBase")) {
_baseStoragePath = new Path(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--segmentLogsDir")) {
_segmentLogsStoragePath = new Path(argv[++i]);
try {
_segmentLogFS = FileSystem.get(_segmentLogsStoragePath.toUri(),getConfig());
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
}
else if (argv[i].equalsIgnoreCase("--checkpointScanInterval")) {
_checkpointScanInterval = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--checkpointFlushInterval")) {
_checkpointFlushInterval = Integer.parseInt(argv[++i]);
}
}
if (_segmentLogFS == null) {
try {
_segmentLogFS = FileSystem.get(_segmentLogsStoragePath.toUri(),getConfig());
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
}
if (
_numElements != -1
&& _numHashFunctions != -1
&& _bitsPerElement != -1
&& _crawlNumber != -1
&& _checkpointFS != null
&& _masterIP != null
&& _masterPort != -1
) {
return true;
}
else {
LOG.error("Some Command Line Parameters Were Missing:");
LOG.error("Parameter:_numElements Value:" + _numElements);
LOG.error("Parameter:_numHashFunctions Value:" + _numHashFunctions);
LOG.error("Parameter:_bitsPerElement Value:" + _bitsPerElement);
LOG.error("Parameter:_crawlNumber Value:" + _crawlNumber);
LOG.error("Parameter:_checkpointFS Value:" + _checkpointFS);
LOG.error("Parameter:_segmentLogFS Value:" + _segmentLogFS);
LOG.error("Parameter:_masterIP Value:" + _masterIP);
LOG.error("Parameter:_masterPort Value:" + _masterPort);
return false;
}
}
@Override
protected void printUsage() {
}
@Override
protected boolean startDaemons() {
return true;
}
@Override
protected void stopDaemons() {
}
@Override
public void checkpoint(AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext)throws RPCException {
// NOOP - checkpoint state transitions are driven by the internal checkpoint thread
}
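/**
 * query the bloom filter for a packed list of URL fingerprints; the response
 * carries one bit per input item (1 = present in the filter, subject to the
 * bloom filter's false-positive rate)
 **/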
@Override
public void bulkItemQuery(AsyncContext<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> rpcContext)throws RPCException {
LOG.info("Received BulkItemQueryRequest");
ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList();
if (inputBuffer.getCount() != 0) {
try {
if (_bloomFilter == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
DataInputStream inputStream = new DataInputStream(
new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(),0,inputBuffer.getCount()));
BitStream bitStreamOut = new BitStream();
URLFPV2 fingerprint = new URLFPV2();
int itemsPresent = 0;
while (inputStream.available() != 0) {
fingerprint.setDomainHash(WritableUtils.readVLong(inputStream));
fingerprint.setUrlHash(WritableUtils.readVLong(inputStream));
if (_bloomFilter.isPresent(fingerprint)) {
bitStreamOut.addbit(1);
++itemsPresent;
}
else {
bitStreamOut.addbit(0);
}
}
LOG.info("Received BulkItemQueryRequest Completed with " + itemsPresent + " items found");
rpcContext.getOutput().setResponseList(new Buffer(bitStreamOut.bits,0,(bitStreamOut.nbits + 7) / 8));
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
    }
    // complete the request even when the fingerprint list is empty
    rpcContext.completeRequest();
  }
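/** query the bloom filter for a single URL fingerprint **/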
@Override
public void singleItemQuery(AsyncContext<URLFPV2, SingleItemHistoryQueryResponse> rpcContext)throws RPCException {
try {
if (_bloomFilter == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
rpcContext.getOutput().setWasCrawled(_bloomFilter.isPresent(rpcContext.getInput()));
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
finally {
rpcContext.completeRequest();
}
}
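/** add a single URL fingerprint to the bloom filter **/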
@Override
public void updateHistory(AsyncContext<URLFPV2, NullMessage> rpcContext)throws RPCException {
try {
if (_bloomFilter == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
_bloomFilter.add(rpcContext.getInput());
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
finally {
rpcContext.completeRequest();
}
}
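/** helpers for the host-specific data, checkpoint, and hour-scoped mutex paths under the base storage path **/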
private final Path getDataFileBasePath() throws IOException {
return new Path(_baseStoragePath,getHostId());
}
private final Path getDataFileFinalPath()throws IOException {
return new Path(_baseStoragePath,getHostId()+".data");
}
private final Path getDataFileCheckpointPath()throws IOException {
return new Path(_baseStoragePath,getHostId()+".checkpoint");
}
private final Path getCheckpointMutexPath()throws IOException {
Hour hour = new Hour(new Date());
Path checkpointPath = new Path(_baseStoragePath,CrawlEnvironment.HDFS_HistoryServerCheckpointMutex+"."+ getHostId() +"."+hour.getFirstMillisecond());
return checkpointPath;
}
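/** replay any outstanding segment log files for this host into the bloom filter; returns the paths that were replayed **/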
private List<Path> reloadActiveHistory()throws IOException {
ArrayList<Path> paths = new ArrayList<Path>();
FileSystem fs = _segmentLogFS;
// create scan pattern
Path hdfsScanPath = CrawlEnvironment.getRemoteCrawlSegmentLogWildcardPath(_segmentLogsStoragePath,getHostId());
// scan hdfs for log files
FileStatus candidates[];
LOG.info("Scanning For Cadnidates in:" + hdfsScanPath);
candidates = fs.globStatus(hdfsScanPath);
// iterate candidates
for(FileStatus candidate : candidates) {
// ok found a candidate we can work on
LOG.info("Found Candidate:" + candidate.getPath());
final URLFPV2 placeHolderFP = new URLFPV2();
CrawlSegmentLog.walkFingerprintsInLogFile(fs,candidate.getPath(),new CrawlSegmentLog.LogFileItemCallback() {
@Override
public void processItem(long domainHash, long urlFingerprint) {
placeHolderFP.setDomainHash(domainHash);
placeHolderFP.setUrlHash(urlFingerprint);
// add item for bloom filter
_bloomFilter.add(placeHolderFP);
}
});
LOG.info("Finished Processing Candidate:" + candidate.getPath());
paths.add(candidate.getPath());
}
return paths;
}
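/** write the version stamp, crawl number, and bloom filter contents to the given checkpoint path **/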
private void serializeBloomFilter(Path checkpointPath) throws IOException {
FileSystem fs = _checkpointFS;
// delete existing ...
fs.delete(checkpointPath,false);
FSDataOutputStream outputStream = fs.create(checkpointPath);
try {
DataOutputStream dataOut = new DataOutputStream(outputStream);
dataOut.writeInt(0); // version
dataOut.writeInt(_state.getCurrentCrawlNumber()); // crawl number ...
// serialize bloom filter contents ...
_bloomFilter.serialize(outputStream);
}
finally {
if (outputStream != null) {
outputStream.flush();
outputStream.close();
}
}
}
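/** load the bloom filter from the given checkpoint path, releasing any in-memory instance first **/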
private void deSerializeBloomFilter(Path checkpointPath) throws IOException {
FileSystem fs = _checkpointFS;
FSDataInputStream stream = fs.open(checkpointPath);
try {
stream.readInt(); // version
stream.readInt(); // crawl number ...
// release previous version of bloom filter if any
_bloomFilter = null;
// deserialize bloom filter contents ...
_bloomFilter = URLFPBloomFilter.load(stream);
}
finally {
stream.close();
}
}
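/** validate the on-disk data file; a file serialized under an older crawl number is renamed aside and treated as invalid **/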
private boolean validateOnDiskVersion() throws IOException {
FileSystem fs = _checkpointFS;
Path dataFilePath = getDataFileFinalPath();
LOG.info("Loading BloomFilter From Disk at Path:" + dataFilePath);
if (fs.exists(dataFilePath)) {
FSDataInputStream stream = null;
try {
stream = fs.open(dataFilePath);
DataInputStream dataInput = new DataInputStream(stream);
// skip version
dataInput.readInt();
// read crawl version ...
int serializedCrawlVersion = dataInput.readInt();
LOG.info("BloomFilter From On Disk has CrawlVersion:" + serializedCrawlVersion);
if (serializedCrawlVersion < _state.getCurrentCrawlNumber()) {
LOG.error("skipping load because serial crawl number is less than current crawl");
stream.close();
stream = null;
fs.rename(dataFilePath, new Path(dataFilePath.getParent(),dataFilePath.getName()+"-V-"+serializedCrawlVersion));
return false;
}
return true;
}
finally {
if (stream != null)
stream.close();
}
}
return false;
}
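/** load (or freshly allocate) the bloom filter, replay active history logs, and flush back to disk if anything was replayed **/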
private void loadBloomFilter() throws IOException {
FileSystem fs = _checkpointFS;
Path dataFilePath = getDataFileFinalPath();
LOG.info("Potentially Loading BloomFilter From Disk at Path:" + dataFilePath);
if (!validateOnDiskVersion()) {
LOG.info("On Disk Verison Not Valid. Allocating New BloomFilter at Path:" + dataFilePath);
LOG.info("Allocating NEW BloomFilter");
_bloomFilter = new URLFPBloomFilter(_numElements,_numHashFunctions,_bitsPerElement);
}
else {
LOG.info("Loading BloomFilter From Disk");
deSerializeBloomFilter(dataFilePath);
}
List<Path> paths = reloadActiveHistory();
if (paths.size() != 0) {
LOG.info("Loaded Some History Via Log Files - Writing Back to Disk");
serializeBloomFilter(dataFilePath);
for (Path historyFile : paths) {
fs.delete(historyFile,false);
}
}
}
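/**
 * start the background thread that periodically scans for new segment logs,
 * folds them into the bloom filter, and (on the flush interval) checkpoints
 * the filter to the checkpoint file system under an hour-scoped mutex file
 **/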
private void startCheckpointThread() {
_checkpointThread = new Thread(new Runnable() {
@Override
public void run() {
// the checkpoint thread runs in perpetuity, until shutdown is flagged
while (!_shutdownFlag) {
if (_lastCheckpointScanTime == -1
|| _lastCheckpointFlushTime == -1
|| (System.currentTimeMillis() - _lastCheckpointScanTime) >= _checkpointScanInterval
|| (System.currentTimeMillis() - _lastCheckpointFlushTime) >= _checkpointFlushInterval) {
//LOG.info("Checkpoint Thread Grabbing Semaphore");
// grab checkpoint thread semaphore
_checkpointThreadSemaphore.acquireUninterruptibly();
LOG.info("Checkpoint Thread Grabbed Semaphore");
try {
try {
// create scan pattern
Path hdfsScanPath = CrawlEnvironment.getRemoteCrawlSegmentLogWildcardPath(_segmentLogsStoragePath, getHostId());
// scan hdfs for log files
FileStatus candidates[];
LOG.info("Scanning for logs in:" + _segmentLogsStoragePath + " using wildcard:" + hdfsScanPath + " FS is:" + _segmentLogFS);
candidates = _segmentLogFS.globStatus(hdfsScanPath);
// iterate candidates
for(FileStatus candidate : candidates) {
// check candidate against processed path list ...
if (!_processedPaths.contains(candidate.getPath())){
int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get();
// ok found a candidate we can work on
LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath());
final URLFPV2 placeHolderFP = new URLFPV2();
CrawlSegmentLog.walkFingerprintsInLogFile(_segmentLogFS,candidate.getPath(),new CrawlSegmentLog.LogFileItemCallback() {
@Override
public void processItem(long domainHash, long urlFingerprint) {
placeHolderFP.setDomainHash(domainHash);
placeHolderFP.setUrlHash(urlFingerprint);
// add item for bloom filter
_bloomFilter.add(placeHolderFP);
// increment urls processed count ...
_urlsProcessedSinceCheckpoint.addAndGet(1);
}
});
_processedPaths.add(candidate.getPath());
LOG.info("Finished Processing Candidate:" + candidate.getPath());
}
}
// update scan time ...
_lastCheckpointScanTime = System.currentTimeMillis();
// see if can do a full checkpoint ...
if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis() - _lastCheckpointFlushTime >= _checkpointFlushInterval) {
int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get();
// ok at this point we are ready to initiate a checkpoint
if (approximateItemsToFlush != 0) {
Path checkpointMutexPath = getCheckpointMutexPath();
if (_checkpointFS.createNewFile(checkpointMutexPath)) {
try {
LOG.info("Checkpoint Thread Starting Checkpoint");
// get the checkpoint path ...
Path checkpointPath = getDataFileCheckpointPath();
Path finalPath = getDataFileFinalPath();
LOG.info("Checkpoint Thread Writing BloomFilter Data");
// serialize the filter ...
serializeBloomFilter(checkpointPath);
LOG.info("Checkpoint Thread Deleting Old Checkpoint Data");
// ok now everything seems to have gone fine ... delete existing data file
_checkpointFS.delete(finalPath,false);
LOG.info("Checkpoint Thread ReWriting New Checkpoint Data");
// rename checkpoint to final ...
_checkpointFS.rename(checkpointPath, finalPath);
if (_state.getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) {
LOG.info("Checkpoint Thread Deleting Processed Files");
// ok safely delete all processed files
for (Path processedFilePath: _processedPaths){
_segmentLogFS.delete(processedFilePath,false);
}
_processedPaths.clear();
}
else {
LOG.info("Skipping Processed Files Purge because we are in Transitioning State");
}
_urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush);
}
finally {
LOG.info("Checkpoint Thread Releasing Mutex:" + checkpointMutexPath);
_checkpointFS.delete(checkpointMutexPath, false);
}
}
else {
int delay = (int)(Math.random() * CHECKPOINT_MUTEX_ACQUISITION_DELAY);
LOG.info("Checkpoint thread failed to acquire Mutex:" + checkpointMutexPath + " Waiting " + delay + " ms before retry");
try {
Thread.sleep(delay);
} catch (InterruptedException e) {
}
}
}
// update last checkpoint flush time no matter what ...
_lastCheckpointFlushTime = System.currentTimeMillis();
}
} catch (IOException e) {
LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:" + CCStringUtils.stringifyException(e));
try {
Thread.sleep(60000);
} catch (InterruptedException e1) {
}
}
}
finally {
LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore");
_checkpointThreadSemaphore.release();
}
}
else {
try {
//LOG.info("Checkpoint Thread IDLE");
Thread.sleep(30000);
} catch (InterruptedException e) {
}
}
}
LOG.info("Checkpoint Thread Exiting!");
}
});
_checkpointThread.start();
}
@Override
public void IncomingClientConnected(AsyncClientChannel channel) {
// no-op: incoming client connections need no special handling
}
@Override
public void IncomingClientDisconnected(AsyncClientChannel channel) {
// no-op: incoming client disconnects need no special handling
}
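/** report the current crawl number and checkpoint state **/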
@Override
public void queryStatus(AsyncContext<NullMessage, CrawlHistoryStatus> rpcContext)throws RPCException {
try {
rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber());
rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState());
rpcContext.setStatus(Status.Success);
}
finally {
rpcContext.completeRequest();
}
}
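/**
 * resync request from a crawler: forces the checkpoint thread to rescan, then
 * polls via a timer and completes the request once the scan time passes the
 * time the request arrived
 **/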
@Override
public void sync(final AsyncContext<CrawlHistoryStatus, NullMessage> rpcContext) throws RPCException {
LOG.info("Received Sync From Crawler");
// validate crawl number
if (_state.getCurrentCrawlNumber() == rpcContext.getInput().getActiveCrawlNumber()) {
// snapshot current time
final long startTime = System.currentTimeMillis();
// ok reset resync variable on checkpoint thread
_lastCheckpointScanTime = -1;
// now set a timer to poll periodically for resync to complete
getEventLoop().setTimer(new Timer(100,true,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
// ok check to see if resync happened ...
LOG.info("Timer Fired. Last Checkpoint Scan Time:" + _lastCheckpointScanTime + " Desired Threshold Time:" + startTime);
if (_lastCheckpointScanTime >= startTime) {
getEventLoop().cancelTimer(timer);
try {
rpcContext.setStatus(Status.Success);
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
}));
}
else {
LOG.error("Crawler CrawlNumber and HistoryServer CrawlNumber don't match! - Aborting Sync");
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc("Crawler CrawlNumber and HistoryServer CrawlNumber don't match! - Aborting Sync");
rpcContext.completeRequest();
}
}
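/** add a packed list of URL fingerprints to the bloom filter **/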
@Override
public void bulkUpdateHistory(AsyncContext<BulkUpdateData, NullMessage> rpcContext) throws RPCException {
LOG.info("Received BulkUpdate Request");
ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList();
try {
if (inputBuffer.getCount() != 0) {
try {
if (_bloomFilter == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
DataInputStream inputStream = new DataInputStream(
new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(),0,inputBuffer.getCount()));
URLFPV2 fingerprint = new URLFPV2();
int itemsAdded = 0;
while (inputStream.available() != 0) {
fingerprint.setDomainHash(WritableUtils.readVLong(inputStream));
fingerprint.setUrlHash(WritableUtils.readVLong(inputStream));
_bloomFilter.add(fingerprint);
++itemsAdded;
}
_urlsProcessedSinceCheckpoint.addAndGet(itemsAdded);
LOG.info("Finished Processed BulkUpdate Request. " + itemsAdded + " items processed." );
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
}
}
finally {
rpcContext.completeRequest();
}
}
@Override
public void OutgoingChannelConnected(AsyncClientChannel channel) {
_connectedToMaster = true;
initiateHandshake();
}
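/** register this slave with the crawl master; services are started once registration succeeds **/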
public void initiateHandshake() {
_handshakeState = HandshakeState.INITIATING;
LOG.info("Connected to Master. Initiating Handshake");
SlaveHello slaveHello = new SlaveHello();
slaveHello.setIpAddress(IPAddressUtils.IPV4AddressToInteger(_serverAddress.getAddress().getAddress()));
slaveHello.setCookie(System.currentTimeMillis());
slaveHello.setServiceName("history");
try {
_masterRPCStub.registerSlave(slaveHello,new AsyncRequest.Callback<SlaveHello, SlaveRegistration>() {
@Override
public void requestComplete(AsyncRequest<SlaveHello, SlaveRegistration> request) {
if (request.getStatus() == Status.Success) {
// LOG.info("Master Handshake Successfull");
_registration = request.getOutput();
_registration.setLastTimestamp(System.currentTimeMillis());
_handshakeState = HandshakeState.IDLE;
startServices();
}
else {
LOG.error("Handshake to Master Failed");
_handshakeState = HandshakeState.NOT_INITIATED;
}
}
});
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
@Override
public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) {
LOG.error("Disconnected From Master. Shutting Down Services");
_connectedToMaster = false;
shutdownServices();
return false;
}
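/** handshake timer callback: initiates the master handshake, or renews the existing registration once idle **/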
@Override
public void timerFired(Timer timer) {
if (timer == _handshakeTimer) {
if (_handshakeState == HandshakeState.NOT_INITIATED) {
initiateHandshake();
}
else if (_handshakeState == HandshakeState.IDLE) {
if (_registration != null) {
if (System.currentTimeMillis() - _registration.getLastTimestamp() >= 1000) {
//LOG.info("Renewing Lease with Master");
_handshakeState = HandshakeState.RENEWING;
try {
_masterRPCStub.extendRegistration(_registration, new AsyncRequest.Callback<SlaveRegistration, NullMessage>() {
@Override
public void requestComplete(AsyncRequest<SlaveRegistration, NullMessage> request) {
if (request.getStatus() == Status.Success) {
//LOG.info("Extended Registration");
_registration.setLastTimestamp(System.currentTimeMillis());
_handshakeState = HandshakeState.IDLE;
}
else {
LOG.error("Handshake Extension Failed!");
shutdownServices();
}
}
});
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
shutdownServices();
}
}
}
}
}
}
}