/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.crawler;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.Semaphore;
import javax.servlet.jsp.JspWriter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.async.Timer;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIODNSResolver;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.protocol.ActiveHostInfo;
import org.commoncrawl.protocol.CrawlHistoryStatus;
import org.commoncrawl.protocol.CrawlMaster;
import org.commoncrawl.protocol.CrawlSegment;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentStatus;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlerAction;
import org.commoncrawl.protocol.CrawlerHistoryService;
import org.commoncrawl.protocol.CrawlerService;
import org.commoncrawl.protocol.CrawlerStatus;
import org.commoncrawl.protocol.SlaveHello;
import org.commoncrawl.protocol.SlaveRegistration;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncContext;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.AsyncServerChannel;
import org.commoncrawl.rpc.base.internal.NullMessage;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Callback;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.server.AsyncWebServerRequest;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.crawler.CrawlLog.CheckpointCompletionCallback;
import org.commoncrawl.service.crawler.CrawlerEngine.CrawlStopCallback;
import org.commoncrawl.service.crawler.SegmentLoader.LoadProgressCallback;
import org.commoncrawl.service.crawler.filters.DomainFilterData;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.service.crawler.filters.CrawlRateOverrideFilter;
import org.commoncrawl.service.crawler.filters.DomainFilter;
import org.commoncrawl.service.crawler.filters.IPAddressBlockFilter;
import org.commoncrawl.service.crawler.filters.Filter.FilterResult;
import org.commoncrawl.service.directory.DirectoryServiceCallback;
import org.commoncrawl.service.directory.DirectoryServiceItemList;
import org.commoncrawl.service.directory.DirectoryServiceRegistrationInfo;
import org.commoncrawl.service.directory.DirectoryServiceServer;
import org.commoncrawl.service.directory.DirectoryServiceSubscriptionInfo;
import org.commoncrawl.service.dns.DNSRewriteFilter;
import org.commoncrawl.service.dns.DNSService;
import org.commoncrawl.service.dns.DNSServiceResolver;
import org.commoncrawl.service.statscollector.CrawlerStatsService;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.URLUtils;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
* Crawler Server (CommonCrawlServer derived class)
*
* @author rana
*
*/
public class CrawlerServer extends CommonCrawlServer
implements CrawlerService,
AsyncClientChannel.ConnectionCallback,
AsyncServerChannel.ConnectionCallback,
DirectoryServiceCallback,
Timer.Callback {
enum HandshakeStatus {
Uninitialized,
SendingInitToMaster,
MasterConnected,
SentUpdateToMaster
}
private InetSocketAddress _masterAddress = null;
private int _maxSockets = -1;
private static CrawlerEngine _engine;
private static CrawlerServer _server;
private CrawlerStatus _crawlerStatus;
/** DNS Resolver Service **/
private InetSocketAddress _dnsServiceAddress;
private AsyncClientChannel _dnsServiceChannel;
private DNSService.AsyncStub _dnsServiceStub;
private DNSServiceResolver _dnsServiceResolver;
/** Directory Service Stub **/
InetAddress _directoryServiceAddress;
AsyncClientChannel _directoryServiceChannel;
DirectoryServiceServer.AsyncStub _directoryServiceStub;
long _directoryServiceCallbackCookie = 0;
/** StatsCollector Service Stub **/
InetSocketAddress _statsCollectorAddress;
AsyncClientChannel _statsCollectorServiceChannel;
CrawlerStatsService.AsyncStub _statsCollectorStub;
/** History Service Stub **/
InetSocketAddress _historyServiceAddress;
AsyncClientChannel _historyServiceChannel;
CrawlerHistoryService.AsyncStub _historyServiceStub;
/** Crawl Content Path **/
Path _crawlContentPath;
/** filters **/
private DomainFilter _blockedDomainFilter = null;
private DomainFilter _temporarilyBlockedDomainFilter = null;
private DNSRewriteFilter _rewriteFilter = null;
private IPAddressBlockFilter _ipAddressBlockFilter = null;
private long _filterUpdateTime = -1;
private CrawlRateOverrideFilter _crawlRateOverrideFilter = null;
/** record store **/
static final String CrawlerStateKey = "CrawlerState2";
/** master crawl controller support **/
private static final int ACTIVE_HOST_LIST_REFRESH_INTERVAL_MASTER = 20 * 1000; // the master refreshes its list every 20 seconds
private static final int ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT = 1 * 60000; // the clients refresh their list every minute
private int _pauseStateTimestampIncremental = -1;
private long _pauseStateTimestamp = -1;
// used on the master side to store latest state
private FlexBuffer _masterPauseStateBuffer = null;
// slave pause state
private Set<Integer> _pausedHostsSet = null;
InetSocketAddress _masterCrawlerAddress = null;
AsyncClientChannel _masterCrawlerServiceChannel = null;
CrawlerService.AsyncStub _masterCrawlerStub = null;
Timer _masterCrawlerHostListRefreshTimer = null;
private String _unitTestName = null;
private File _domainQueueDirectory = null;
private InetSocketAddress[] _crawlInterface = null;
private InetSocketAddress _proxyAddress = null;
private static final int DEFAULT_DNS_HIGH_WATER_MARK = 500;
private static final int DEFAULT_DNS_LOW_WATER_MARK = 10;
private static final String DEFAULT_DOMAIN_QUEUE_DIR_NAME = "domainQueue";
private static final int DEFAULT_HOST_IDLE_FLUSH_THRESHOLD = 60000;
private static int _dnsHighWaterMark = DEFAULT_DNS_HIGH_WATER_MARK;
private static int _dnsLowWaterMark = DEFAULT_DNS_LOW_WATER_MARK;
private static int _maxActiveURLS = -1;
private static long _cycleTime = -1;
private static int _crawlLogCheckpointItemThreshold = CrawlLog.DEFAULT_LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD;
private static long _crawlLogCheckpointLogSizeThreshold = CrawlLog.DEFAULT_LOG_FILE_SIZE_CHECKPOINT_THRESHOLD;
private static int _crawlLogCheckpointInterval = CrawlLog.DEFAULT_LOG_CHECKPOINT_INTERVAL;
private static int _crawlLogFlushInterval = CrawlLog.DEFAULT_LOG_FLUSH_INTERVAL;
/** new master / slave handshake support **/
enum HandshakeState {
NOT_INITIATED,
INITIATING,
IDLE,
RENEWING,
SHUTTING_DOWN
}
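// Observed transitions: NOT_INITIATED -> INITIATING (initiateHandshake) -> IDLE
// once registerSlave succeeds; IDLE -> RENEWING -> IDLE on each successful
// extendRegistration; fatal failures route through shutdownServices (SHUTTING_DOWN),
// and doFinalEngineCleanup resets the state to NOT_INITIATED.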
HandshakeState _handshakeState = HandshakeState.NOT_INITIATED;
boolean _connectedToMaster = false;
SlaveRegistration _registration = null;
AsyncClientChannel _masterChannel = null;
CrawlMaster.AsyncStub _masterRPCStub;
int _masterPort = -1;
/** timers **/
Timer _handshakeTimer;
public static CrawlerEngine getEngine() {
return _engine;
}
public static CrawlerServer getServer() {
return _server;
}
/**
*
* @return Path where crawled content is stored
*/
Path getCrawlContentPath() {
return _crawlContentPath;
}
/** get the domain queue storage directory name **/
public File getDomainQueueDir() { return _domainQueueDirectory; }
/** get the dns service resolver **/
public NIODNSResolver getDNSServiceResolver() { return _dnsServiceResolver; }
/** get directory service address **/
public InetAddress getDirectoryServiceAddress() { return _directoryServiceAddress; }
/** get history service stub **/
public CrawlerHistoryService.AsyncStub getHistoryServiceStub() { return _historyServiceStub; }
/** get stats service stub **/
public CrawlerStatsService.AsyncStub getStatsCollectorStub() { return _statsCollectorStub; }
/** has parse queue **/
public boolean isParseQueueEnabled() { return true; }
//@Override
protected boolean initServer() {
_server = this;
CrawlList.setServerSingleton(this);
String dataPath = _server.getDataDirectory().getAbsolutePath() + "/";
String dbPath = dataPath;
if (CrawlEnvironment.inUnitTestMode()) {
dbPath += "UnitTest_" + CrawlEnvironment.CRAWLER_DB;
if (Environment.detailLogEnabled())
LOG.info("In Unit Test Mode: DB Path is:" + dbPath);
}
else {
dbPath += getServer().getDatabaseName();
if (Environment.detailLogEnabled())
LOG.info("initialize - Config says Crawler Segment db path is: "+dbPath);
}
File databasePath = new File(dbPath);
// if in unit test mode ... delete existing db file
if (CrawlEnvironment.inUnitTestMode()) {
// delete existing database path if any ...
if (databasePath.exists())
databasePath.delete();
}
_crawlerStatus = new CrawlerStatus();
_crawlerStatus.setActiveListNumber(0);
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.IDLE);
// initialize domain queue directory
if (_domainQueueDirectory == null) {
_domainQueueDirectory = new File(getDataDirectory(),DEFAULT_DOMAIN_QUEUE_DIR_NAME);
}
if (!_domainQueueDirectory.isDirectory()) {
if (!_domainQueueDirectory.mkdir()) {
LOG.error("Unable to Initialize Domain Queue Directory at:" + _domainQueueDirectory.getAbsolutePath());
return false;
}
}
try {
LOG.info("Starting Communications with DNS Server At:" + _dnsServiceAddress);
_dnsServiceChannel = new AsyncClientChannel(_eventLoop,new InetSocketAddress(0),_dnsServiceAddress,this);
_dnsServiceChannel.open();
_dnsServiceStub = new DNSService.AsyncStub(_dnsServiceChannel);
_dnsServiceResolver = new DNSServiceResolver(_dnsServiceStub);
LOG.info("Loading Filters");
reloadFilters();
// start communications with the directory service
LOG.info("Starting Communications with Directory Service Server At:" + _directoryServiceAddress);
_directoryServiceChannel = new AsyncClientChannel(_eventLoop,new InetSocketAddress(0),new InetSocketAddress(_directoryServiceAddress,CrawlEnvironment.DIRECTORY_SERVICE_RPC_PORT),this);
_directoryServiceChannel.open();
_directoryServiceStub = new DirectoryServiceServer.AsyncStub(_directoryServiceChannel);
// start communications with the stats collector service
LOG.info("Starting Communications with Stats Server At:" + _statsCollectorAddress);
_statsCollectorServiceChannel = new AsyncClientChannel(_eventLoop,new InetSocketAddress(0),_statsCollectorAddress,this);
_statsCollectorServiceChannel.open();
_statsCollectorStub = new CrawlerStatsService.AsyncStub(_statsCollectorServiceChannel);
// and with the history server
if (!externallyManageCrawlSegments()){
LOG.info("Starting Communications with History Service Server At:" + _historyServiceAddress);
_historyServiceChannel = new AsyncClientChannel(_eventLoop,new InetSocketAddress(0),_historyServiceAddress,this);
_historyServiceChannel.open();
_historyServiceStub = new CrawlerHistoryService.AsyncStub(_historyServiceChannel);
}
// see if we have a master crawler ...
if (_masterCrawlerAddress != null) {
_masterCrawlerServiceChannel = new AsyncClientChannel(_eventLoop,new InetSocketAddress(0),_masterCrawlerAddress,this);
_masterCrawlerServiceChannel.open();
_masterCrawlerStub = new CrawlerService.AsyncStub(_masterCrawlerServiceChannel);
}
// initialize logging servlet
getWebServer().addServlet("tailLog", "/tailLog",RequestLogServlet.class);
}
catch (IOException e) {
LOG.fatal(CCStringUtils.stringifyException(e));
return false;
}
if (externallyManageCrawlSegments()) {
if (!initializeEngine(0)) {
return false;
}
}
// create server channel ...
AsyncServerChannel channel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(),this);
// register RPC services it supports ...
registerService(channel,CrawlerService.spec);
registerService(channel,DirectoryServiceCallback.spec);
// create connection to master ...
InetSocketAddress masterLocalInterfaceAddress = new InetSocketAddress(_serverAddress.getAddress(),0);
try {
_masterChannel = new AsyncClientChannel(getEventLoop(),masterLocalInterfaceAddress,_masterAddress,this);
_masterRPCStub = new CrawlMaster.AsyncStub(_masterChannel);
_masterChannel.open();
_handshakeTimer = new Timer(1000,true,this);
getEventLoop().setTimer(_handshakeTimer);
return true;
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
return false;
}
private boolean initializeEngine(int activeListId) {
// initialize the crawl engine ...
_engine = new CrawlerEngine(this,_maxSockets,_dnsHighWaterMark,_dnsLowWaterMark,_cycleTime,activeListId);
if (_maxActiveURLS != -1) {
_engine.setMaxActiveURLThreshold(_maxActiveURLS);
}
InetSocketAddress crawlInterfaces[] = null;
if (_crawlInterface != null && _crawlInterface.length != 0) {
LOG.info("Crawl Interfaces are:");
for (InetSocketAddress address : _crawlInterface) {
LOG.info(address.toString());
}
crawlInterfaces = _crawlInterface;
}
else {
crawlInterfaces = new InetSocketAddress[] { new InetSocketAddress(0) };
}
if (!_engine.initialize(crawlInterfaces)) {
LOG.fatal("Crawl Engine initialization failed! Exiting...");
return false;
}
return true;
}
//@Override
protected boolean parseArguements(String[] argv) {
getConfig().set("http.agent.name",CrawlEnvironment.CCBOT_UA);
for(int i=0; i < argv.length;++i) {
if (argv[i].equalsIgnoreCase("--master")) {
if (i+1 < argv.length) {
_masterAddress = CCStringUtils.parseSocketAddress(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--dnsservice")) {
if (i+1 < argv.length) {
_dnsServiceAddress = new InetSocketAddress(argv[++i],CrawlEnvironment.DNS_SERVICE_RPC_PORT);
}
}
else if (argv[i].equalsIgnoreCase("--maxSockets")) {
if (i+1 < argv.length) {
_maxSockets = Integer.parseInt(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--unitTest")) {
CrawlEnvironment.setUnitTestMode(true);
_unitTestName = argv[++i];
}
else if (argv[i].equalsIgnoreCase("--dnsHighMark")) {
_dnsHighWaterMark = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--dnsLowMark")) {
_dnsLowWaterMark = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--maxActiveURLS")){
_maxActiveURLS = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--cycleTimer")){
_cycleTime = System.currentTimeMillis() + (Integer.parseInt(argv[++i]) * 1000);
}
else if (argv[i].equalsIgnoreCase("--domainQueueDir")) {
_domainQueueDirectory = new File(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--crawlInterface")) {
String interfaceList[] = argv[++i].split(";");
_crawlInterface = new InetSocketAddress[interfaceList.length];
for(int j=0;j<_crawlInterface.length;++j) {
try {
_crawlInterface[j] = new InetSocketAddress(InetAddress.getByName(interfaceList[j]),0);
} catch (UnknownHostException e) {
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
}
}
else if (argv[i].equalsIgnoreCase("--useProxyServer")) {
_proxyAddress = CCStringUtils.parseSocketAddress(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--directoryserver")) {
if (i+1 < argv.length) {
try {
_directoryServiceAddress = InetAddress.getByName(argv[++i]);
} catch (UnknownHostException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
else if (argv[i].equalsIgnoreCase("--statscollector")) {
if (i+1 < argv.length) {
try {
_statsCollectorAddress = new InetSocketAddress(InetAddress.getByName(argv[++i]),CrawlEnvironment.CRAWLSTATSCOLLECTOR_SERVICE_RPC_PORT);
} catch (UnknownHostException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
else if (argv[i].equalsIgnoreCase("--historyserver")) {
if (i+1 < argv.length) {
_historyServiceAddress = CCStringUtils.parseSocketAddress(argv[++i]);
}
}
else if (argv[i].equalsIgnoreCase("--mastercrawler")) {
_masterCrawlerAddress = CCStringUtils.parseSocketAddress(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--defaultFS")) {
try {
CrawlEnvironment.setDefaultHadoopFSURI(argv[++i]);
} catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
else if (argv[i].equalsIgnoreCase("--contentDataDir")) {
try {
_crawlContentPath = new Path(argv[++i]);
} catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
else if (argv[i].equalsIgnoreCase("--storageBase")) {
CrawlEnvironment.setCCRootDir(argv[++i]);
LOG.info("Changed CC_ROOT_DIR to:" + CrawlEnvironment.CC_ROOT_DIR);
}
else if (argv[i].equalsIgnoreCase("--segmentLogsDir")) {
CrawlEnvironment.setCrawlSegmentLogsDirectory(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--segmentDataDir")) {
CrawlEnvironment.setCrawlSegmentDataDirectory(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--bloomFilterSize")) {
CrawlerEngine.BLOOM_FILTER_SIZE = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--userAgent")) {
getConfig().set("http.agent.name",argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--crawlLogItemThreshold")) {
_crawlLogCheckpointItemThreshold = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--crawlLogSizeThreshold")) {
_crawlLogCheckpointLogSizeThreshold = Long.parseLong(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--crawlLogCheckpointInterval")) {
_crawlLogCheckpointInterval = Integer.parseInt(argv[++i]);
}
else if (argv[i].equalsIgnoreCase("--crawlLogFlushInterval")) {
_crawlLogFlushInterval = Integer.parseInt(argv[++i]);
}
}
return (_masterAddress != null
&& _dnsServiceAddress != null
&& _statsCollectorAddress != null
&& _directoryServiceAddress != null
&& _crawlContentPath != null
&& (_historyServiceAddress != null || externallyManageCrawlSegments()));
}
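// Example invocation (host names and paths below are hypothetical):
//   --master crawlmaster:8040 --dnsservice dnshost --directoryserver dirhost
//   --statscollector statshost --historyserver historyhost:8041
//   --contentDataDir /crawl/content
// parseArguements returns false unless --master, --dnsservice, --statscollector,
// --directoryserver and --contentDataDir are all supplied, plus --historyserver
// when crawl segments are not externally managed.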
//@Override
protected void printUsage() {
System.out.println("Crawler Startup Args: --master ["+ _masterAddress +"] "
+ " --crawlInterface ["+ java.util.Arrays.toString(_crawlInterface) +"] "
+ " --dnsservice ["+ _dnsServiceAddress +"] "
+ " --directoryserver ["+ _directoryServiceAddress +"] "
+ " --statscollector ["+ _statsCollectorAddress +"] "
+ " --historyserver ["+ _historyServiceAddress +"] "
+ " --contentDataDir ["+ _crawlContentPath +"]"
);
}
//@Override
protected boolean startDaemons() {
return true;
}
//@Override
protected void stopDaemons() {
}
//@Override
protected String getDefaultLogFileName() {
return "crawler";
}
//@Override
protected int getDefaultRPCPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_RPC_PORT;
}
//@Override
protected String getDefaultHttpInterface() {
return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
}
//@Override
protected String getDefaultRPCInterface() {
return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
}
//@Override
protected String getWebAppName() {
return "crawler";
}
//@Override
protected int getDefaultHttpPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_HTTP_PORT;
}
//@Override
protected String getDefaultDataDir() {
return CrawlEnvironment.DEFAULT_DATA_DIR;
}
public String getUnitTestName() {
return _unitTestName;
}
public InetSocketAddress getProxyAddress() {
return _proxyAddress;
}
@Override
public String getHostName() {
if (_registration != null) {
return CrawlEnvironment.getCrawlerNameGivenId(_registration.getInstanceId());
}
else {
LOG.error("Get HostName Called but no Lease!");
return super.getHostName();
}
}
public int getCrawlLogCheckpointItemThreshold() {
return _crawlLogCheckpointItemThreshold;
}
public long getCrawlLogCheckpointLogSizeThreshold() {
return _crawlLogCheckpointLogSizeThreshold;
}
public int getCrawlLogCheckpointInterval() {
return _crawlLogCheckpointInterval;
}
public int getCrawlLogFlushInterval() {
return _crawlLogFlushInterval;
}
/*
public void addCrawlSegment(AsyncContext<CrawlSegment, CrawlerStatus> rpcContext) throws RPCException {
_engine.addCrawlSegment(rpcContext);
}
*/
public void OutgoingChannelConnected(AsyncClientChannel channel) {
if (channel == _dnsServiceChannel) {
LOG.info("Connected to DNS Service");
}
else if (channel == _historyServiceChannel) {
LOG.info("Connected to History Server. ");
}
else if (channel == _statsCollectorServiceChannel) {
LOG.info("Connected to StatsCollector Server");
}
else if (channel == _directoryServiceChannel) {
LOG.info("Connected to Directory Server. Registering for Callbacks");
getEventLoop().setTimer(new Timer(1000,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
DirectoryServiceRegistrationInfo registrationInfo = new DirectoryServiceRegistrationInfo();
_directoryServiceCallbackCookie = System.currentTimeMillis();
registrationInfo.setConnectionString(getServerAddress().getAddress().getHostAddress() + ":" + getServerAddress().getPort());
registrationInfo.setRegistrationCookie(_directoryServiceCallbackCookie);
registrationInfo.setConnectionName("DNS Service");
try {
_directoryServiceStub.register(registrationInfo, new AsyncRequest.Callback<DirectoryServiceRegistrationInfo,NullMessage>() {
@Override
public void requestComplete(AsyncRequest<DirectoryServiceRegistrationInfo, NullMessage> request) {
LOG.info("Received Registration Complete Callback from Directory Server with Status:" + request.getStatus());
}
});
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}));
}
else if (channel == _masterCrawlerServiceChannel) {
LOG.info("Connected to Master Crawler at:" + _masterCrawlerAddress.toString() );
refreshMasterCrawlerActiveHostList();
}
else if (channel == _masterChannel) {
LOG.info("Connected to Master");
if (_handshakeState == HandshakeState.NOT_INITIATED) {
LOG.info("Initiating Handshake with Master");
initiateHandshake();
}
}
}
public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) {
if (channel == _dnsServiceChannel) {
return true;
}
else if (channel == _masterCrawlerServiceChannel) {
LOG.info("Disconnected from Master Crawler at:" + _masterCrawlerAddress);
if (_masterCrawlerHostListRefreshTimer != null) {
_eventLoop.cancelTimer(_masterCrawlerHostListRefreshTimer);
_masterCrawlerHostListRefreshTimer = null;
}
}
else if (channel == _masterChannel) {
LOG.info("Master Channel Disconnected. Initiating Clean Shutdown");
try {
shutdownServices();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
return false;
}
public void IncomingClientConnected(AsyncClientChannel channel) {
if (Environment.detailLogEnabled())
LOG.info("Master INCOMING Channel Connected");
}
public void IncomingClientDisconnected(AsyncClientChannel channel) {
}
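/** debug / stats dump helpers -- each one wraps its output logic in an
* AsyncWebServerRequest so the work is dispatched onto the event loop
* thread instead of running directly on the servlet thread **/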
public void dumpStats(final JspWriter out) {
final RuntimeStatsCollector stats = _engine.getStats();
AsyncWebServerRequest webRequest = new AsyncWebServerRequest("dumpStats",out) {
@Override
public boolean handleRequest(Semaphore completionSemaphore)throws IOException {
synchronized(stats) {
stats.dumpStatsToHTML(out);
}
return false;
}
};
webRequest.dispatch(_eventLoop);
webRequest = null;
}
public void dumpQueueDetails(final JspWriter out) {
AsyncWebServerRequest webRequest = new AsyncWebServerRequest("dumpStats",out) {
@Override
public boolean handleRequest(Semaphore completionSemaphore)throws IOException {
_engine.dumpQueueDetailsToHTML(out);
return false;
}
};
webRequest.dispatch(_eventLoop);
webRequest = null;
}
public void dumpHostDetails(final JspWriter out,String hostId)throws IOException {
if (hostId != null) {
final int hostIP = Integer.parseInt(hostId);
AsyncWebServerRequest webRequest = new AsyncWebServerRequest("dumpStats",out) {
@Override
public boolean handleRequest(Semaphore completionSemaphore)throws IOException {
_engine.dumpHostDetailsToHTML(out,hostIP);
return false;
}
};
webRequest.dispatch(_eventLoop);
webRequest = null;
}
else {
out.write("ERROR: Invalid Host ID");
}
}
public void shutdownCleanly(final JspWriter out) {
/*
AsyncWebServerRequest webRequest = new AsyncWebServerRequest("shutdown",out) {
@Override
public boolean handleRequest(Semaphore completionSemaphore)throws IOException {
LOG.info("Shutdown Initiated Via Web Interface");
_engine.stopCrawlerCleanly();
return false;
}
};
webRequest.dispatch(_eventLoop);
webRequest = null;
*/
System.exit(-1);
}
protected void subscribeToList(String listPath) throws IOException {
DirectoryServiceSubscriptionInfo subscription = new DirectoryServiceSubscriptionInfo();
subscription.setSubscriptionPath(listPath);
LOG.info("Subscribing to:" + listPath);
_directoryServiceStub.subscribe(subscription,new AsyncRequest.Callback<DirectoryServiceSubscriptionInfo,DirectoryServiceItemList>() {
@Override
public void requestComplete(AsyncRequest<DirectoryServiceSubscriptionInfo, DirectoryServiceItemList> request) {
if (request.getStatus() == AsyncRequest.Status.Success){
LOG.info("Subscription Successful!");
}
else {
LOG.info("Subscription Failed!");
}
}
});
}
@Override
public void initialize(AsyncContext<DirectoryServiceRegistrationInfo, NullMessage> rpcContext) throws RPCException {
LOG.info("Received Initialization Request on Callback Channel");
if (rpcContext.getInput().getRegistrationCookie() == _directoryServiceCallbackCookie) {
LOG.info("Cookies Match! Sending Subscription information");
rpcContext.completeRequest();
getEventLoop().setTimer(new Timer(1000,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
try {
subscribeToList("/lists/.*");
} catch (IOException e) {
LOG.error("List subscription failed with exception:" + CCStringUtils.stringifyException(e));
}
}
}));
}
}
@Override
public void itemChanged(AsyncContext<DirectoryServiceItemList, NullMessage> rpcContext) throws RPCException {
LOG.info("### Directory Service List Change Message Received");
reloadFilters();
rpcContext.completeRequest();
}
/**
*
* @return the path to the crawl rate override filter
*/
public String getCrawlRateOverrideFilterPath() {
return CrawlEnvironment.CRAWL_RATE_MOD_FILTER_PATH;
}
protected void reloadFilters() {
LOG.info("Reloading Filters");
// update filter update time
_filterUpdateTime = System.currentTimeMillis();
LOG.info("Initializing Filters");
// _spamFilter = new DomainFilter(DomainFilterData.Type.Type_ExlusionFilter);
if(useGlobalBlockLists()) {
_blockedDomainFilter = new DomainFilter(DomainFilterData.Type.Type_ExlusionFilter);
_temporarilyBlockedDomainFilter = new DomainFilter(DomainFilterData.Type.Type_ExlusionFilter);
_ipAddressBlockFilter = new IPAddressBlockFilter();
}
_rewriteFilter = new DNSRewriteFilter();
_crawlRateOverrideFilter = new CrawlRateOverrideFilter();
try {
// LOG.info("### Loading Spam Filter");
// _spamFilter.loadFromPath(_directoryServiceAddress,CrawlEnvironment.SPAM_DOMAIN_LIST,true);
if(useGlobalBlockLists()) {
LOG.info("### Loading Blocked Domains Filter");
_blockedDomainFilter.loadFromPath(_directoryServiceAddress,CrawlEnvironment.BLOCKED_DOMAIN_LIST,false);
LOG.info("### Loading Temporarily Blocked Domains Filter");
_temporarilyBlockedDomainFilter.loadFromPath(_directoryServiceAddress,CrawlEnvironment.TEMPORARILY_BLOCKED_DOMAIN_LIST,false);
LOG.info("### Loading IP Address Block Filter");
_ipAddressBlockFilter.loadFromPath(_directoryServiceAddress, CrawlEnvironment.IP_BLOCK_LIST, false);
}
LOG.info("### Loading DNS Rewrite Filter ");
_rewriteFilter.loadFromPath(_directoryServiceAddress, CrawlEnvironment.DNS_REWRITE_RULES, false);
LOG.info("### Loading Crawl Rate Override Filter from path:" + getCrawlRateOverrideFilterPath());
_crawlRateOverrideFilter.loadFromPath(getDirectoryServiceAddress(),getCrawlRateOverrideFilterPath(),false);
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
public DomainFilter getDomainBlackListFilter() { return _blockedDomainFilter; }
public DomainFilter getTemporaryBlackListFilter() { return _temporarilyBlockedDomainFilter; }
public DNSRewriteFilter getDNSRewriteFilter() { return _rewriteFilter; }
public IPAddressBlockFilter getIPAddressFilter() { return _ipAddressBlockFilter; }
public long getFilterUpdateTime() { return _filterUpdateTime; }
/** get database name for this instance
*
*/
public String getDatabaseName() {
return CrawlEnvironment.CRAWLER_DB;
}
/** queue up a high priority http get
*
*/
public void queueHighPriorityURL(String url,long fingerprint,CrawlItemStatusCallback callback) {
_engine.queueExternalURL(url, fingerprint,true, callback);
}
/** queue up a low priority http get
*
*/
public void queueLowPriorityURL(String url,long fingerprint,CrawlItemStatusCallback callback) {
_engine.queueExternalURL(url, fingerprint,false, callback);
}
/** queue a low-priority CrawlSegmentHost
*
*/
public void queueExternalHost(CrawlSegmentHost host,CrawlItemStatusCallback callback) {
_engine.queueExternalCrawlSegmentHost(host,callback);
}
/** enable the crawl log (yes by default)
*
*/
public boolean enableCrawlLog() {
return true;
}
/** crawl completed for the specified crawl target
*
*/
public void crawlComplete(NIOHttpConnection connection,CrawlURL url,CrawlTarget optTargetObj,boolean successOrFailure) {
// NOOP
}
/** externally manage crawl segments
*
*/
public boolean externallyManageCrawlSegments() {
return false;
}
/** load the given crawl segment in a background thread
*
*/
public void loadExternalCrawlSegment(final CrawlSegment segment,final LoadProgressCallback loadCallback,final CompletionCallback<CrawlSegmentStatus> completionCallback,final CrawlSegmentStatus status) {
// NOOP
}
/** update crawl segment status
*
*/
public void updateCrawlSegmentStatus(int crawlSegmentId,CrawlSegmentStatus status) {
}
/** get the fixed list id for high priority urls
*
*/
public int getHighPriorityListId() {
return 0;
}
/** notification that a fetch is starting on the target url
*
*/
public void fetchStarting(CrawlTarget target,NIOHttpConnection connection) {
}
/** should we use black lists
*
*/
public boolean useGlobalBlockLists() {
return true;
}
/** check host stats for failures
*
*/
public boolean failHostsOnStats() {
return true;
}
/** check for crawl rate override for the specified domain / url**/
public int checkForCrawlRateOverride(URL url) {
FilterResults resultsOut = new FilterResults();
String rootDomainName = URLUtils.extractRootDomainName(url.getHost());
if (rootDomainName != null) {
if (_crawlRateOverrideFilter.filterItem(rootDomainName,url.getHost(), url.getPath(), null, resultsOut) == FilterResult.Filter_Modified) {
return resultsOut.getCrawlRateOverride();
}
}
return -1;
}
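// Usage sketch (hypothetical caller and URL, for illustration only):
//   int override = checkForCrawlRateOverride(new URL("http://example.com/"));
//   if (override != -1) { /* apply the per-domain crawl rate override */ }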
/** is url in server block list **/
public boolean isURLInBlockList(URL url) {
return false;
}
/** get max robots exclusion in crawl loop override **/
public int getMaxRobotsExlusionsInLoopOverride() {
return -1;
}
/** get the host idle flush threshold
*
* the number of milliseconds a host needs to be idle for it
* to be purged from memory
* **/
public int getHostIdleFlushThreshold() {
return DEFAULT_HOST_IDLE_FLUSH_THRESHOLD;
}
/** return true to disable cycle timer
*
*/
public boolean disableCycleTimer() {
//TODO: DISABLING CYCLE TIME BY DEFAULT...
return true;
}
// crawler service related methods ...
@Override
public void queryStatus(AsyncContext<NullMessage, CrawlerStatus> rpcContext) throws RPCException {
if (Environment.detailLogEnabled())
LOG.info("Received Heartbeat Request From Master");
rpcContext.getOutput().setCrawlerState(_crawlerStatus.getCrawlerState());
rpcContext.getOutput().setActiveListNumber(_crawlerStatus.getActiveListNumber());
rpcContext.setStatus(AsyncRequest.Status.Success);
rpcContext.completeRequest();
}
private void populateCrawlStatusResponse(CrawlerStatus responseObjectOut) {
try {
responseObjectOut.merge(_crawlerStatus);
} catch (CloneNotSupportedException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
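/** master-driven lifecycle, as implemented in doAction below:
* FLUSH: ACTIVE/IDLE -> FLUSHING -> FLUSHED once the crawl log checkpoints,
* PURGE: FLUSHED/IDLE -> PURGED after engine shutdown and a data directory purge,
* RESUME_CRAWL: restarts a FLUSHED engine when list ids match, re-initializes
* from scratch when IDLE or PURGED, and is a no-op on a matching ACTIVE crawl **/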
@Override
public void doAction(final AsyncContext<CrawlerAction, CrawlerStatus> rpcContext) throws RPCException {
LOG.info("Received Action Cmd:" + rpcContext.getInput().getActionType() + " ListId:" + rpcContext.getInput().getActiveListNumber());
switch (rpcContext.getInput().getActionType()) {
case CrawlerAction.ActionType.FLUSH: {
if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.ACTIVE
|| _crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.IDLE ) {
// shift state to pausing ...
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHING);
if (getEngine() != null) {
// stop the crawl ... wait for completion ...
getEngine().stopCrawl(new CrawlStopCallback() {
@Override
public void crawlStopped() {
// ok, now see if we can initiate a flush ...
if (getEngine() != null && getEngine()._crawlLog != null) {
getEngine()._crawlLog.forceFlushAndCheckpointLog(new CheckpointCompletionCallback() {
@Override
public void checkpointFailed(long checkpointId, Exception e) {
try {
// log the error and keep going :-(
LOG.error(CCStringUtils.stringifyException(e));
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHED);
populateCrawlStatusResponse(rpcContext.getOutput());
rpcContext.completeRequest();
} catch (RPCException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
}
}
@Override
public void checkpointComplete(long checkpointId,Vector<Long> completedSegmentList) {
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHED);
populateCrawlStatusResponse(rpcContext.getOutput());
try {
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
});
}
else {
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHED);
populateCrawlStatusResponse(rpcContext.getOutput());
try {
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
});
}
else {
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHED);
populateCrawlStatusResponse(rpcContext.getOutput());
try {
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
else {
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc("Invalid State");
rpcContext.completeRequest();
}
}
break;
case CrawlerAction.ActionType.PURGE: {
if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.FLUSHED
|| _crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.IDLE ) {
LOG.info("Received PURGE Request while FLUSHED or IDLE - Shutting down engine");
if (_engine != null) {
_engine.shutdown();
}
LOG.info("Engine shutdown complete.");
_engine = null;
// clear data directory
CrawlLog.purgeDataDirectory(getDataDirectory());
// update state ...
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.PURGED);
populateCrawlStatusResponse(rpcContext.getOutput());
rpcContext.completeRequest();
}
else {
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc("Invalid State");
rpcContext.completeRequest();
}
}
break;
case CrawlerAction.ActionType.RESUME_CRAWL: {
LOG.info("Received Resume Crawl Notification");
if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.FLUSHED
&& _engine != null
&& _crawlerStatus.getActiveListNumber() == rpcContext.getInput().getActiveListNumber()) {
// ok just resume crawl ...
LOG.info("Crawler is paused and list ids match. Just restarting the crawl");
_engine.startCrawl();
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.ACTIVE);
populateCrawlStatusResponse(rpcContext.getOutput());
rpcContext.completeRequest();
}
else if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.IDLE
|| _crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.PURGED){
LOG.info("Crawler is Idle. Starting Crawl from scratch");
_crawlerStatus.setActiveListNumber(rpcContext.getInput().getActiveListNumber());
initializeCrawl(rpcContext);
}
else if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.ACTIVE && _crawlerStatus.getActiveListNumber() == rpcContext.getInput().getActiveListNumber()) {
LOG.info("Received Resume Crawl on already valid active crawl. Ignoring.");
populateCrawlStatusResponse(rpcContext.getOutput());
rpcContext.completeRequest();
}
else {
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc("Invalid State");
rpcContext.completeRequest();
}
}
break;
}
}
private void initializeCrawl(final AsyncContext<CrawlerAction, CrawlerStatus> rpcContext) throws RPCException {
// ok , first things first, send init to history server
CrawlHistoryStatus crawlStatus = new CrawlHistoryStatus();
crawlStatus.setActiveCrawlNumber(_crawlerStatus.getActiveListNumber());
LOG.info("Sending Sync to HistoryServer");
_historyServiceStub.sync(crawlStatus,new Callback<CrawlHistoryStatus, NullMessage>() {
@Override
public void requestComplete(AsyncRequest<CrawlHistoryStatus, NullMessage> request) {
LOG.info("Received response from HistoryServer");
if (request.getStatus() == Status.Success) {
LOG.info("History Server Sync Successful - Initializing Engine");
if (initializeEngine(_crawlerStatus.getActiveListNumber())) {
LOG.info("Engine Initialization Successful. Starting Crawl for List:" + _crawlerStatus.getActiveListNumber());
// kick off the load process
_engine.loadCrawlSegments();
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.ACTIVE);
populateCrawlStatusResponse(rpcContext.getOutput());
try {
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
// exit on this path ...
return;
}
}
// failure path ...
rpcContext.getOutput().setCrawlerState(CrawlerStatus.CrawlerState.IDLE);
rpcContext.setStatus(Status.Error_RequestFailed);
if (request.getStatus() != Status.Success) {
rpcContext.setErrorDesc("History Server Sync Failed");
}
else {
rpcContext.setErrorDesc("Engine Initialization Failed");
}
try {
rpcContext.completeRequest();
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
});
}
@Override
public void stop() {
LOG.info("Crawler Server Stop Called");
if (_engine != null) {
LOG.info("Shutting Down Crawler Engine");
_engine.stopCrawlerCleanly();
_engine = null;
}
LOG.info("CrawlerServer: Calling Super Stop");
super.stop();
}
@Override
public void queryActiveHosts(AsyncContext<NullMessage, ActiveHostInfo> rpcContext) throws RPCException {
// a slave server is asking us about our set of active hosts
rpcContext.setStatus(Status.Success); // default to success ...
// check to see if we need to refresh the list ...
if (_pauseStateTimestamp == -1 || (System.currentTimeMillis() - _pauseStateTimestamp) >= ACTIVE_HOST_LIST_REFRESH_INTERVAL_MASTER) {
LOG.info("Refreshing Active Host List");
_masterPauseStateBuffer = null;
// ok refresh the list ...
if (_engine != null) {
// ok ... update the host list via the engine ...
try {
_masterPauseStateBuffer = _engine.getActiveHostListAsBuffer();
}
catch (IOException e) {
LOG.error("queryActiveHosts threw Exception:"+ CCStringUtils.stringifyException(e));
}
}
_pauseStateTimestamp = System.currentTimeMillis();
_pauseStateTimestampIncremental++;
}
if (_masterPauseStateBuffer != null) {
rpcContext.getOutput().setActiveHostIds(_masterPauseStateBuffer);
}
// no matter what, echo the current timestamp (serial version)
rpcContext.getOutput().setPauseStateTimestamp(_pauseStateTimestampIncremental);
rpcContext.completeRequest();
}
void refreshMasterCrawlerActiveHostList() {
// ok if there is a master crawler, and it is online ...
if (_masterCrawlerServiceChannel != null && _masterCrawlerServiceChannel.isOpen()) {
try {
_masterCrawlerStub.queryActiveHosts(new Callback<NullMessage, ActiveHostInfo>() {
@Override
public void requestComplete(AsyncRequest<NullMessage, ActiveHostInfo> request) {
if (request.getStatus() == Status.Success) {
// ok update timestamp no matter what
_pauseStateTimestampIncremental = request.getOutput().getPauseStateTimestamp();
// and clear set ...
_pausedHostsSet = null;
// now see if we have a valid response ...
if (request.getOutput().getActiveHostIds().getCount() != 0) {
LOG.info("Received New Active Host Set From Master Crawler At:" + _masterCrawlerAddress);
// ok we have a valid list of hosts ...
// create a reader stream
DataInputBuffer inputStream = new DataInputBuffer();
inputStream.reset(request.getOutput().getActiveHostIds().getReadOnlyBytes(),0,request.getOutput().getActiveHostIds().getCount());
try {
// create a set ...
Set<Integer> ipAddressSet = new TreeSet<Integer>();
// populate it
int ipAddressCount = WritableUtils.readVInt(inputStream);
for (int i=0;i<ipAddressCount;++i) {
ipAddressSet.add(WritableUtils.readVInt(inputStream));
}
LOG.info("Successfully updated Active Host Set");
// ok replace set ...
_pausedHostsSet = ipAddressSet;
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
}
});
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
// ok no matter what... check to see if we need to set up refresh timer ...
if (_masterCrawlerHostListRefreshTimer == null) {
_masterCrawlerHostListRefreshTimer = new Timer(ACTIVE_HOST_LIST_REFRESH_INTERVAL_CLIENT,true,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
// call refresh again ...
refreshMasterCrawlerActiveHostList();
}
});
_eventLoop.setTimer(_masterCrawlerHostListRefreshTimer);
}
}
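/** Illustrative sketch only (not part of the original service): encodes a host id
* set with the same VInt framing that refreshMasterCrawlerActiveHostList decodes
* above. The helper name and the use of org.apache.hadoop.io.DataOutputBuffer are
* assumptions, not an existing API of this class. **/
static byte[] encodeActiveHostIds(Set<Integer> hostIPs) throws IOException {
org.apache.hadoop.io.DataOutputBuffer out = new org.apache.hadoop.io.DataOutputBuffer();
// write the host count first, then one VInt per packed IPV4 address
WritableUtils.writeVInt(out, hostIPs.size());
for (int ipAddress : hostIPs) {
WritableUtils.writeVInt(out, ipAddress);
}
// trim the buffer's backing array to the written length
byte[] encoded = new byte[out.getLength()];
System.arraycopy(out.getData(), 0, encoded, 0, out.getLength());
return encoded;
}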
/** get the serial pause state timestamp **/
final public int getPauseStateSerialTimestamp() { return _pauseStateTimestampIncremental; }
/** check to see if a host is paused by a master controller **/
final public boolean isHostPaused(CrawlHost host) {
if (_pausedHostsSet != null) {
return _pausedHostsSet.contains(host.getIPAddress());
}
return false;
}
@Override
public void timerFired(Timer timer) {
if (timer == _handshakeTimer) {
if (_handshakeState == HandshakeState.NOT_INITIATED) {
initiateHandshake();
}
else if (_handshakeState == HandshakeState.IDLE) {
if (_registration != null) {
if (System.currentTimeMillis() - _registration.getLastTimestamp() >= 1000) {
_handshakeState = HandshakeState.RENEWING;
try {
_masterRPCStub.extendRegistration(_registration, new AsyncRequest.Callback<SlaveRegistration, NullMessage>() {
@Override
public void requestComplete(AsyncRequest<SlaveRegistration, NullMessage> request) {
if (request.getStatus() == Status.Success) {
_registration.setLastTimestamp(System.currentTimeMillis());
_handshakeState = HandshakeState.IDLE;
}
else {
LOG.error("Handshake Extension Failed! - Initiating Shutdown!");
try {
shutdownServices();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
});
} catch (RPCException e) {
LOG.error("Lease Renewal Failed with Error:"+ CCStringUtils.stringifyException(e));
try {
shutdownServices();
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
}
}
}
}
}
}
}
JsonObject _masterProperties = new JsonObject();
public void initiateHandshake() {
_handshakeState = HandshakeState.INITIATING;
LOG.info("Connected to Master. Initiating Handshake");
SlaveHello slaveHello = new SlaveHello();
slaveHello.setIpAddress(IPAddressUtils.IPV4AddressToInteger(_serverAddress.getAddress().getAddress()));
slaveHello.setCookie(System.currentTimeMillis());
slaveHello.setServiceName("crawler");
try {
_masterRPCStub.registerSlave(slaveHello,new AsyncRequest.Callback<SlaveHello, SlaveRegistration>() {
@Override
public void requestComplete(AsyncRequest<SlaveHello, SlaveRegistration> request) {
if (request.getStatus() == Status.Success) {
LOG.info("Master Handshake Successful");
_registration = request.getOutput();
_registration.setLastTimestamp(System.currentTimeMillis());
_handshakeState = HandshakeState.IDLE;
LOG.info("Starting Crawler");
try {
_masterProperties = new JsonObject();
if (request.getOutput().getPropertiesHash().length() != 0) {
_masterProperties = new JsonParser().parse(request.getOutput().getPropertiesHash()).getAsJsonObject();
if (_masterProperties.has(CrawlEnvironment.PROPERTY_SEGMENT_DATA_DIR)) {
LOG.info("Master Overrode Segment Data Dir to:"
+ _masterProperties.get(CrawlEnvironment.PROPERTY_SEGMENT_DATA_DIR).getAsString());
CrawlEnvironment.setCrawlSegmentDataDirectory(_masterProperties.get(CrawlEnvironment.PROPERTY_SEGMENT_DATA_DIR).getAsString());
}
if (_masterProperties.has(CrawlEnvironment.PROPERTY_CONTENT_DATA_DIR)) {
LOG.info("Master Overrode Content Data Dir to:"
+ _masterProperties.get(CrawlEnvironment.PROPERTY_CONTENT_DATA_DIR).getAsString());
_crawlContentPath = new Path(_masterProperties.get(CrawlEnvironment.PROPERTY_CONTENT_DATA_DIR).getAsString());
}
}
startServices();
} catch (IOException e) {
LOG.error("Crawler Start Failed with Exception:" + CCStringUtils.stringifyException(e));
try {
shutdownServices();
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
}
}
}
else {
LOG.error("Handshake to Master Failed");
_handshakeState = HandshakeState.NOT_INITIATED;
}
}
});
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
void startServices() throws IOException {
// ok , first things first, send init to history server
CrawlHistoryStatus crawlStatus = new CrawlHistoryStatus();
crawlStatus.setActiveCrawlNumber(_crawlerStatus.getActiveListNumber());
LOG.info("Sending Sync to HistoryServer");
_historyServiceStub.sync(crawlStatus,new Callback<CrawlHistoryStatus, NullMessage>() {
@Override
public void requestComplete(AsyncRequest<CrawlHistoryStatus, NullMessage> request) {
LOG.info("Received response from HistoryServer");
if (request.getStatus() == Status.Success) {
LOG.info("History Server Sync Successful - Initializing Engine");
if (initializeEngine(_crawlerStatus.getActiveListNumber())) {
LOG.info("Engine Initialization Successful. Starting Crawl for List:" + _crawlerStatus.getActiveListNumber());
// kick off the load process
_engine.loadCrawlSegments();
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.ACTIVE);
}
}
else {
LOG.error("History Server Sync Failed! Shutting Down Services!");
try {
shutdownServices();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
});
}
void doFinalEngineCleanup() throws IOException {
if (_engine != null) {
_engine.shutdown();
}
LOG.info("Engine shutdown complete.");
_engine = null;
// clear data directory
CrawlLog.purgeDataDirectory(getDataDirectory());
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.IDLE);
_registration = null;
_handshakeState = HandshakeState.NOT_INITIATED;
}
void shutdownServices()throws IOException {
LOG.info("Shutdown Services Initiated");
_handshakeState = HandshakeState.SHUTTING_DOWN;
if (_crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.ACTIVE || _crawlerStatus.getCrawlerState() == CrawlerStatus.CrawlerState.IDLE ) {
// shift state to pausing ...
_crawlerStatus.setCrawlerState(CrawlerStatus.CrawlerState.FLUSHING);
if (getEngine() != null) {
LOG.info("Stopping Crawl");
// stop the crawl ... wait for completion ...
getEngine().stopCrawl(new CrawlStopCallback() {
@Override
public void crawlStopped() {
LOG.info("Crawl Stopped");
// ok, now see if we can initiate a flush ...
if (getEngine() != null) {
if (getEngine()._crawlLog != null) {
LOG.info("Checkpointing CrawlLog");
getEngine()._crawlLog.forceFlushAndCheckpointLog(new CheckpointCompletionCallback() {
@Override
public void checkpointFailed(long checkpointId, Exception e) {
LOG.error("Checkpoint Failed!");
try {
doFinalEngineCleanup();
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
}
}
@Override
public void checkpointComplete(long checkpointId,Vector<Long> completedSegmentList) {
LOG.info("Checkpoint Complete");
try {
doFinalEngineCleanup();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
});
}
}
else {
LOG.info("Crawl Stopped but CrawlerEngine NULL");
try {
doFinalEngineCleanup();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
});
}
else {
LOG.info("Shutdown Called but Engine Already NULL");
doFinalEngineCleanup();
}
}
}
}