package org.commoncrawl.service.crawlmasterV2; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.async.Timer; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.protocol.CrawlMaster; import org.commoncrawl.protocol.SlaveHello; import org.commoncrawl.protocol.SlaveRegistration; import org.commoncrawl.rpc.base.internal.AsyncContext; import org.commoncrawl.rpc.base.internal.AsyncRequest.Status; import org.commoncrawl.rpc.base.internal.AsyncServerChannel; import org.commoncrawl.rpc.base.internal.NullMessage; import org.commoncrawl.rpc.base.shared.RPCException; import org.commoncrawl.server.CommonCrawlServer; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.IPAddressUtils; import com.google.common.collect.DiscreteDomains; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import com.google.common.collect.Ranges; import com.google.common.collect.Sets; import com.google.common.collect.TreeMultimap; import com.google.gson.JsonObject; public class CrawlMasterServer extends CommonCrawlServer implements CrawlMaster, Timer.Callback { String _s3AccessKey; String _s3Secret; static CrawlMasterServer _server; Multimap<Integer, SlaveRegistration> _registry = TreeMultimap.create(); Map<Integer,Integer> _ipToInstanceIdMap = Maps.newTreeMap(); Set<Integer> _instanceIds = Sets.newTreeSet(); private static final int WATCHDOG_DELAY = 1000; private static final int MAX_TIME_BETWEEN_HEARTBEATS = 120000; Timer _watchDogTimer = new Timer(WATCHDOG_DELAY,true,this); public static final Log LOG = LogFactory.getLog(CrawlMasterServer.class); // RPCS @Override public void registerSlave( AsyncContext<SlaveHello, SlaveRegistration> rpcContext) throws RPCException { LOG.info("Received RegisterSlave Request from:" + IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress()) + " Service:" + rpcContext.getInput().getServiceName()); Integer instanceId = _ipToInstanceIdMap.get(rpcContext.getInput().getIpAddress()); // assume failure ... rpcContext.setStatus(Status.Error_RequestFailed); if (instanceId == null) { LOG.info("No Instance Id Mapping Found for IP:"+ IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress())); instanceId = Iterables.getFirst(_instanceIds,-1); if (instanceId != -1) { // remove from id pool _instanceIds.remove(instanceId); // assign to ip pool _ipToInstanceIdMap.put(rpcContext.getInput().getIpAddress(), instanceId); } } if (instanceId != null && instanceId != -1) { // create service registration ... rpcContext.getOutput().setIpAddress(rpcContext.getInput().getIpAddress()); rpcContext.getOutput().setServiceName(rpcContext.getInput().getServiceName()); rpcContext.getOutput().setCookie(rpcContext.getInput().getCookie()); rpcContext.getOutput().setInstanceId(instanceId); rpcContext.getOutput().setLastTimestamp(System.currentTimeMillis()); rpcContext.getOutput().setPropertiesHash(_properties.toString()); // stash it away ... _registry.put(instanceId, rpcContext.getOutput()); // and echo it back to sender ... rpcContext.setStatus(Status.Success); // log it LOG.info("Successfully bound Service:" + rpcContext.getInput().getServiceName() +" IP:" + IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress()) + " Cookie:" + rpcContext.getInput().getCookie() + " to InstanceId:" + instanceId); } else { LOG.error("Unable to obtain instance Id for IP:"+ IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress())); rpcContext.setErrorDesc("No Instance Id Available for Specified IP Address"); } rpcContext.completeRequest(); } @Override public void extendRegistration( AsyncContext<SlaveRegistration, NullMessage> rpcContext) throws RPCException { // assume failure rpcContext.setStatus(Status.Error_RequestFailed); Integer instaceIdMapping = _ipToInstanceIdMap.get(rpcContext.getInput().getIpAddress()); // if ip address to instace id mapping matches ... if (instaceIdMapping != null && instaceIdMapping == rpcContext.getInput().getInstanceId()) { Collection<SlaveRegistration> registrations = _registry.get(rpcContext.getInput().getInstanceId()); // if the registration exists ... if (registrations != null && registrations.contains(rpcContext.getInput())) { // extend the registration ... rpcContext.getInput().setLastTimestamp(System.currentTimeMillis()); // store it ... registrations.remove(rpcContext.getInput()); registrations.add(rpcContext.getInput()); // set status bit rpcContext.setStatus(Status.Success); LOG.info("Extended registration for Service:" + rpcContext.getInput().getServiceName() +" IP:" + IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress()) + " Cookie:" + rpcContext.getInput().getCookie() + " InstanceId:" + rpcContext.getInput().getInstanceId() + " TS: " + rpcContext.getInput().getLastTimestamp()); } } rpcContext.completeRequest(); } @Override public void expireRegistration( AsyncContext<SlaveRegistration, NullMessage> rpcContext) throws RPCException { // assume failure rpcContext.setStatus(Status.Error_RequestFailed); Integer instaceIdMapping = _ipToInstanceIdMap.get(rpcContext.getInput().getIpAddress()); // if ip address to instace id mapping matches ... if (instaceIdMapping != null && instaceIdMapping == rpcContext.getInput().getInstanceId()) { Collection<SlaveRegistration> registrations = _registry.get(rpcContext.getInput().getInstanceId()); // if the registration exists ... if (registrations != null) { // remove the specified registration ... registrations.remove(rpcContext.getInput()); LOG.info("Released registration for Service:" + rpcContext.getInput().getServiceName() +" IP:" + IPAddressUtils.IntegerToIPAddressString(rpcContext.getInput().getIpAddress()) + " Cookie:" + rpcContext.getInput().getCookie() + " InstanceId:" + rpcContext.getInput().getInstanceId()); rpcContext.setStatus(Status.Success); } } rpcContext.completeRequest(); } //@Override protected String getDefaultLogFileName() { return "crawldb"; } @Override protected String getDefaultDataDir() { return CrawlEnvironment.DEFAULT_DATA_DIR; } @Override protected String getDefaultHttpInterface() { return CrawlEnvironment.DEFAULT_HTTP_INTERFACE; } @Override protected int getDefaultHttpPort() { return CrawlEnvironment.DEFAULT_DATABASE_HTTP_PORT; } @Override protected String getDefaultRPCInterface() { return CrawlEnvironment.DEFAULT_RPC_INTERFACE; } @Override protected int getDefaultRPCPort() { return CrawlEnvironment.DEFAULT_DATABASE_RPC_PORT; } @Override protected String getWebAppName() { return CrawlEnvironment.CRAWLMASTER_WEBAPP_NAME; } JsonObject _properties = new JsonObject(); @Override protected boolean parseArguements(String[] argv) { for(int i=0; i < argv.length;++i) { if (argv[i].equalsIgnoreCase("--awsAccessKey")) { if (i+1 < argv.length) { _s3AccessKey = argv[++i]; } } else if (argv[i].equalsIgnoreCase("--awsSecret")) { if (i+1 < argv.length) { _s3Secret = argv[++i]; } } else if (argv[i].equalsIgnoreCase("--segmentDataDir")) { _properties.addProperty(CrawlEnvironment.PROPERTY_SEGMENT_DATA_DIR, argv[++i]); } else if (argv[i].equalsIgnoreCase("--contentDataDir")) { _properties.addProperty(CrawlEnvironment.PROPERTY_CONTENT_DATA_DIR, argv[++i]); } } return true; } @Override protected void printUsage() { System.out.println("Database Startup Args: --dataDir [data directory]"); } @Override protected boolean startDaemons() { getEventLoop().setTimer(_watchDogTimer); return true; } @Override protected void stopDaemons() { getEventLoop().cancelTimer(_watchDogTimer); } @Override protected boolean initServer() { _server = this; // populate instance ids ... _instanceIds.addAll(Ranges.open(-1, CrawlEnvironment.NUM_CRAWLERS).asSet(DiscreteDomains.integers())); LOG.info("Available Instance Ids Are: "+ _instanceIds); try { // create server channel ... AsyncServerChannel channel = new AsyncServerChannel(this, getEventLoop(), getServerAddress(),null); // register RPC services it supports ... registerService(channel,CrawlMaster.spec); // open the server channel .. channel.open(); } catch (IOException e) { LOG.fatal(CCStringUtils.stringifyException(e)); return false; } return true; } @Override public void timerFired(Timer timer) { //LOG.info("Heartbeat timer fired."); // ok walk registry expiring stuff ... Iterator<Entry<Integer, SlaveRegistration>> registrations = _registry.entries().iterator(); while (registrations.hasNext()) { Entry<Integer, SlaveRegistration> registration = registrations.next(); if ((System.currentTimeMillis() - registration.getValue().getLastTimestamp()) >= MAX_TIME_BETWEEN_HEARTBEATS) { LOG.info("Released registration for Service:" + registration.getValue().getServiceName() +" IP:" + IPAddressUtils.IntegerToIPAddressString(registration.getValue().getIpAddress()) + " Cookie:" + registration.getValue().getCookie() + " InstanceId:" + registration.getValue().getInstanceId() + " Current TS:" + System.currentTimeMillis() + " Last TS:" + registration.getValue().getLastTimestamp()); registrations.remove(); } } // now walk address mappings Iterator<Entry<Integer,Integer>> ipToInstanceIdIterator = _ipToInstanceIdMap.entrySet().iterator(); while (ipToInstanceIdIterator.hasNext()) { Entry<Integer,Integer> mapping = ipToInstanceIdIterator.next(); if (_registry.get(mapping.getValue()).size() == 0) { LOG.info("Instance Id:" + mapping.getValue() + " has no more registrations associated with IP:" + IPAddressUtils.IntegerToIPAddressString(mapping.getKey()) + " - Expiring."); ipToInstanceIdIterator.remove(); // reclaim id _instanceIds.add(mapping.getValue()); LOG.info("Recalimed Instance Id:" + mapping.getValue()+ " New Set is:" + _instanceIds); } } } }