/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.server;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Enumeration;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.rpc.base.internal.Server;
import org.commoncrawl.util.RuntimeStatsCollector;
/**
*
* @author rana
*
*/
public abstract class CommonCrawlServer extends Server {
/**
 * Holder for the command-line options common to every CommonCrawlServer
 * instance; populated by parseCommonConfig() and copied into the Hadoop
 * Configuration in main(). Unset values stay null / -1.
 */
protected static final class CommonConfig {
// fully qualified name of the CommonCrawlServer subclass to run (--server)
public String _className;
// explicit hostname override (--hostname)
public String _hostName;
// rpc interface name or literal ip (--rpcIntfc)
public String _rpcInterface;
// rpc listen port, -1 when unset (--rpcPort)
public int _rpcPort = -1;
// web interface name or literal ip (--httpIntfc)
public String _webInterface;
// web listen port, -1 when unset (--httpPort)
public int _webPort = -1;
// config file name (--conf); parsed but not consumed in this file
public String _configName;
// local data directory path (--dataDir)
public String _dataDir = null;
// log directory path (--logDir)
public String _logDir = null;
// dns resolver pool size, -1 when unset (--dnsPoolSize)
public int _dnsThreadPoolSize = -1;
}
public static final Log LOG = LogFactory.getLog(CommonCrawlServer.class);
// default size of the general-purpose default thread pool
private static final int DEFAULT_MAX_THREADPOOL_THREADS = 5;
// default size of the dns resolution thread pool
private static final int DEFAULT_DNS_THREAD_POOL_SIZE = 50;
// well-known id under which the dns pool is registered (see init())
public static final String DNS_POOL_NAME = "dns";
/** return the ip address for a given network interface **/
/**
 * Pre-compiled matcher for dotted-quad IPv4 literals. The dots are escaped:
 * the previous inline pattern used bare "." (matches any character), so
 * strings like "1a2b3c4" were wrongly accepted as IP addresses. Hoisted to a
 * constant so the pattern is compiled once instead of on every call.
 */
private static final Pattern IPV4_LITERAL_PATTERN =
    Pattern.compile("([0-9]+)\\.([0-9]+)\\.([0-9]+)\\.([0-9]+)");

/**
 * Return the IPv4 address(es) associated with the given network interface.
 *
 * @param strInterface an interface name (e.g. "eth0"), "localhost"/"lo", or a
 *          literal dotted-quad IPv4 address (returned unchanged)
 * @return array of dotted-quad IPv4 strings, or null when the interface does
 *         not exist or could not be queried
 * @throws UnknownHostException retained for interface compatibility
 */
protected static String[] getIPs(String strInterface) throws UnknownHostException {
  // a literal ip address passed in place of an interface name is returned as-is
  Matcher m = IPV4_LITERAL_PATTERN.matcher(strInterface);
  if (m.matches()) {
    LOG.info("getIPs detected ip address as interface name. returned: " + strInterface);
    return new String[] { strInterface };
  }
  // loopback aliases map directly to 127.0.0.1
  if (strInterface.equalsIgnoreCase("localhost") || strInterface.equalsIgnoreCase("lo")) {
    LOG.info("getIPs for localhost name:" + strInterface + " returned: 127.0.0.1");
    return new String[] { "127.0.0.1" };
  }
  try {
    NetworkInterface netIF = NetworkInterface.getByName(strInterface);
    if (netIF == null) {
      return null;
    }
    Vector<String> ips = new Vector<String>();
    Enumeration<InetAddress> e = netIF.getInetAddresses();
    while (e.hasMoreElements()) {
      InetAddress address = e.nextElement();
      // only allow ipv4 addresses for now ... (raw address is 4 bytes)
      if (address.getAddress().length == 4) {
        LOG.info("getIPs for name:" + strInterface + " found:" + address.getHostAddress());
        ips.add(address.getHostAddress());
      }
    }
    return ips.toArray(new String[ips.size()]);
  } catch (SocketException e) {
    // treat an unqueryable interface the same as a missing one
    return null;
  }
}
/**
 * Process entry point: loads the Hadoop configuration resources, parses the
 * common command-line options, reflectively instantiates the requested
 * CommonCrawlServer subclass, and runs it (init -> start -> join). Any
 * startup failure is logged and exits the JVM with status -1.
 */
public static void main(String argv[]) throws Exception {
try {
Configuration conf = new Configuration();
conf.addResource("core-site.xml");
conf.addResource("mapred-site.xml");
conf.addResource("hdfs-site.xml");
// conf.addResource("hadoop-site.xml");
// conf.addResource("commoncrawl-default.xml");
// conf.addResource("commoncrawl-site.xml");
/*
 * conf.setClassLoader( new ClassLoader() {
 *
 * @Override protected Class<?> findClass(String name) throws
 * ClassNotFoundException { if (name.startsWith("org.crawlcommons")) { //
 * this is a hack to deal with the problem of having a bunch of serialized
 * data in hdfs sequence files that refers to // protocol buffers with the
 * old crawler package name of org.crawlcommons instead of the new package
 * name of org.commoncrawl // we replace the crawlcommons string and call
 * back into the class loader to re-resolve the name... name =
 * name.replaceFirst("org.crawlcommons", "org.commoncrawl"); return
 * loadClass(name); }
 *
 * return super.findClass(name); }
 *
 * });
 */
// set this config object as our global config
CrawlEnvironment.setHadoopConfig(conf);
_commonConfig = parseCommonConfig(argv);
// --server is mandatory: without a class name there is nothing to run
if (_commonConfig._className == null) {
printCommonUsage();
return;
}
// copy any explicitly supplied command-line options into the config, since
// init() reads its settings from the Configuration only
if (_commonConfig._hostName != null) {
conf.set("org.commoncrawl.hostname", _commonConfig._hostName);
}
if (_commonConfig._rpcInterface != null) {
conf.set("org.commoncrawl.rpcInterface", _commonConfig._rpcInterface);
}
if (_commonConfig._webInterface != null) {
conf.set("org.commoncrawl.httpInterface", _commonConfig._webInterface);
}
if (_commonConfig._rpcPort != -1) {
conf.setInt("org.commoncrawl.rpcPort", _commonConfig._rpcPort);
}
if (_commonConfig._webPort != -1) {
conf.setInt("org.commoncrawl.httpPort", _commonConfig._webPort);
}
if (_commonConfig._dataDir != null) {
conf.set("org.commoncrawl.dataDir", _commonConfig._dataDir);
}
if (_commonConfig._logDir != null) {
conf.set("commoncrawl.log.dir", _commonConfig._logDir);
}
if (_commonConfig._dnsThreadPoolSize != -1) {
conf.setInt("org.commoncrawl.dnsThreadPoolSize", _commonConfig._dnsThreadPoolSize);
}
LOG.info("Log File Is:" + System.getProperty("commoncrawl.log.file"));
LOG.info("Instantiating Class:" + _commonConfig._className);
// reflectively load and construct the requested server subclass; the cast
// fails fast if the class is not a CommonCrawlServer
Class theClass = conf.getClassByName(_commonConfig._className);
Object serverInstance = theClass.newInstance();
CommonCrawlServer server = CommonCrawlServer.class.cast(serverInstance);
StringUtils.startupShutdownMessage(theClass, argv, LOG);
if (server.init(argv, conf)) {
try {
server.start();
// block until the server's event loop thread exits
server.join();
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
} finally {
// always attempt an orderly shutdown, even after a failed start
server.stopDaemons();
server.stop();
}
}
} catch (Throwable e) {
// any startup failure is fatal to the process
LOG.error(StringUtils.stringifyException(e));
e.printStackTrace();
System.exit(-1);
}
}
/**
 * Parse the command-line options common to all CommonCrawlServer instances.
 *
 * Every option consumes exactly one value argument. An option appearing last
 * on the command line with no value is now skipped for ALL options; the
 * original only bounds-checked some of them, so a trailing --logDir,
 * --dnsPoolSize, or --detailLogging threw ArrayIndexOutOfBoundsException.
 * Unrecognized arguments are ignored so subclasses can define their own flags.
 *
 * @param argv raw command-line arguments (may be null)
 * @return a CommonConfig populated with whichever options were present
 */
public static CommonConfig parseCommonConfig(String argv[]) {
  CommonConfig configOut = new CommonConfig();
  if (argv != null) {
    for (int i = 0; i < argv.length; ++i) {
      // true when the option at position i is followed by a value argument
      boolean hasValue = (i + 1 < argv.length);
      if (argv[i].equalsIgnoreCase("--server")) {
        if (hasValue) {
          configOut._className = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--hostname")) {
        if (hasValue) {
          configOut._hostName = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--rpcIntfc")) {
        if (hasValue) {
          configOut._rpcInterface = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--httpIntfc")) {
        if (hasValue) {
          configOut._webInterface = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--rpcPort")) {
        if (hasValue) {
          configOut._rpcPort = Integer.parseInt(argv[++i]);
        }
      } else if (argv[i].equalsIgnoreCase("--httpPort")) {
        if (hasValue) {
          configOut._webPort = Integer.parseInt(argv[++i]);
        }
      } else if (argv[i].equalsIgnoreCase("--conf")) {
        if (hasValue) {
          configOut._configName = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--dataDir")) {
        if (hasValue) {
          configOut._dataDir = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--logDir")) {
        // bounds check added -- original indexed argv[++i] unconditionally
        if (hasValue) {
          configOut._logDir = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--dnsPoolSize")) {
        // bounds check added -- original indexed argv[++i] unconditionally
        if (hasValue) {
          configOut._dnsThreadPoolSize = Integer.parseInt(argv[++i]);
        }
      } else if (argv[i].equalsIgnoreCase("--detailLogging")) {
        // bounds check added -- original indexed argv[++i] unconditionally
        if (hasValue) {
          int value = Integer.parseInt(argv[++i]);
          Environment.enableDetailLog((value == 1));
        }
      }
    }
  }
  return configOut;
}
/**
 * Print the common command-line usage to stderr. The option names here match
 * the flags parseCommonConfig actually recognizes: the previous text
 * advertised --rpcInterface/--httpInterface, which the parser rejects (it
 * accepts --rpcIntfc/--httpIntfc), and omitted --logDir, --dnsPoolSize and
 * --detailLogging entirely.
 */
private static void printCommonUsage() {
  System.err
      .println("Usage: java CommonCrawlServer --server [server class] --hostname [hostname] "
          + "--rpcIntfc [rpc server interface] --rpcPort [rpc server port] "
          + "--httpIntfc [web server interface] --httpPort [web server port] "
          + "--dataDir [data directory] --logDir [log directory] "
          + "--dnsPoolSize [dns thread pool size] --detailLogging [0|1] "
          + "--conf [config file] [server args]");
}
// active Hadoop configuration for this process (set in init())
protected Configuration _configuration;
// process-wide singleton instance, assigned by the constructor
private static CommonCrawlServer _serverSingleton;
// common command-line options parsed in main()
protected static CommonConfig _commonConfig;
// hostname resolved during init() (config override or InetAddress.getLocalHost)
protected String _hostName;
// rpc bind address (selected ip + rpc port), established in init()
protected InetSocketAddress _serverAddress;
// validated local data directory
protected File _dataDir;
// embedded web server, constructed in init()
protected WebServer _webServer;
// flag forwarded to the WebServer constructor -- presumably selects async
// request dispatch; see dispatchAsyncWebRequest (NOTE(review): confirm)
protected boolean _useAsyncWebDispatch = false;
// size of the dns resolver pool registered under DNS_POOL_NAME
private int _dnsThreadPoolSize = DEFAULT_DNS_THREAD_POOL_SIZE;
// the server's main async event loop, created in init()
protected EventLoop _eventLoop = null;
/** default thread pool */
private ExecutorService _defaultThreadPool;
/** registered thread pools, keyed by pool id; guarded by its own monitor */
private TreeMap<String, ExecutorService> _threadPoolMap = new TreeMap<String, ExecutorService>();
/**
 * Thread-local handle on the process-wide server instance; every thread's
 * initial value is the singleton set by the constructor. Typed generically --
 * the original declared a raw ThreadLocal, defeating compile-time checking
 * and forcing an unchecked cast at the read site.
 */
private static ThreadLocal<CommonCrawlServer> _serverObject = new ThreadLocal<CommonCrawlServer>() {
  @Override
  protected CommonCrawlServer initialValue() {
    return _serverSingleton;
  }
};
/**
 * Return the CommonCrawlServer instance visible to the calling thread
 * (defaults to the process-wide singleton set by the constructor).
 */
public static CommonCrawlServer getServerSingleton() {
return (CommonCrawlServer) _serverObject.get();
}
/**
 * Records this instance as the process-wide singleton.
 * NOTE(review): "this" escapes before construction completes; safe only
 * because a single server is created on the main thread at startup.
 */
public CommonCrawlServer() {
_serverSingleton = this;
}
/**
 * Render a formatted table of thread-pool statistics (default pool first,
 * then all registered pools) into the supplied stats collector.
 *
 * @param collector destination for the formatted table
 */
public void collectStats(RuntimeStatsCollector collector) {
  // StringBuilder replaces the original StringBuffer: the buffer is a local
  // confined to this call, so synchronized appends buy nothing
  StringBuilder sb = new StringBuilder();
  sb.append("\n*****THREAD-POOL-STATE*****\n");
  sb.append(String.format("%1$10.10s ", "POOLNAME"));
  sb.append(String.format("%1$8.8s ", "TOTAL"));
  sb.append(String.format("%1$6.6s ", "ACTIVE"));
  sb.append(String.format("%1$8.8s\n", "PENDING"));
  // print out default thread pool stats first ...
  appendPoolStatsRow(sb, "default", (ThreadPoolExecutor) _defaultThreadPool);
  synchronized (_threadPoolMap) {
    for (String poolName : _threadPoolMap.keySet()) {
      appendPoolStatsRow(sb, poolName, (ThreadPoolExecutor) _threadPoolMap.get(poolName));
    }
  }
  collector.setStringValue(ServerStats.ID, ServerStats.Name.CommonServer_ThreadPoolInfo, sb.toString());
}

/**
 * Append one stats row: pool name, total tasks, active tasks, and pending
 * tasks (total - completed - active). Factored out of collectStats, which
 * previously duplicated these seven lines for the default and named pools.
 */
private static void appendPoolStatsRow(StringBuilder sb, String poolName, ThreadPoolExecutor executor) {
  long totalTaskCount = executor.getTaskCount();
  long completedTaskCount = executor.getCompletedTaskCount();
  long activeCount = executor.getActiveCount();
  sb.append(String.format("%1$10.10s ", poolName));
  sb.append(String.format("%1$8.8s ", totalTaskCount));
  sb.append(String.format("%1$6.6s ", activeCount));
  sb.append(String.format("%1$8.8s\n", totalTaskCount - completedTaskCount - activeCount));
}
/**
 * dispatch an async web request on the server's main event loop thread
 *
 * @param request
 *          the request to dispatch
 * @throws IOException rethrown when the request recorded an exception while
 *           executing on the event loop
 */
public void dispatchAsyncWebRequest(final AsyncWebServerRequest request) throws IOException {
request.dispatch(_eventLoop);
// surface any exception captured during dispatch on the event thread
if (request.getException() != null) {
throw request.getException();
}
}
/** @return the active Hadoop configuration (set during init()) */
public Configuration getConfig() {
return _configuration;
}
/** @return the validated local data directory (created during init()) */
public File getDataDirectory() {
return _dataDir;
}
// subclass-supplied defaults, used when no explicit configuration is present
protected abstract String getDefaultDataDir();
protected abstract String getDefaultHttpInterface();
protected abstract int getDefaultHttpPort();
protected abstract String getDefaultLogFileName();
protected abstract String getDefaultRPCInterface();
protected abstract int getDefaultRPCPort();
/** @return the shared general-purpose thread pool */
public synchronized ExecutorService getDefaultThreadPool() {
return _defaultThreadPool;
}
/** @return the server's main async event loop */
public EventLoop getEventLoop() {
return _eventLoop;
}
/** @return the hostname resolved during init() */
public String getHostName() {
return _hostName;
}
/**
 * Return the log directory: the "commoncrawl.log.dir" config value when set,
 * otherwise the system property of the same name, defaulting to ./logs.
 */
public File getLogDirectory() {
  // single lookup -- the original queried the configuration twice
  String configuredDir = _configuration.get("commoncrawl.log.dir", null);
  if (configuredDir != null) {
    return new File(configuredDir);
  }
  return new File(System.getProperty("commoncrawl.log.dir", "./logs"));
}
/** @return the RPC bind address (selected ip + rpc port) established in init() */
public InetSocketAddress getServerAddress() {
return _serverAddress;
}
/** @return the simple class name of the concrete server subclass */
public String getServerName() {
return this.getClass().getSimpleName();
}
/**
 * Look up a previously registered thread pool.
 *
 * @param threadPoolId the id the pool was registered under
 * @return the pool's ExecutorService, or null when no such pool exists
 */
public ExecutorService getThreadPool(String threadPoolId) {
  synchronized (_threadPoolMap) {
    return _threadPoolMap.get(threadPoolId);
  }
}
// subclass hook: name of the web application served by the embedded web server
protected abstract String getWebAppName();
/** @return the embedded web server constructed during init() */
public WebServer getWebServer() {
return _webServer;
}
/**
 * Common one-time initialization for all CommonCrawlServer subclasses:
 * parses subclass arguments, resolves interface names to IP addresses,
 * validates the data directory, and constructs the event loop and web server.
 *
 * @param argv the full command-line argument array (forwarded to the subclass)
 * @param conf the already-populated Hadoop configuration
 * @return true on success, false when startup should be aborted
 * @throws IOException on network interface / web server setup failure
 */
protected final boolean init(String argv[], Configuration conf) throws IOException {
  _configuration = conf;
  if (!parseArguements(argv)) {
    printUsage();
    return false;
  }
  // give the subclass a chance to adjust config before values are read below
  overrideConfig(conf);
  // initialize the default thread pool
  _defaultThreadPool = Executors.newFixedThreadPool(_configuration.getInt("org.commoncrawl.threadpool.max.threads",
      DEFAULT_MAX_THREADPOOL_THREADS));
  _hostName = _configuration.get("org.commoncrawl.hostname");
  if (_hostName == null) {
    // no configured hostname -- fall back to the local host's name
    InetAddress localHostAddr = InetAddress.getLocalHost();
    _hostName = localHostAddr.getHostName();
    _configuration.set("org.commoncrawl.hostname", _hostName);
  }
  if (Environment.detailLogEnabled())
    LOG.info("Hostname is: " + _hostName);
  String rpcInterface = _configuration.get("org.commoncrawl.rpcInterface", getDefaultRPCInterface());
  String webInterface = _configuration.get("org.commoncrawl.httpInterface", getDefaultHttpInterface());
  int rpcPort = _configuration.getInt("org.commoncrawl.rpcPort", getDefaultRPCPort());
  int httpPort = _configuration.getInt("org.commoncrawl.httpPort", getDefaultHttpPort());
  String dataDirectory = _configuration.get("org.commoncrawl.dataDir", getDefaultDataDir());
  // log the directory actually selected (the original logged
  // getDefaultDataDir(), hiding any configured override)
  LOG.info("Data Dir is:" + dataDirectory);
  _dnsThreadPoolSize = _configuration.getInt("org.commoncrawl.dnsThreadPoolSize", DEFAULT_DNS_THREAD_POOL_SIZE);
  // validate data directory
  _dataDir = new File(dataDirectory);
  if (!_dataDir.exists()) {
    if (!_dataDir.mkdirs()) {
      System.out.println("Unable to create data directory: " + dataDirectory);
      return false;
    }
  } else if (!_dataDir.isDirectory()) {
    System.out.println("Invalid data directory:" + dataDirectory);
    return false;
  }
  // write the resolved values back into the shared config ...
  LOG.info("Config says rpcInterface is:" + rpcInterface);
  conf.set("org.commoncrawl.rpcInterface", rpcInterface);
  LOG.info("Config says httpInterface is:" + webInterface);
  conf.set("org.commoncrawl.httpInterface", webInterface);
  LOG.info("Config says rpcPort is:" + rpcPort);
  conf.setInt("org.commoncrawl.rpcPort", rpcPort);
  LOG.info("Config says httpPort is:" + httpPort);
  conf.setInt("org.commoncrawl.httpPort", httpPort);
  LOG.info("Config says dataDir is:" + dataDirectory);
  conf.set("org.commoncrawl.dataDir", dataDirectory);
  // extract ip address for rpc / web interfaces
  String rpcIPS[] = getIPs(rpcInterface);
  String webIPS[] = getIPs(webInterface);
  if (rpcIPS == null || rpcIPS.length == 0) {
    LOG.error("No Valid IP Addresses found for RPC Interface:" + rpcInterface);
    return false;
  }
  if (webIPS == null || webIPS.length == 0) {
    LOG.error("No Valid IP Addresses found for Web Interface:" + webInterface);
    return false;
  }
  String selectedRPCInterface = selectPreferredAddress(rpcIPS);
  // BUGFIX: select the web address from webIPS -- the original iterated
  // rpcIPS here, so the web server could bind an rpc-side address
  String selectedWebInterface = selectPreferredAddress(webIPS);
  LOG.info("RPC Interface translates to IP:" + selectedRPCInterface);
  LOG.info("Web Interface translates to IP:" + selectedWebInterface);
  // initialize the server address
  _serverAddress = new InetSocketAddress(selectedRPCInterface, rpcPort);
  // init the event loop; dns resolution runs on its own registered pool
  _eventLoop = new EventLoop(registerThreadPool(DNS_POOL_NAME, _dnsThreadPoolSize));
  // and the Web Server
  _webServer = new WebServer(this, selectedWebInterface, httpPort, false, _useAsyncWebDispatch);
  _webServer.setAttribute("commoncrawl.server", this);
  // initialize the underlying server
  if (initServer()) {
    // log the startup time
    LOG.info(_commonConfig._className + " up at: " + _serverAddress);
    return true;
  }
  return false;
}

/**
 * Pick an address from a non-empty candidate list: the first entry by
 * default; when several exist, prefer the first one NOT ending in ".1".
 * Factored out of init(), which previously duplicated this loop for the
 * rpc and web interfaces.
 */
private static String selectPreferredAddress(String[] candidates) {
  String selected = candidates[0];
  if (candidates.length > 1) {
    for (String candidate : candidates) {
      if (!candidate.endsWith(".1")) {
        selected = candidate;
        break;
      }
    }
  }
  return selected;
}
// subclass hook: server-specific initialization; return false to abort startup
protected abstract boolean initServer();
/** Block until the event loop thread exits (i.e. the server shuts down). */
private void join() throws InterruptedException {
if (_eventLoop.getEventThread() != null) {
_eventLoop.getEventThread().join();
}
}
/** Subclass hook run before init() reads config values; default is a no-op. */
protected void overrideConfig(Configuration conf) {
}
// subclass hook: parse server-specific arguments; false -> print usage, abort
protected abstract boolean parseArguements(String argv[]);
// subclass hook: print server-specific usage text
protected abstract void printUsage();
/**
 * Return the thread pool registered under the given id, creating a fixed-size
 * pool on first use.
 *
 * @param threadPoolId id under which the pool is (or will be) registered
 * @param maxThreads pool size, used only when the pool does not yet exist
 * @return the existing or newly created pool
 */
public ExecutorService registerThreadPool(String threadPoolId, int maxThreads) {
  synchronized (_threadPoolMap) {
    ExecutorService existing = _threadPoolMap.get(threadPoolId);
    if (existing != null) {
      return existing;
    }
    ExecutorService created = Executors.newFixedThreadPool(maxThreads);
    _threadPoolMap.put(threadPoolId, created);
    return created;
  }
}
/**
 * Select async web dispatch; must be called before init(), which passes the
 * flag to the WebServer constructor.
 */
public void setAsyncWebDispatch(boolean asyncWebDispatch) {
_useAsyncWebDispatch = asyncWebDispatch;
}
/**
 * Start the server: installs a JVM shutdown hook that calls stop(), starts
 * the event loop, and -- only if the subclass daemons start cleanly -- the
 * RPC server and finally the web server.
 * NOTE(review): when startDaemons() returns false the event loop is left
 * running with no servers attached; confirm that is intended.
 */
@Override
public void start() throws IOException {
// add a shutdown hook ...
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
stop();
}
}));
_eventLoop.start();
if (startDaemons()) {
// start RPC server
super.start();
// and finally start web server ...
_webServer.start();
}
}
// subclass hook: start server-specific daemons; false skips rpc/web startup
protected abstract boolean startDaemons();
/** called to initiate an orderly shutdown of the service **/
@Override
public void stop() {
// stop RPC Server
super.stop();
/*
 * try { // and web server _webServer.stop(); } catch (InterruptedException
 * e) {
 *
 * }
 */
// and finally stop daemons
stopDaemons();
_eventLoop.stop();
}
// subclass hook: stop server-specific daemons (also called from main's finally)
protected abstract void stopDaemons();
/**
 * Shut down the current default thread pool, blocking until its queued tasks
 * complete, and install a fresh pool in its place.
 */
public void terminateDefaultThreadPool() {
  ExecutorService oldThreadPool = null;
  synchronized (this) {
    // swap in the replacement first so callers never observe a null pool
    oldThreadPool = _defaultThreadPool;
    _defaultThreadPool = Executors.newFixedThreadPool(_configuration.getInt("org.commoncrawl.threadpool.max.threads",
        DEFAULT_MAX_THREADPOOL_THREADS));
  }
  oldThreadPool.shutdown();
  try {
    while (!oldThreadPool.awaitTermination(1000, TimeUnit.MILLISECONDS)) {
      LOG.info("Awaiting shutdown for Default ThreadPool");
    }
  } catch (InterruptedException e) {
    LOG.error(StringUtils.stringifyException(e));
    // restore interrupt status so callers can observe the interruption
    // (the original swallowed it)
    Thread.currentThread().interrupt();
  }
}
/**
 * Unregister the named thread pool and block until its queued tasks complete.
 * No-op when no pool is registered under the given id.
 *
 * @param threadPoolId id the pool was registered under
 */
public void terminateThreadPool(String threadPoolId) {
  ExecutorService service = null;
  synchronized (_threadPoolMap) {
    service = _threadPoolMap.remove(threadPoolId);
  }
  if (service != null) {
    LOG.info("Terminating ThreadPool:" + threadPoolId);
    service.shutdown();
    try {
      while (!service.awaitTermination(1000, TimeUnit.MILLISECONDS)) {
        LOG.info("Awaiting shutdown for ThreadPool:" + threadPoolId);
      }
    } catch (InterruptedException e) {
      LOG.error(StringUtils.stringifyException(e));
      // restore interrupt status so callers can observe the interruption
      // (the original swallowed it)
      Thread.currentThread().interrupt();
    }
  }
}
}