/** * Copyright 2008 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package net.sf.katta.node; import java.io.File; import java.io.IOException; import java.net.BindException; import java.util.Collection; import net.sf.katta.node.monitor.IMonitor; import net.sf.katta.operation.node.NodeOperation; import net.sf.katta.operation.node.OperationResult; import net.sf.katta.operation.node.ShardRedeployOperation; import net.sf.katta.protocol.ConnectedComponent; import net.sf.katta.protocol.InteractionProtocol; import net.sf.katta.protocol.NodeQueue; import net.sf.katta.protocol.metadata.NodeMetaData; import net.sf.katta.util.NodeConfiguration; import net.sf.katta.util.ThrottledInputStream.ThrottleSemaphore; import org.I0Itec.zkclient.ExceptionUtil; import org.I0Itec.zkclient.NetworkUtil; import org.I0Itec.zkclient.exception.ZkInterruptedException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC.Server; import org.apache.log4j.Logger; public class Node implements ConnectedComponent { protected final static Logger LOG = Logger.getLogger(Node.class); private final NodeConfiguration _nodeConf; protected InteractionProtocol _protocol; private final IContentServer _contentServer; protected NodeContext _context; protected String _nodeName; private Server _rpcServer; private IMonitor _monitor; private Thread _nodeOperatorThread; private boolean _stopped; public Node(InteractionProtocol protocol, IContentServer server) { this(protocol, new NodeConfiguration(), server); } public Node(InteractionProtocol protocol, final NodeConfiguration configuration, IContentServer contentServer) { _protocol = protocol; _contentServer = contentServer; if (contentServer == null) { throw new IllegalArgumentException("Null server passed to Node()"); } _nodeConf = configuration; } /** * Boots the node */ public void start() { if (_stopped) { throw new IllegalStateException("Node cannot be started again after it was shutdown."); } LOG.info("starting rpc server with server class = " + _contentServer.getClass().getCanonicalName()); String hostName = NetworkUtil.getLocalhostName(); _rpcServer = startRPCServer(hostName, _nodeConf.getStartPort(), _contentServer, _nodeConf.getRpcHandlerCount()); _nodeName = hostName + ":" + _rpcServer.getListenerAddress().getPort(); _contentServer.init(_nodeName, _nodeConf); // we add hostName and port to the shardFolder to allow multiple nodes per // server with the same configuration File shardsFolder = new File(_nodeConf.getShardFolder(), _nodeName.replaceAll(":", "_")); LOG.info("local shard folder: " + shardsFolder.getAbsolutePath()); int throttleInKbPerSec = _nodeConf.getShardDeployThrottle(); final ShardManager shardManager; if (throttleInKbPerSec > 0) { LOG.info("throtteling of shard deployment to " + throttleInKbPerSec + " kilo-bytes per second"); shardManager = new ShardManager(shardsFolder, new ThrottleSemaphore(throttleInKbPerSec * 1024)); } else { shardManager = new ShardManager(shardsFolder); } _context = new NodeContext(_protocol, this, shardManager, _contentServer); _protocol.registerComponent(this); startMonitor(_nodeName, _nodeConf); init(); LOG.info("started node '" + _nodeName + "'"); } private synchronized void init() { redeployInstalledShards(); NodeMetaData nodeMetaData = new NodeMetaData(_nodeName); NodeQueue nodeOperationQueue = _protocol.publishNode(this, nodeMetaData); startOperatorThread(nodeOperationQueue); } private void startOperatorThread(NodeQueue nodeOperationQueue) { _nodeOperatorThread = new Thread(new NodeOperationProcessor(nodeOperationQueue, _context)); _nodeOperatorThread.setName(NodeOperationProcessor.class.getSimpleName() + ": " + getName()); _nodeOperatorThread.setDaemon(true); _nodeOperatorThread.start(); } @Override public synchronized void reconnect() { LOG.info(_nodeName + " reconnected"); init(); } @Override public synchronized void disconnect() { if (_nodeOperatorThread == null) { LOG.warn(_nodeName + " disconnected before initialization complete"); return; } LOG.info(_nodeName + " disconnected"); try { do { LOG.info("trying to stop node-processor..."); _nodeOperatorThread.interrupt(); _nodeOperatorThread.join(2500); } while (_nodeOperatorThread.isAlive()); } catch (InterruptedException e) { Thread.interrupted(); } // we keep serving the shards } private void redeployInstalledShards() { Collection<String> installedShards = _context.getShardManager().getInstalledShards(); ShardRedeployOperation redeployOperation = new ShardRedeployOperation(installedShards); try { redeployOperation.execute(_context); } catch (InterruptedException e) { ExceptionUtil.convertToRuntimeException(e); } } private void startMonitor(String nodeName, NodeConfiguration conf) { if (LOG.isTraceEnabled()) { LOG.trace("starting node monitor"); } String monitorClass = conf.getMonitorClass(); try { Class<?> c = Class.forName(monitorClass); _monitor = (IMonitor) c.newInstance(); _monitor.startMonitoring(nodeName, _protocol); } catch (Exception e) { LOG.error("Unable to start node monitor:", e); } } public void shutdown() { if (_stopped) { throw new IllegalStateException("already stopped"); } LOG.info("shutdown " + _nodeName + " ..."); _stopped = true; if (_monitor != null) { _monitor.stopMonitoring(); } _nodeOperatorThread.interrupt(); try { _nodeOperatorThread.join(); } catch (InterruptedException e) { Thread.interrupted();// proceed } _protocol.unregisterComponent(this); _rpcServer.stop(); try { _context.getContentServer().shutdown(); } catch (Throwable t) { LOG.error("Error shutting down server", t); } LOG.info("shutdown " + _nodeName + " finished"); } public String getName() { return _nodeName; } public NodeContext getContext() { return _context; } public int getRPCServerPort() { return _rpcServer.getListenerAddress().getPort(); } public boolean isRunning() { // TODO jz: improve this whole start/stop/isRunning thing return _context != null && !_stopped; } public void join() throws InterruptedException { _rpcServer.join(); } public Server getRpcServer() { return _rpcServer; } /* * Starting the hadoop RPC server that response to query requests. We iterate * over a port range of node.server.port.start + 10000 */ private static Server startRPCServer(String hostName, final int startPort, IContentServer nodeManaged, int handlerCount) { int serverPort = startPort; int tryCount = 10000; Server _rpcServer = null; while (_rpcServer == null) { try { _rpcServer = RPC.getServer(nodeManaged, "0.0.0.0", serverPort, handlerCount, false, new Configuration()); LOG.info(nodeManaged.getClass().getSimpleName() + " server started on : " + hostName + ":" + serverPort); } catch (final BindException e) { if (serverPort - startPort < tryCount) { serverPort++; // try again } else { throw new RuntimeException("tried " + tryCount + " ports and no one is free..."); } } catch (final IOException e) { throw new RuntimeException("unable to create rpc server", e); } } try { _rpcServer.start(); } catch (final IOException e) { throw new RuntimeException("failed to start rpc server", e); } return _rpcServer; } @Override protected void finalize() throws Throwable { super.finalize(); shutdown(); } @Override public String toString() { return _nodeName; } public InteractionProtocol getProtocol() { return _protocol; } protected static class NodeOperationProcessor implements Runnable { private final NodeQueue _queue; private final NodeContext _nodeContext; public NodeOperationProcessor(NodeQueue queue, NodeContext nodeContext) { _queue = queue; _nodeContext = nodeContext; } @Override public void run() { try { while (_nodeContext.getNode().isRunning()) { try { NodeOperation operation = _queue.peek(); OperationResult operationResult; try { LOG.info("executing " + operation); operationResult = operation.execute(_nodeContext); } catch (Exception e) { ExceptionUtil.rethrowInterruptedException(e); LOG.error(_nodeContext.getNode().getName() + ": failed to execute " + operation, e); operationResult = new OperationResult(_nodeContext.getNode().getName(), e); } _queue.complete(operationResult);// only remove after finish } catch (Throwable e) { ExceptionUtil.rethrowInterruptedException(e); LOG.fatal(_nodeContext.getNode().getName() + ": operation failure ", e); } } } catch (InterruptedException e) { Thread.interrupted(); } catch (ZkInterruptedException e) { Thread.interrupted(); } LOG.info("node operation processor for " + _nodeContext.getNode().getName() + " stopped"); } } }