/** * Copyright 2011 LiveRamp * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.liveramp.hank.partition_server; import java.io.File; import java.io.IOException; import java.lang.reflect.Field; import java.nio.ByteBuffer; import java.nio.channels.Selector; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.log4j.PropertyConfigurator; import org.apache.thrift.TException; import org.apache.thrift.protocol.TCompactProtocol; import org.apache.thrift.protocol.TProtocol; import org.apache.thrift.server.TThreadedSelectorServer; import org.apache.thrift.transport.TFramedTransport; import org.apache.thrift.transport.TNonblockingServerSocket; import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TTransportException; import com.liveramp.hank.config.InvalidConfigurationException; import com.liveramp.hank.config.PartitionServerConfigurator; import com.liveramp.hank.config.yaml.YamlPartitionServerConfigurator; import com.liveramp.hank.coordinator.Coordinator; import com.liveramp.hank.coordinator.Host; import com.liveramp.hank.coordinator.HostCommand; import com.liveramp.hank.coordinator.HostCommandQueueChangeListener; import com.liveramp.hank.coordinator.HostState; import com.liveramp.hank.coordinator.Hosts; import com.liveramp.hank.coordinator.PartitionServerAddress; import com.liveramp.hank.coordinator.Ring; import com.liveramp.hank.coordinator.RingGroup; import com.liveramp.hank.util.CommandLineChecker; import com.liveramp.hank.util.HankTimer; import com.liveramp.hank.util.UpdateStatisticsRunnable; import com.liveramp.hank.zookeeper.WatchedNodeListener; import static com.liveramp.hank.util.LocalHostUtils.getHostName; public class PartitionServer implements HostCommandQueueChangeListener, WatchedNodeListener<HostCommand> { private static final Logger LOG = LoggerFactory.getLogger(PartitionServer.class); private static final long MAIN_THREAD_STEP_SLEEP_MS = 1000; private static final int UPDATE_FILESYSTEM_STATISTICS_THREAD_SLEEP_TIME_MS_DEFAULT = 2 * 60 * 1000; private static final int NUM_WARMUP_QUERIES_PER_THREAD = 100; private static final long MAX_BUFFER_SIZE = 1L << 24; // 16MB private final PartitionServerConfigurator configurator; private final Coordinator coordinator; private final LinkedBlockingQueue<HostCommand> commandQueue; private boolean stopping = false; private boolean hasProcessedCommandOnStartup = false; private final PartitionServerAddress hostAddress; private final Host host; private Thread updateThread; private Thread offlineWatcherThread; private TThreadedSelectorServer dataServer; private Thread dataServerThread; private boolean waitForDataServer; private final RingGroup ringGroup; private Thread shutdownHook; private UpdateFilesystemStatisticsRunnable updateFilesystemStatisticsRunnable; private Thread updateFilesystemStatisticsThread; public PartitionServer(PartitionServerConfigurator configurator, String hostName) throws IOException { this.configurator = configurator; this.coordinator = configurator.createCoordinator(); this.commandQueue = new LinkedBlockingQueue<HostCommand>(); hostAddress = new PartitionServerAddress(hostName, configurator.getServicePort()); ringGroup = coordinator.getRingGroup(configurator.getRingGroupName()); if (ringGroup == null) { throw new RuntimeException("Could not get ring group: " + configurator.getRingGroupName()); } Ring ring = ringGroup.getRingForHost(hostAddress); if (ring == null) { throw new RuntimeException("Could not get ring for host address: " + hostAddress + " in ring group " + ringGroup.getName()); } host = ring.getHostByAddress(hostAddress); if (host == null) { throw new RuntimeException("Could not get host for host address: " + hostAddress + " in ring group " + ringGroup.getName() + " ring " + ring.getRingNumber()); } if (Hosts.isOnline(host)) { throw new RuntimeException("Could not start a partition server for host " + host + " since it is already online."); } host.setCommandQueueChangeListener(this); host.setCurrentCommandChangeListener(this); host.setEnvironmentFlags(configurator.getEnvironmentFlags()); // Start the update filesystem statistics thread updateFilesystemStatisticsRunnable = new UpdateFilesystemStatisticsRunnable(); updateFilesystemStatisticsThread = new Thread(updateFilesystemStatisticsRunnable, "Update Filesystem Statistics"); updateFilesystemStatisticsThread.setDaemon(true); updateFilesystemStatisticsThread.start(); } public void run() throws IOException, InterruptedException { // Add shutdown hook addShutdownHook(); // Initialize and process commands setStateSynchronized(HostState.IDLE); // In case of exception, server will stop and state will be coherent. // Wait for state to propagate addServerOfflineWatcher(); while (host.getState() != HostState.IDLE) { LOG.info("Waiting for Host state " + HostState.IDLE + " to propagate."); Thread.sleep(100); } processCommandOnStartup(); while (!stopping) { try { HostCommand command = commandQueue.poll(MAIN_THREAD_STEP_SLEEP_MS, TimeUnit.MILLISECONDS); if (command != null) { try { processCommand(command, host.getState()); } catch (IOException e) { LOG.error("Failed to process command: " + command, e); break; } } } catch (InterruptedException e) { LOG.info("Interrupted in main loop. Exiting.", e); break; } } // Shuting down LOG.info("Partition server main thread is stopping."); // Stop serving data stopServingData(); // Stop updating if necessary stopUpdating(); // Signal OFFLINE setStateSynchronized(HostState.OFFLINE); // In case of exception, server will stop and state will be coherent. // Remove shutdown hook. We don't need it anymore as we just set the host state to OFFLINE removeShutdownHook(); // Disconnect from zookeeper coordinator.close(); } private void addServerOfflineWatcher() { Runnable serverOfflineWatcher = new Runnable() { @Override public void run() { //TODO make these configurable long units = 10l; TimeUnit unit = TimeUnit.MINUTES; try { while (true) { HostState state = getStateSafe(); LOG.info("Current state: "+state); if (state == null || HostState.OFFLINE.equals(state)) { LOG.info("OFFLINE. Starting shutdown countdown"); startShutDownCountdown(units, unit); } unit.sleep(units); } }catch (Exception e){ LOG.error("Watcher thread encountered an error - thread will stop for safety", e); } } public HostState getStateSafe() { HostState state; try { state = host.getState(); } catch (IOException e) { LOG.error("Offline watcher failed to get state, counting as OFFLINE. Exception: ", e); state = HostState.OFFLINE; } return state; } private void startShutDownCountdown(long units, TimeUnit unit) { try { unit.sleep(units); HostState state = getStateSafe(); if (state == null || HostState.OFFLINE.equals(state)) { LOG.error("Partition Server was OFFLINE for " + units + " " + unit.toString()); stopSynchronized(); }else{ LOG.error("Shutdown cancelled, state is currently: " + state); } } catch (InterruptedException e) { LOG.error("Interrupted while performing shutdown countdown", e); } } }; offlineWatcherThread = new Thread(serverOfflineWatcher, "Server Offline Watcher"); offlineWatcherThread.setDaemon(true); offlineWatcherThread.start(); } // Stop the partition server. Can be called from another thread. public synchronized void stopSynchronized() { stop(); } // Stop the partition server private void stop() { stopping = true; } protected IfaceWithShutdown getHandler() throws IOException { return new PartitionServerHandler(hostAddress, configurator, coordinator); } protected IUpdateManager getUpdateManager() throws IOException { return new UpdateManager(configurator, host, ringGroup); } @Override public synchronized void onCommandQueueChange(Host host) { LOG.info("Command queue changed."); // Do not process anything when we have not yet tried to process a command when starting up. if (!hasProcessedCommandOnStartup) { LOG.info("Ignoring command queue change as commands have not yet been executed on startup."); return; } try { HostCommand command = host.getCurrentCommand(); if (command == null) { // When command queue changes, and current command is empty, move on to next command host.nextCommand(); } else { // A current command was already in place, we are still executing it. Do nothing. LOG.info("Ignoring command queue change as a command is currently being executed."); } } catch (IOException e) { LOG.error("Failed to move on to next command.", e); stop(); } } @Override public void onWatchedNodeChange(final HostCommand command) { LOG.info("Current command changed: " + command); // Do not process anything when stopping if (stopping) { LOG.error("Ignoring command " + command + " because server is stopping."); return; } try { if (command != null) { commandQueue.put(command); } } catch (InterruptedException e) { LOG.error("Failed to process command.", e); stopSynchronized(); } } public void processCommandOnStartup() { try { HostCommand command = host.getCurrentCommand(); LOG.info("Current command is: " + command); if (command != null) { commandQueue.put(command); } else { host.nextCommand(); } } catch (Exception e) { LOG.error("Failed to process current command on startup.", e); stop(); } hasProcessedCommandOnStartup = true; } private synchronized void setStateSynchronized(HostState state) throws IOException { // In case of failure to set host state, stop the partition server and rethrow the exception. try { host.setState(state); } catch (IOException e) { stop(); throw e; } } private synchronized HostCommand nextCommandSynchronized() throws IOException { // In case of failure to move on to next command, stop the partition server and rethrow the exception. try { return host.nextCommand(); } catch (IOException e) { stop(); throw e; } } private void processCommand(HostCommand command, HostState state) throws IOException { LOG.info("Processing command: " + command); switch (command) { case EXECUTE_UPDATE: processExecuteUpdate(state); break; case GO_TO_IDLE: processGoToIdle(state); break; case SERVE_DATA: processServeData(state); break; } } private void processGoToIdle(HostState state) throws IOException { switch (state) { case SERVING: // Set IDLE state proactively. If the shutting down hangs, it will be safe since clients will be already aware. host.setState(HostState.IDLE); // In case of exception, server will stop and state will be coherent. stopServingData(); host.nextCommand(); // In case of exception, server will stop and state will be coherent. break; default: LOG.info(ignoreIncompatibleCommandMessage(HostCommand.GO_TO_IDLE, state)); host.nextCommand(); // In case of exception, server will stop and state will be coherent. } } private void processExecuteUpdate(HostState state) throws IOException { switch (state) { case IDLE: host.setState(HostState.UPDATING); // In case of exception, server will stop and state will be coherent. executeUpdate(); // Next command is set by the updater thread break; default: LOG.info(ignoreIncompatibleCommandMessage(HostCommand.EXECUTE_UPDATE, state)); host.nextCommand(); // In case of exception, server will stop and state will be coherent. } } private void processServeData(HostState state) throws IOException { switch (state) { case IDLE: serveData(); host.setState(HostState.SERVING); // In case of exception, server will stop and state will be coherent. host.nextCommand(); // In case of exception, server will stop and state will be coherent. break; default: LOG.info(ignoreIncompatibleCommandMessage(HostCommand.SERVE_DATA, state)); host.nextCommand(); // In case of exception, server will stop and state will be coherent. } } private void executeUpdate() { if (updateThread != null) { LOG.error("Update got called while one is already running!"); return; } Runnable updateRunnable = new Runnable() { @Override public void run() { try { IUpdateManager updateManager = getUpdateManager(); updateManager.update(); LOG.info("Update succeeded."); } catch (Throwable e) { LOG.error("Update failed. Updater encountered a fatal error:", e); try { long cooldown = configurator.getUpdateFailureCooldown(); LOG.error("Will retry update in "+cooldown+ "ms."); Thread.sleep(cooldown); } catch (InterruptedException e1) { // no op } } // Go back to IDLE even in case of failure try { setStateSynchronized(HostState.IDLE); // In case of exception, server will stop and state will be coherent. } catch (IOException e) { LOG.error("Failed to record state change.", e); } // Move on to next command try { nextCommandSynchronized(); // In case of exception, server will stop and state will be coherent. } catch (IOException e) { LOG.error("Failed to move on to next command.", e); } // Signal that update thread is done. updateThread = null; } }; updateThread = new Thread(updateRunnable, "Update manager thread"); updateThread.start(); } private void stopUpdating() throws InterruptedException { if (updateThread != null) { LOG.info("Update thread is still running. Interrupting and waiting for it to finish..."); updateThread.interrupt(); updateThread.join(); // In case of interrupt exception, server will stop and state will be coherent. } } protected void startThriftServer() throws TTransportException, IOException, InterruptedException { IfaceWithShutdown handler = null; try { // Set up the service handler handler = getHandler(); // Launch the thrift server TNonblockingServerSocket serverSocket = new TNonblockingServerSocket(configurator.getServicePort()); TThreadedSelectorServer.Args options = new TThreadedSelectorServer.Args(serverSocket); options.processor(new com.liveramp.hank.generated.PartitionServer.Processor(handler)); options.workerThreads(configurator.getNumConcurrentQueries()); options.selectorThreads(4); options.protocolFactory(new TCompactProtocol.Factory()); options.maxReadBufferBytes = MAX_BUFFER_SIZE; dataServer = new TThreadedSelectorServer(options); LOG.info("Launching Thrift server."); dataServer.serve(); LOG.info("Thrift server exited."); // The Thrift server does not clean up selectors after stopping, which leads to a file descriptor leak. // See https://issues.apache.org/jira/browse/THRIFT-2274 // TODO: when the bug is fixed in Thrift, remove this ugly hack which takes care of the issue List<Selector> selectors = getServerSelectors(dataServer); closeServerSelectors(selectors); // Close the socket serverSocket.close(); } finally { // Always shut down the handler if (handler != null) { LOG.debug("Shutting down Partition Server handler."); handler.shutDown(); } } } private void serveData() throws IOException { waitForDataServer = true; if (dataServer != null) { LOG.info("Data server is already running. Cannot serve data."); return; } Runnable r = new Runnable() { @Override public void run() { try { startThriftServer(); } catch (Throwable t) { // Data server is probably going down unexpectedly, stop the partition server LOG.error("Data server thread encountered a fatal throwable and is stopping.", t); // Stop waiting for data server waitForDataServer = false; // Stop partition server main thread stopSynchronized(); } } }; dataServerThread = new Thread(r, "PartitionServer Thrift data server thread"); LOG.info("Launching data server thread."); dataServerThread.start(); try { while (dataServer == null || !dataServer.isServing()) { if (!waitForDataServer) { LOG.info("Data server encountered an error. Stop waiting for it to start."); break; } LOG.debug("Data server isn't online yet. Waiting..."); Thread.sleep(1000); } } catch (InterruptedException e) { throw new IOException("Interrupted while waiting for data server thread to start", e); } if (dataServer == null || !dataServer.isServing()) { throw new IOException("Failed to start data server"); } else { warmUp(); LOG.info("Data server online and serving."); } } private void stopServingData() { if (dataServer == null) { return; } LOG.info("Stopping data server thread."); dataServer.stop(); try { dataServerThread.join(); } catch (InterruptedException e) { LOG.error("Interrupted while waiting for data server thread to stop. Continuing.", e); } dataServer = null; dataServerThread = null; LOG.info("Data server thread stopped"); } private String ignoreIncompatibleCommandMessage(HostCommand command, HostState state) { return String.format("Ignoring command %s because it is incompatible with state %s.", command, state); } private Map<String, FilesystemStatisticsAggregator> getFilesystemStatistics() throws IOException { Map<String, FilesystemStatisticsAggregator> result = new HashMap<String, FilesystemStatisticsAggregator>(); for (String filesystemRoot : getUsedFilesystemRoots()) { File filesystemRootFile = new File(filesystemRoot); result.put(filesystemRoot, new FilesystemStatisticsAggregator(filesystemRootFile.getTotalSpace(), filesystemRootFile.getUsableSpace())); } return result; } private Set<String> getUsedFilesystemRoots() throws IOException { return configurator.getDataDirectories(); /* // Create set of system roots Set<String> filesystemRoots = new HashSet<String>(); for (File root : File.listRoots()) { filesystemRoots.add(root.getCanonicalPath()); } // Determine set of used roots Set<String> result = new HashSet<String>(); for (String dataDirectoryPath : configurator.getDataDirectories()) { String dataDirectoryCanonicalPath = new File(dataDirectoryPath).getCanonicalPath(); String bestFilesystemRoot = null; for (String filesystemRoot : filesystemRoots) { if (dataDirectoryCanonicalPath.startsWith(filesystemRoot) && (bestFilesystemRoot == null || bestFilesystemRoot.length() < filesystemRoot.length())) { bestFilesystemRoot = filesystemRoot; } } if (bestFilesystemRoot == null) { throw new RuntimeException("Unable to determine filesystem root for directory: " + dataDirectoryCanonicalPath); } result.add(bestFilesystemRoot); } return result; */ } private void warmUp() throws IOException { LOG.info("Warming up data server..."); List<Thread> threads = new ArrayList<Thread>(); for (int i = 0; i < configurator.getNumConcurrentQueries(); ++i) { threads.add(new Thread(new WarmupRunnable(), "Warmup Thread #" + i)); } HankTimer timer = new HankTimer(); for (Thread thread : threads) { thread.start(); } for (Thread thread : threads) { try { thread.join(); } catch (InterruptedException e) { LOG.error("Failed to warm up data server", e); throw new IOException("Failed to warm up data server", e); } } long warmupDurationMs = timer.getDurationMs(); LOG.info("Warming up data server took " + warmupDurationMs + " ms"); } /** * This thread periodically updates statistics of the Host */ private class UpdateFilesystemStatisticsRunnable extends UpdateStatisticsRunnable implements Runnable { public UpdateFilesystemStatisticsRunnable() { super(UPDATE_FILESYSTEM_STATISTICS_THREAD_SLEEP_TIME_MS_DEFAULT); } @Override public void runCore() throws IOException { Hosts.setFilesystemStatistics(host, getFilesystemStatistics()); } @Override protected void cleanup() { try { Hosts.deleteFilesystemStatistics(host); } catch (IOException e) { LOG.error("Error while deleting runtime statistics.", e); throw new RuntimeException(e); } } } // Set the host to OFFLINE on VM shutdown private void addShutdownHook() { if (shutdownHook == null) { shutdownHook = new Thread() { @Override public void run() { try { if (host != null) { host.setState(HostState.OFFLINE); } } catch (IOException e) { // When VM is exiting and we fail to set host to OFFLINE, swallow the exception } } }; Runtime.getRuntime().addShutdownHook(shutdownHook); } } private void removeShutdownHook() { if (shutdownHook != null) { Runtime.getRuntime().removeShutdownHook(shutdownHook); } } static List<Selector> getServerSelectors(TThreadedSelectorServer server) { List<Selector> result = new ArrayList<Selector>(); try { // Get accept thread selector Field acceptThreadField = server.getClass().getDeclaredField("acceptThread"); acceptThreadField.setAccessible(true); Thread acceptThread = (Thread)acceptThreadField.get(server); Field acceptSelectorField = acceptThread.getClass().getDeclaredField("acceptSelector"); acceptSelectorField.setAccessible(true); Selector acceptSelector = (Selector)acceptSelectorField.get(acceptThread); result.add(acceptSelector); // Get the other selectors Field selectorThreadField = server.getClass().getDeclaredField("selectorThreads"); selectorThreadField.setAccessible(true); Set selectorThreads = (Set)selectorThreadField.get(server); for (Object selectorThread : selectorThreads) { Field selectorThreadSelectorField = selectorThread.getClass().getSuperclass().getDeclaredField("selector"); selectorThreadSelectorField.setAccessible(true); Selector selector = (Selector)selectorThreadSelectorField.get(selectorThread); result.add(selector); } } catch (NoSuchFieldException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } return result; } static void closeServerSelectors(List<Selector> selectors) { for (Selector selector : selectors) { try { selector.close(); } catch (IOException e) { throw new RuntimeException(e); } } } public static void main(String[] args) throws IOException, InvalidConfigurationException, InterruptedException { CommandLineChecker.check(args, new String[]{"configuration_file_path", "log4j_properties_file_path"}, PartitionServer.class); String configPath = args[0]; String log4jprops = args[1]; PartitionServerConfigurator configurator = new YamlPartitionServerConfigurator(configPath); PropertyConfigurator.configure(log4jprops); new PartitionServer(configurator, getHostName()).run(); } private class WarmupRunnable implements Runnable { @Override public void run() { TSocket socket = null; TFramedTransport transport = null; try { socket = new TSocket(host.getAddress().getHostName(), host.getAddress().getPortNumber(), 0); transport = new TFramedTransport(socket); transport.open(); TProtocol proto = new TCompactProtocol(transport); com.liveramp.hank.generated.PartitionServer.Client client = new com.liveramp.hank.generated.PartitionServer.Client(proto); // Perform queries for (int i = 0; i < NUM_WARMUP_QUERIES_PER_THREAD; i++) { client.get(0, ByteBuffer.wrap(new byte[0])); } } catch (TException e) { LOG.error("Failed to warm up data server", e); throw new RuntimeException("Failed to warm up data server", e); } finally { if (transport != null) { transport.close(); } if (socket != null) { socket.close(); } } } } }