package dmg.cells.nucleus ; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; import org.apache.curator.framework.CuratorFramework; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileNotFoundException; import java.io.PrintWriter; import java.io.Serializable; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.Consumer; import java.util.function.Function; import dmg.util.AuthorizedString; import dmg.util.Gate; import dmg.util.command.Command; import dmg.util.logback.FilterShell; import org.dcache.alarms.AlarmMarkerFactory; import org.dcache.alarms.PredefinedAlarm; import static com.google.common.util.concurrent.Futures.allAsList; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; import static org.bouncycastle.asn1.x500.style.RFC4519Style.name; import static org.dcache.util.ByteUnit.MiB; /** * * * @author Patrick Fuhrmann * @version 0.1, 15 Feb 1998 */ public class SystemCell extends CellAdapter implements Thread.UncaughtExceptionHandler { private static final Logger _log = LoggerFactory.getLogger(SystemCell.class); /* Released on OOM to increase the chance that the shutdown succeeds. */ private byte[] _oomSafetyBuffer = new byte[MiB.toBytes(2)]; private final CellShell _cellShell ; private final CellNucleus _nucleus ; private int _packetsReceived, _packetsAnswered, _packetsForwarded, _packetsReplied, _exceptionCounter; private final Runtime _runtime = Runtime.getRuntime() ; private final Gate _shutdownLock = new Gate(false); private class TheKiller extends Thread { @Override public void run(){ _log.info("Running shutdown sequence"); kill() ; _log.info("Kill done, waiting for shutdown lock"); _shutdownLock.check() ; _log.info("Killer done"); } } public static SystemCell create(String cellDomainName, CuratorFramework curatorFramework) { CellNucleus.initCellGlue(cellDomainName, curatorFramework); return new SystemCell(); } protected SystemCell() { super("System", "System", ""); _nucleus = getNucleus(); _cellShell = new CellShell(getNucleus()); } @Override protected void starting() { /* We start the curator here to get the right context for the curator threads. */ CellNucleus.startCurator(); _cellShell.addCommandListener(this); _cellShell.addCommandListener(new LogbackShell()); _cellShell.addCommandListener(new FilterShell(_nucleus.getLoggingThresholds())); _cellShell.addCommandListener(_cellShell.new HelpCommands()); useInterpreter(false); _runtime.addShutdownHook(new TheKiller()); } @Override protected void started() { Thread.setDefaultUncaughtExceptionHandler(this); } @Override public void stopped() { shutdownSystem(); CellNucleus.shutdownCellGlue(); _log.info("Opening shutdown lock"); _shutdownLock.open(); System.exit(0); } // // interface from Cell // public String toString(){ long fm = _runtime.freeMemory() ; long tm = _runtime.totalMemory() ; return getCellDomainName()+ ":IOrec="+_packetsReceived+ ";IOexc="+_exceptionCounter+ ";MEM="+(tm-fm) ; } @Command(name = "get hostname", hint = "show this dCache-domain hostname", description = "Returns the hostname of the computer this (dCache) " + "domain is running at. The hostname returned can be either " + "the fully qualified domain name for this IP address " + "or just 'localhost', if the local host name could not" + " be resolved into an address.") public class GetHostnameCommand implements Callable<String> { @Override public String call() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } catch (UnknownHostException ex) { return "localhost"; } } } private void shutdownSystem() { List<String> names = _nucleus.getCellNames(); List<String> nonSystem = new ArrayList<>(names.size()); List<String> system = new ArrayList<>(names.size()); for (String name: names) { CellInfo info = _nucleus.getCellInfo(name); if (info == null) { continue; } String cellName = info.getCellName(); if (cellName.equals("System")) { // Don't kill the system cell } else if (info.getCellType().equals("System")) { system.add(cellName); } else { nonSystem.add(cellName); } } _log.info("Will try to shutdown non-system cells {}", nonSystem); shutdownCells(nonSystem, 5000, 10000); _log.info("Will try to shutdown remaining cells {}", system); shutdownCells(system, 5000, 10000); } /** * Shuts down named cells. The method will block until the cells * are dead or until a timeout has occurred. * * @param cells List of names of cells to kill. * @param softTimeout Timeout in milliseconds to wait until we log the cells we are waiting for * @param hardTimeout Timeout in milliseconds to wait until we log stack traces and give up */ private void shutdownCells(List<String> cells, long softTimeout, long hardTimeout) { /* We log the completion of cell shutdown from a listener. */ long start = System.currentTimeMillis(); Function<String, Runnable> listeners = name -> () -> { long time = System.currentTimeMillis() - start; if (time > softTimeout) { _log.warn("Killed {} in {} ms", name, time); } else { _log.info("Killed {}", name); } }; /* Kill all the cells. */ Map<String, ListenableFuture<?>> futures = cells.stream().collect(toMap(name -> name, _nucleus::kill)); /* And attach the listener. */ futures.forEach((name, future) -> future.addListener(listeners.apply(name), MoreExecutors.directExecutor())); /* Now wait. */ try { try { Futures.successfulAsList(futures.values()).get(softTimeout, TimeUnit.MILLISECONDS); } catch (TimeoutException e) { futures.forEach((name, future) -> { if (!future.isDone()) { _log.warn("Still waiting for {} to shut down.", name); } }); Futures.successfulAsList(futures.values()).get(hardTimeout - softTimeout, TimeUnit.MILLISECONDS); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); } catch (TimeoutException e) { futures.forEach((name, future) -> { if (!future.isDone()) { CellNucleus.listThreadGroupOf(name); } }); CellNucleus.listKillerThreadGroup(); } catch (ExecutionException e) { _log.error("Unexpected exception during shutdown.", e.getCause()); } } @Override public void getInfo(PrintWriter pw) { pw.append(" CellDomainName = ").println(getCellDomainName()); pw.format(" I/O rcv=%d;asw=%d;frw=%d;rpy=%d;exc=%d\n", _packetsReceived, _packetsAnswered, _packetsForwarded, _packetsReplied, _exceptionCounter); long fm = _runtime.freeMemory(); long tm = _runtime.totalMemory(); pw.format(" Memory : tot=%d;free=%d;used=%d\n", tm, fm, tm - fm); pw.println(" Cells (Threads)"); for (String name: _nucleus.getCellNames()) { pw.append(" ").append(name).append("("); Thread[] threads = _nucleus.getThreads(name); if (threads != null) { boolean first = true; for (Thread thread: threads) { pw.print(thread.getName()); if (first) { first = false; } else { pw.print(","); } } } pw.println(")"); } } @Override public void messageToForward( CellMessage msg ){ try{ sendMessage( msg ) ; _packetsForwarded ++ ; }catch( RuntimeException eee ){ _exceptionCounter ++ ; } } @Override public void messageArrived( CellMessage msg ){ _log.info( "Message arrived : "+msg ) ; _packetsReceived ++ ; if( msg.isReply() ){ _log.warn("Seems to a bounce : "+msg); return ; } Object obj = msg.getMessageObject() ; Serializable reply; if(obj instanceof String) { String command = (String) obj; if (command.isEmpty()) { return; } _log.info("Command: {}", command); if (command.equals("xyzzy")) { reply = "Nothing happens."; } else { reply = _cellShell.objectCommand2(command); } }else if( obj instanceof AuthorizedString ){ AuthorizedString as = (AuthorizedString)obj ; String command = as.toString() ; if( command.length() < 1 ) { return; } _log.info( "Command(p="+as.getAuthorizedPrincipal()+") : "+command ) ; reply = _cellShell.objectCommand2( command ) ; } else { return; } _log.debug("Reply : {}", reply); _packetsAnswered++; try { if (reply instanceof Reply) { ((Reply) reply).deliver(this, msg); } else { msg.revertDirection(); msg.setMessageObject(reply); sendMessage(msg); _log.debug("Sending : {}", msg); } _packetsReplied++; }catch( RuntimeException e ){ _exceptionCounter ++ ; } } @Override public void uncaughtException(Thread t, Throwable e) { /* In case of fatal errors we shut down. The wrapper script * will restart the domain. Notice that there is no guarantee * that the fatal error will not reoccur during shutdown and * in that case the shutdown may fail. We may want to consider * refining the shutdown logic such that in recovers if the * fatal error reoccurs. */ if (e instanceof VirtualMachineError) { _oomSafetyBuffer = null; kill(); _log.error(AlarmMarkerFactory.getMarker(PredefinedAlarm.FATAL_JVM_ERROR, getCellDomainName(), getCellName()), "Restarting due to fatal JVM error", e); return; } Throwable root = Throwables.getRootCause(e); if (root instanceof FileNotFoundException) { if (root.getMessage().contains("Too many open files")) { _log.error(AlarmMarkerFactory.getMarker(PredefinedAlarm.OUT_OF_FILE_DESCRIPTORS, getCellDomainName(), getCellName()), "Uncaught exception in thread " + t.getName(), e); return; } } _log.error("Uncaught exception in thread " + t.getName(), e); } }