/* * Galaxy * Copyright (c) 2012-2014, Parallel Universe Software Co. All rights reserved. * * This program and the accompanying materials are dual-licensed under * either the terms of the Eclipse Public License v1.0 as published by * the Eclipse Foundation * * or (per the licensee's choosing) * * under the terms of the GNU Lesser General Public License version 3.0 * as published by the Free Software Foundation. */ package co.paralleluniverse.galaxy.core; import co.paralleluniverse.common.MonitoringType; import static co.paralleluniverse.common.logging.LoggingUtils.hex; import co.paralleluniverse.common.spring.Service; import co.paralleluniverse.common.util.DegenerateInvocationHandler; import co.paralleluniverse.galaxy.Cluster; import co.paralleluniverse.galaxy.core.Cache.CacheLine; import co.paralleluniverse.galaxy.core.Message.BACKUP; import co.paralleluniverse.galaxy.core.Message.BACKUP_PACKET; import co.paralleluniverse.galaxy.core.Message.BACKUP_PACKETACK; import co.paralleluniverse.galaxy.core.Message.INV; import java.beans.ConstructorProperties; import java.lang.reflect.Proxy; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.cliffc.high_scale_lib.NonBlockingHashMapLong; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.jmx.export.annotation.ManagedAttribute; /** * * @author pron */ public class BackupImpl extends ClusterService implements Backup { // The trick here is to allow fast updates w/o copying the line buffer with each update (and generating garbage in the process). // We just want to mark updated lines, and copy their contents periodically during flushes. private static final Logger LOG = LoggerFactory.getLogger(BackupImpl.class); private long maxDelayNanos = TimeUnit.NANOSECONDS.convert(10, TimeUnit.MILLISECONDS); private final Comm serverComm; private final SlaveComm slaveComm; private Cache cache; // private final ReadWriteLock mapLock = new ReentrantReadWriteLock(); // this could become a bottleneck. consider replacing with a scalable lock private NonBlockingHashMapLong<BackupEntry> map; private final NonBlockingHashMapLong<BackupEntry> map1 = new NonBlockingHashMapLong<BackupEntry>(); private final NonBlockingHashMapLong<BackupEntry> map2 = new NonBlockingHashMapLong<BackupEntry>(); private volatile boolean copyImmediately; private final ReentrantLock currentBackupsLock = new ReentrantLock(); private final Condition currentBackupsPossiblyReady = currentBackupsLock.newCondition(); private final Map<Long, BACKUP> currentBackups = new HashMap<Long, BACKUP>(); private long nextId = 100000; private BACKUP_PACKET lastSent; private volatile boolean awaitServer; private volatile boolean awaitSlaves; private boolean shouldFlush; private long lastFlush; // private volatile boolean completedReplication = false; // private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor(); private final BackupMonitor monitor; @ConstructorProperties({"name", "cluster", "serverComm", "slaveComm", "monitoringType"}) public BackupImpl(String name, Cluster cluster, ServerComm serverComm, SlaveComm slaveComm, MonitoringType monitoringType) { this(name, cluster, serverComm, slaveComm, createMonitor(monitoringType, name)); } BackupImpl(String name, Cluster cluster, ServerComm serverComm, SlaveComm slaveComm, BackupMonitor monitor) { super(name, cluster); this.monitor = monitor; if (cluster.hasServer() && serverComm == null) throw new RuntimeException("Configured to have server but serverComm is null!"); this.serverComm = serverComm; this.slaveComm = slaveComm; if (slaveComm != null) slaveComm.setBackup(this); map = map1; } static BackupMonitor createMonitor(MonitoringType monitoringType, String name) { if (monitoringType == null) return (BackupMonitor) Proxy.newProxyInstance(Cache.class.getClassLoader(), new Class<?>[]{BackupMonitor.class}, DegenerateInvocationHandler.INSTANCE); else switch (monitoringType) { case JMX: return new JMXBackupMonitor(name); case METRICS: return new MetricsBackupMonitor(); } throw new IllegalArgumentException("Unknown MonitoringType " + monitoringType); } public void setMaxDelay(int maxDelayMillis) { assertDuringInitialization(); this.maxDelayNanos = TimeUnit.NANOSECONDS.convert(maxDelayMillis, TimeUnit.MILLISECONDS); } @ManagedAttribute public int getMaxDelay() { return (int) TimeUnit.MILLISECONDS.convert(maxDelayNanos, TimeUnit.NANOSECONDS); } @Override public void init() throws Exception { if (serverComm instanceof Service) removeDependency((Service) serverComm); super.init(); } @Override protected void postInit() throws Exception { ((Service) getCluster()).awaitAvailable(); // If a master already exists let the client slave-comm replicate. We'll go online when we're done (see handleReceivedBackup), and in the meantime // we won't present this node as a slave. // If not, I may become the master, or may go online shortly after another concurrently initializing node which will become the master, // in which case we can expect the replication to complete shortly (as the master won't have time to update that many items). if (getCluster().getMaster(getCluster().getMyNodeId()) == null) setReady(true); super.postInit(); } @Override protected void start(boolean master) { if (master) startFlushThread(); } @Override public void switchToMaster() { super.switchToMaster(); if (!isAvailable() || !completedReplication) { LOG.info("Node has not completed replication so cannot become master. Going offline!"); getCluster().goOffline(); } else { startFlushThread(); } } @Override protected void shutdown() { super.shutdown(); scheduler.shutdownNow(); } @Override public void setCache(Cache cache) { assertDuringInitialization(); this.cache = cache; } private void startFlushThread() { scheduler.scheduleAtFixedRate(new Runnable() { @Override public void run() { flushNow(); } }, maxDelayNanos, maxDelayNanos, TimeUnit.NANOSECONDS); } @Override public boolean inv(long id, short owner) { try { if (LOG.isDebugEnabled()) LOG.debug("INV {}, {}", id, owner); return !slaveComm.send(Message.INV(getCluster().getMyNodeId(), id, owner)); } catch (NodeNotFoundException e) { throw new AssertionError(e); } } @Override public boolean startBackup() { LOG.debug("start backup"); mapLock.readLock().lock(); if (copyImmediately) { currentBackupsLock.lock(); if (!copyImmediately) // test again currentBackupsLock.unlock(); else return true; } return false; } @Override public void endBackup(boolean locked) { LOG.debug("end backup"); mapLock.readLock().unlock(); if (locked) { currentBackupsPossiblyReady.signal(); currentBackupsLock.unlock(); } } /** * Must be called by the cache when the line is synchronized, and under a read-lock (i.e. between startBackup and endBackup) * * @param id * @param version */ @Override public void backup(long id, long version) { if (LOG.isDebugEnabled()) LOG.debug("Backup: {} ver: {} {}", new Object[]{hex(id), version, copyImmediately ? "(COPY)" : ""}); if (copyImmediately) { currentBackups.put(id, makeBackup(cache.getLine(id), version)); oldMap().remove(id); } else map.put(id, new BackupEntry(id, version)); } @Override public void flush() { scheduler.submit(new Runnable() { @Override public void run() { flushNow(); } }); } private void flushNow() { try { final NonBlockingHashMapLong<BackupEntry> oldMap = map; mapLock.writeLock().lock(); // just to make sure we're not copying in the middle of a transaction try { if (oldMap.isEmpty()) return; switchMaps(); // we switch the maps in the hopes that oldMap is complete, and so backups can continue to work on second map } finally { mapLock.writeLock().unlock(); } LOG.debug("FLUSHING"); currentBackupsLock.lock(); try { assert !copyImmediately; for (Iterator<BackupEntry> it = oldMap.values().iterator(); it.hasNext();) { final BackupEntry be = it.next(); final CacheLine line = cache.getLine(be.id); assert line != null; synchronized (line) { final Message.BACKUP backup = makeBackup(line, be.version); if (backup != null) { oldMap.remove(be.id); if (LOG.isDebugEnabled()) LOG.debug("Copied {} ver {} for backup", hex(be.id), be.version); currentBackups.put(be.id, backup); } else { if (LOG.isDebugEnabled()) LOG.debug("Matching version for {} ({}) not found", hex(be.id), be.version); this.copyImmediately = true; } } it.remove(); } } finally { currentBackupsLock.unlock(); } if (copyImmediately) { // backups incomplete LOG.debug("Incomplete backups. Completeing."); mapLock.writeLock().lock(); currentBackupsLock.lock(); try { for (Iterator<BackupEntry> it = map.values().iterator(); it.hasNext();) { final BackupEntry be = it.next(); final CacheLine line = cache.getLine(be.id); assert line != null; synchronized (line) { Message.BACKUP backup = makeBackup(line, be.version); if (backup != null) { map.remove(be.id); if (LOG.isDebugEnabled()) LOG.debug("Copied {} ver {} for backup", hex(be.id), be.version); currentBackups.put(be.id, backup); } else oldMap.put(be.id, be); } it.remove(); } } finally { currentBackupsLock.unlock(); mapLock.writeLock().unlock(); } currentBackupsLock.lock(); try { for (Iterator<BackupEntry> it = oldMap.values().iterator(); it.hasNext();) { final BackupEntry be = it.next(); final Message.BACKUP backup = currentBackups.get(be.id); if (backup != null && backup.getVersion() >= be.version) it.remove(); } while (!oldMap.isEmpty()) { LOG.debug("Waiting for missing transactions: {}", oldMap); currentBackupsPossiblyReady.await(); } this.copyImmediately = false; } finally { currentBackupsLock.unlock(); } } final BACKUP_PACKET packet = flush1(); if (packet != null) send(packet); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } private BACKUP_PACKET flush1() { currentBackupsLock.lock(); try { if (lastSent == null) { shouldFlush = false; this.lastFlush = System.nanoTime(); if (currentBackups.isEmpty()) return null; final BACKUP_PACKET packet; packet = Message.BACKUP_PACKET(nextId, currentBackups.values()); nextId++; lastSent = packet; currentBackups.clear(); return packet; } else { // last backup not yet acked LOG.debug("Last backup not acked. Not sending."); final long passedMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - lastFlush, TimeUnit.NANOSECONDS); if (passedMillis > 2000) LOG.warn("SLAVE HAS NOT ACKED IN {} MILLISECONDS. SOMETHING IS SERIOUSLY WRONG!", passedMillis); shouldFlush = true; return null; } } finally { currentBackupsLock.unlock(); } } private void send(BACKUP_PACKET packet) { monitor.addBackupPacket(); monitor.addBackups(packet.getBackups().size()); try { awaitServer = true; awaitSlaves = true; if (serverComm != null) { LOG.debug("Sending backup packet to server: {}", packet); serverComm.send(packet); } else ack(true); if (!slaveComm.send(packet)) ack(false); else LOG.debug("Sent backup packet to slaves: {}", packet); } catch (NodeNotFoundException e) { throw new RuntimeException("Server not found!", e); } } private void switchMaps() { if (map == map1) map = map2; else map = map1; } private NonBlockingHashMapLong<BackupEntry> oldMap() { return map == map1 ? map2 : map1; } private Message.BACKUP makeBackup(CacheLine line, long version) { if (line.getVersion() != version) return null; final Message.BACKUP backup; if (line.getData() == null) { backup = Message.BACKUP(line.getId(), line.getVersion(), null); } else { final ByteBuffer buffer = ByteBuffer.allocate(line.getData().limit()); // storage.allocateStorage(line.getData().limit()); line.rewind(); buffer.put(line.getData()); line.rewind(); buffer.flip(); backup = Message.BACKUP(line.getId(), line.getVersion(), buffer); } LOG.debug("Copying version {} of line {} data: {}", new Object[]{backup.getVersion(), hex(backup.getLine()), backup.getData() != null ? "(" + backup.getData().remaining() + " bytes)" : "null"}); return backup; } private void serverAck(Message message) { final BACKUP_PACKETACK ack = (BACKUP_PACKETACK) message; if (ack.getId() != lastSent.getId()) { LOG.warn("Received backup ack from server with id {} which is different from last sent: {}", ack.getId(), lastSent.getId()); return; } ack(true); } @Override public void slavesAck(long id) { if (lastSent == null) { LOG.warn("Received backup ack from slaves with id {} but lastSent is null", id); return; } if (id != lastSent.getId()) { LOG.warn("Received backup ack from slaves with id {} which is different from last sent: {}", id, lastSent.getId()); return; } ack(false); } @Override public void slavesInvAck(long id) { cache.receive(Message.INVACK(getCluster().getMyNodeId(), id)); } private void ack(boolean server) { LOG.debug("Ack {}", server ? "server" : "slaves"); BACKUP_PACKET packet = null; final BACKUP_PACKET _lastSent; currentBackupsLock.lock(); try { if (server && awaitSlaves) { awaitServer = false; return; } if (!server && awaitServer) { awaitSlaves = false; return; } _lastSent = lastSent; lastSent = null; awaitServer = false; awaitSlaves = false; if (shouldFlush) packet = flush1(); } finally { currentBackupsLock.unlock(); } for (BACKUP backup : _lastSent.getBackups()) cache.receive(Message.BACKUPACK((short) 0, backup.getLine(), backup.getVersion()).setIncoming()); if (packet != null) send(packet); } @Override public Iterator<BACKUP> iterOwned() { final Iterator<Cache.CacheLine> it = cache.ownedIterator(); return new Iterator<BACKUP>() { @Override public boolean hasNext() { return it.hasNext(); } @Override public BACKUP next() { final Cache.CacheLine line = it.next(); synchronized (line) { monitor.addReplicationBackup(1); return (BACKUP) Message.BACKUP(line.getId(), line.getVersion(), line.getData()).cloneDataBuffers(); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public void receive(Message message) { switch (message.getType()) { case BACKUP_PACKETACK: serverAck(message); break; case BACKUP_PACKET: if (getCluster().isMaster()) LOG.warn("Received backup packet while master: {}", message); else { monitor.addBackupPacket(); monitor.addBackups(((BACKUP_PACKET) message).getBackups().size()); handleReceivedBackupPacket((BACKUP_PACKET) message); } break; case BACKUP: if (getCluster().isMaster()) LOG.warn("Received backup while master: {}", message); else { monitor.addReplicationBackup(1); handleReceivedBackup((BACKUP) message); } break; case INV: if (getCluster().isMaster()) LOG.warn("Received INV while master: {}", message); else handleReceivedInvalidate((INV) message); break; default: } } private void handleReceivedBackupPacket(BACKUP_PACKET packet) { try { LOG.debug("Received backup packet: {}", packet); for (BACKUP backup : packet.getBackups()) cache.receive(backup); slaveComm.send(Message.BACKUP_PACKETACK(packet)); } catch (NodeNotFoundException e) { LOG.error("Exception while sending backup ack", e); } } private void handleReceivedBackup(BACKUP backup) { LOG.debug("Received replication backup: {}", backup); if (backup.getLine() < 0) { LOG.info("Slave node now ready! (completed replication)"); completedReplication = true; setReady(true); } else cache.receive(backup); } private void handleReceivedInvalidate(INV inv) { try { LOG.debug("Received inv: {}", inv); cache.receive(inv); slaveComm.send(Message.INVACK(inv)); } catch (NodeNotFoundException e) { throw new AssertionError(e); } } private static class BackupEntry { public final long id; public final long version; public BackupEntry(long id, long version) { this.id = id; this.version = version; } @Override public String toString() { return "BackupEntry{" + "id: " + Long.toHexString(id) + ", version: " + version + '}'; } } }