/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.iv2; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.TreeSet; import java.util.concurrent.Future; import org.voltcore.logging.VoltLogger; import org.voltcore.messaging.VoltMessage; import org.voltcore.utils.CoreUtils; import org.voltdb.messaging.Iv2RepairLogRequestMessage; import org.voltdb.messaging.Iv2RepairLogResponseMessage; import com.google_voltpatches.common.util.concurrent.SettableFuture; public class SpPromoteAlgo implements RepairAlgo { VoltLogger tmLog = new VoltLogger("TM"); private final String m_whoami; private final InitiatorMailbox m_mailbox; private final long m_requestId = System.nanoTime(); private final List<Long> m_survivors; private long m_maxSeenTxnId; // Each Term can process at most one promotion; if promotion fails, make // a new Term and try again (if that's your big plan...) private final SettableFuture<RepairResult> m_promotionResult = SettableFuture.create(); long getRequestId() { return m_requestId; } // scoreboard for responding replica repair log responses (hsid -> response count) static class ReplicaRepairStruct { int m_receivedResponses = 0; int m_expectedResponses = -1; // (a log msg cares about this init. value) long m_maxSpHandleSeen = Long.MIN_VALUE; // update counters and return the number of outstanding messages. boolean update(Iv2RepairLogResponseMessage response) { m_receivedResponses++; m_expectedResponses = response.getOfTotal(); m_maxSpHandleSeen = Math.max(m_maxSpHandleSeen, response.getHandle()); return logsComplete(); } boolean logsComplete() { return (m_expectedResponses - m_receivedResponses) == 0; } // return true if this replica needs the message for spHandle. boolean needs(long spHandle) { return m_maxSpHandleSeen < spHandle; } } // replicas being processed and repaired. Map<Long, ReplicaRepairStruct> m_replicaRepairStructs = new HashMap<Long, ReplicaRepairStruct>(); // Determine equal repair responses by the SpHandle of the response. Comparator<Iv2RepairLogResponseMessage> m_unionComparator = new Comparator<Iv2RepairLogResponseMessage>() { @Override public int compare(Iv2RepairLogResponseMessage o1, Iv2RepairLogResponseMessage o2) { if (o1.getHandle() < o2.getHandle()) { return -1; } else if (o1.getHandle() > o2.getHandle()) { return 1; } return 0; } }; // Union of repair responses. TreeSet<Iv2RepairLogResponseMessage> m_repairLogUnion = new TreeSet<Iv2RepairLogResponseMessage>(m_unionComparator); /** * Setup a new RepairAlgo but don't take any action to take responsibility. */ public SpPromoteAlgo(List<Long> survivors, InitiatorMailbox mailbox, String whoami, int partitionId) { m_mailbox = mailbox; m_survivors = survivors; m_whoami = whoami; m_maxSeenTxnId = TxnEgo.makeZero(partitionId).getTxnId(); } @Override public Future<RepairResult> start() { try { prepareForFaultRecovery(); } catch (Exception e) { tmLog.error(m_whoami + "failed leader promotion:", e); m_promotionResult.setException(e); } return m_promotionResult; } @Override public boolean cancel() { return m_promotionResult.cancel(false); } /** Start fixing survivors: setup scoreboard and request repair logs. */ void prepareForFaultRecovery() { for (Long hsid : m_survivors) { m_replicaRepairStructs.put(hsid, new ReplicaRepairStruct()); } tmLog.info(m_whoami + "found (including self) " + m_survivors.size() + " surviving replicas to repair. " + " Survivors: " + CoreUtils.hsIdCollectionToString(m_survivors)); VoltMessage logRequest = new Iv2RepairLogRequestMessage(m_requestId, Iv2RepairLogRequestMessage.SPREQUEST); m_mailbox.send(com.google_voltpatches.common.primitives.Longs.toArray(m_survivors), logRequest); } /** Process a new repair log response */ @Override public void deliver(VoltMessage message) { if (message instanceof Iv2RepairLogResponseMessage) { Iv2RepairLogResponseMessage response = (Iv2RepairLogResponseMessage)message; if (response.getRequestId() != m_requestId) { tmLog.debug(m_whoami + "rejecting stale repair response." + " Current request id is: " + m_requestId + " Received response for request id: " + response.getRequestId()); return; } ReplicaRepairStruct rrs = m_replicaRepairStructs.get(response.m_sourceHSId); if (rrs.m_expectedResponses < 0) { tmLog.debug(m_whoami + "collecting " + response.getOfTotal() + " repair log entries from " + CoreUtils.hsIdToString(response.m_sourceHSId)); } // Long.MAX_VALUE has rejoin semantics if (response.getHandle() != Long.MAX_VALUE) { m_maxSeenTxnId = Math.max(m_maxSeenTxnId, response.getHandle()); } if (response.getPayload() != null) { m_repairLogUnion.add(response); if (tmLog.isTraceEnabled()) { tmLog.trace(m_whoami + " collected from " + CoreUtils.hsIdToString(response.m_sourceHSId) + ", message: " + response.getPayload()); } } if (rrs.update(response)) { tmLog.debug(m_whoami + "collected " + rrs.m_receivedResponses + " responses for " + rrs.m_expectedResponses + " repair log entries from " + CoreUtils.hsIdToString(response.m_sourceHSId)); if (areRepairLogsComplete()) { repairSurvivors(); } } } } /** Have all survivors supplied a full repair log? */ public boolean areRepairLogsComplete() { for (Entry<Long, ReplicaRepairStruct> entry : m_replicaRepairStructs.entrySet()) { if (!entry.getValue().logsComplete()) { return false; } } return true; } /** Send missed-messages to survivors. */ public void repairSurvivors() { // cancel() and repair() must be synchronized by the caller (the deliver lock, // currently). If cancelled and the last repair message arrives, don't send // out corrections! if (this.m_promotionResult.isCancelled()) { tmLog.debug(m_whoami + "Skipping repair message creation for cancelled Term."); return; } int queued = 0; tmLog.debug(m_whoami + "received all repair logs and is repairing surviving replicas."); for (Iv2RepairLogResponseMessage li : m_repairLogUnion) { List<Long> needsRepair = new ArrayList<Long>(5); for (Entry<Long, ReplicaRepairStruct> entry : m_replicaRepairStructs.entrySet()) { if (entry.getValue().needs(li.getHandle())) { ++queued; tmLog.debug(m_whoami + "repairing " + CoreUtils.hsIdToString(entry.getKey()) + ". Max seen " + entry.getValue().m_maxSpHandleSeen + ". Repairing with " + li.getHandle()); needsRepair.add(entry.getKey()); } } if (!needsRepair.isEmpty()) { if (tmLog.isTraceEnabled()) { tmLog.trace(m_whoami + "repairing: " + CoreUtils.hsIdCollectionToString(needsRepair) + " with message: " + li.getPayload()); } m_mailbox.repairReplicasWith(needsRepair, li.getPayload()); } } tmLog.debug(m_whoami + "finished queuing " + queued + " replica repair messages."); m_promotionResult.set(new RepairResult(m_maxSeenTxnId)); } }