/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb; import java.nio.ByteBuffer; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.voltcore.logging.VoltLogger; import org.voltcore.network.Connection; import org.voltdb.iv2.MpInitiator; import com.google_voltpatches.common.collect.ImmutableMap; import com.google_voltpatches.common.collect.ImmutableMap.Builder; /** * This manages per-partition handles used to identify responses for * work done in IV2. Since the work generated for a partition at each client interface * (treating multi-part work as a separate partition) is deterministically * ordered and completed, we can use the per-partition lists to determine which * transactions have been dropped due to faults and potentially report that * back to the client. */ public class ClientInterfaceHandleManager { private static final VoltLogger tmLog = new VoltLogger("TM"); static final long READ_BIT = 1L << 63; //Add an extra bit so compared to the 14-bits in txnids so there //can be a short circuit read partition id static final int PART_ID_BITS = 15; static final int MP_PART_ID = (1 << (PART_ID_BITS - 1)) - 1; static final int SHORT_CIRCUIT_PART_ID = MP_PART_ID + 1; static final long PART_ID_SHIFT = 48; static final long SEQNUM_MAX = (1L << PART_ID_SHIFT) - 1L; private long m_outstandingTxns; public final boolean isAdmin; public final Connection connection; public final ClientInterfaceRepairCallback repairCallback; private final long m_expectedThreadId = Thread.currentThread().getId(); final AdmissionControlGroup m_acg; private volatile boolean m_wantsTopologyUpdates = false; private HandleGenerator m_shortCircuitHG = new HandleGenerator(SHORT_CIRCUIT_PART_ID); private final Map<Long, Iv2InFlight> m_shortCircuitReads = new HashMap<Long, Iv2InFlight>(); private static class HandleGenerator { private long m_sequence = 0; final private long m_partitionId; HandleGenerator(int partitionId) { m_partitionId = partitionId; } public long getNextHandle() { if (m_sequence > SEQNUM_MAX) { m_sequence = 0; } return ((m_partitionId << PART_ID_SHIFT) | m_sequence++); } } public static int getPartIdFromHandle(long handle) { return (int)((handle >> PART_ID_SHIFT) & MP_PART_ID); } public static long getSeqNumFromHandle(long handle) { return handle & SEQNUM_MAX; } public static String handleToString(long handle) { return "(pid " + getPartIdFromHandle(handle) + " seq " + getSeqNumFromHandle(handle) + ")"; } static class Iv2InFlight { final long m_ciHandle; final long m_clientHandle; final int m_messageSize; final long m_creationTimeNanos; final String m_procName; final long m_initiatorHSId; Iv2InFlight(long ciHandle, long clientHandle, int messageSize, long creationTimeNanos, String procName, long initiatorHSId) { m_ciHandle = ciHandle; m_clientHandle = clientHandle; m_messageSize = messageSize; m_creationTimeNanos = creationTimeNanos; m_procName = procName; m_initiatorHSId = initiatorHSId; } } static class PartitionData { private final HandleGenerator m_generator; private final Deque<Iv2InFlight> m_reads = new ArrayDeque<Iv2InFlight>(); private final Deque<Iv2InFlight> m_writes = new ArrayDeque<Iv2InFlight>(); private PartitionData(int partitionId) { m_generator = new HandleGenerator(partitionId); } } private ImmutableMap<Integer, PartitionData> m_partitionStuff = new Builder<Integer, PartitionData>().build(); ClientInterfaceHandleManager(boolean isAdmin, Connection connection, ClientInterfaceRepairCallback repairCallback, AdmissionControlGroup acg) { this.isAdmin = isAdmin; this.connection = connection; this.repairCallback = repairCallback; m_acg = acg; } /** * Factory to make a threadsafe version of CIHM. This is used * exclusively by some internal CI adapters that don't have * the natural thread-safety protocol/design of VoltNetwork. */ public static ClientInterfaceHandleManager makeThreadSafeCIHM( boolean isAdmin, Connection connection, ClientInterfaceRepairCallback callback, AdmissionControlGroup acg) { return new ClientInterfaceHandleManager(isAdmin, connection, callback, acg) { @Override synchronized long getHandle(boolean isSinglePartition, int partitionId, long clientHandle, int messageSize, long creationTimeNanos, String procName, long initiatorHSId, boolean readOnly, boolean isShortCircuitRead) { return super.getHandle(isSinglePartition, partitionId, clientHandle, messageSize, creationTimeNanos, procName, initiatorHSId, readOnly, isShortCircuitRead); } @Override synchronized Iv2InFlight findHandle(long ciHandle) { return super.findHandle(ciHandle); } @Override synchronized Iv2InFlight removeHandle(long ciHandle) { return super.removeHandle(ciHandle); } @Override synchronized long getOutstandingTxns() { return super.getOutstandingTxns(); } @Override synchronized void freeOutstandingTxns() { super.freeOutstandingTxns(); } @Override synchronized List<Iv2InFlight> removeHandlesForPartitionAndInitiator(Integer partitionId, Long initiatorHSId) { return super.removeHandlesForPartitionAndInitiator(partitionId, initiatorHSId); } @Override synchronized boolean shouldCheckThreadIdAssertion() { return false; } }; } /** * Create a new handle for a transaction and store the client information * for that transaction in the internal structures. * ClientInterface handles have the partition ID encoded in them as the 10 * high-order non-sign bits (where the MP partition ID is the max value), * and a 53 bit sequence number in the low 53 bits. */ long getHandle( boolean isSinglePartition, int partitionId, long clientHandle, int messageSize, long creationTimeNanos, String procName, long initiatorHSId, boolean readOnly, boolean isShortCircuitRead) { assert(!shouldCheckThreadIdAssertion() || m_expectedThreadId == Thread.currentThread().getId()); if (!isSinglePartition) { partitionId = MP_PART_ID; } PartitionData partitionStuff = m_partitionStuff.get(partitionId); if (partitionStuff == null) { partitionStuff = new PartitionData(partitionId); m_partitionStuff = new Builder<Integer, PartitionData>(). putAll(m_partitionStuff). put(partitionId, partitionStuff).build(); } long ciHandle = isShortCircuitRead ? m_shortCircuitHG.getNextHandle() : partitionStuff.m_generator.getNextHandle(); Iv2InFlight inFlight = new Iv2InFlight(ciHandle, clientHandle, messageSize, creationTimeNanos, procName, initiatorHSId); if (isShortCircuitRead) { /* * Short circuit reads don't use a handle that is partition specific * because ordering doesn't really matter since it isn't used for failure handling * because the read is local to this process */ m_shortCircuitReads.put(ciHandle, inFlight); } else { /* * Reads are not ordered with writes, writes might block due to command logging * so track them separately because they will come back in mixed order */ if (readOnly) { /* * Encode the read only-ness into the handle */ ciHandle = setReadBit(ciHandle); partitionStuff.m_reads.offer(inFlight); } else { partitionStuff.m_writes.offer(inFlight); } } m_outstandingTxns++; m_acg.increaseBackpressure(messageSize); return ciHandle; } private static boolean getReadBit(long handle) { return (handle & READ_BIT) != 0; } private static long unsetReadBit(long handle) { return handle & ~READ_BIT; } private static long setReadBit(long handle) { return (handle |= READ_BIT); } /** * Retrieve the client information for the specified handle */ Iv2InFlight findHandle(long ciHandle) { assert(!shouldCheckThreadIdAssertion() || m_expectedThreadId == Thread.currentThread().getId()); //Check read only encoded bit final boolean readOnly = getReadBit(ciHandle); //Remove read only encoding so comparison works ciHandle = unsetReadBit(ciHandle); /* * Check for a short circuit read */ Iv2InFlight inflight = m_shortCircuitReads.remove(ciHandle); if (inflight != null) { m_acg.reduceBackpressure(inflight.m_messageSize); m_outstandingTxns--; return inflight; } /* * Not a short circuit read, check the partition specific * queue of handles */ int partitionId = getPartIdFromHandle(ciHandle); PartitionData partitionStuff = m_partitionStuff.get(partitionId); if (partitionStuff == null) { // whoa, bad tmLog.error("Unable to find handle list for partition: " + partitionId); return null; } final Deque<Iv2InFlight> perPartDeque = readOnly ? partitionStuff.m_reads : partitionStuff.m_writes; while (perPartDeque.peekFirst() != null) { Iv2InFlight inFlight = perPartDeque.pollFirst(); if (inFlight.m_ciHandle < ciHandle) { // lost txn, do something eventually tmLog.debug("CI found dropped transaction with handle: " + inFlight.m_ciHandle + " for partition: " + partitionId + " while searching for handle " + ciHandle); ClientResponseImpl errorResponse = new ClientResponseImpl( ClientResponseImpl.RESPONSE_UNKNOWN, new VoltTable[0], "Transaction dropped during fault recovery", inFlight.m_clientHandle); ByteBuffer buf = ByteBuffer.allocate(errorResponse.getSerializedSize() + 4); buf.putInt(buf.capacity() - 4); errorResponse.flattenToBuffer(buf); buf.flip(); connection.writeStream().enqueue(buf); m_outstandingTxns--; m_acg.reduceBackpressure(inFlight.m_messageSize); } else if (inFlight.m_ciHandle > ciHandle) { // we've gone too far, need to jam this back into the front of the deque and run away. tmLog.debug("CI clientData lookup missing handle: " + ciHandle + ". Next expected client data handle is: " + inFlight.m_ciHandle); perPartDeque.addFirst(inFlight); break; } else { m_acg.reduceBackpressure(inFlight.m_messageSize); m_outstandingTxns--; return inFlight; } } tmLog.debug("Unable to find Client data for client interface handle: " + ciHandle); return null; } /** Remove a specific handle without destroying any handles ordered before it */ Iv2InFlight removeHandle(long ciHandle) { assert(!shouldCheckThreadIdAssertion() || m_expectedThreadId == Thread.currentThread().getId()); //Check read only encoded bit final boolean readOnly = getReadBit(ciHandle); //Remove read only encoding so comparison works ciHandle = unsetReadBit(ciHandle); // Shouldn't see any reads in this path, since the whole point of this // method is to remove writes during replay which aren't going to get // done. However, this is logically correct, so go ahead and allow it. Iv2InFlight inflight = m_shortCircuitReads.remove(ciHandle); if (inflight != null) { m_acg.reduceBackpressure(inflight.m_messageSize); m_outstandingTxns--; return inflight; } /* * Not a short circuit read, check the partition specific * queue of handles */ int partitionId = getPartIdFromHandle(ciHandle); PartitionData partitionStuff = m_partitionStuff.get(partitionId); if (partitionStuff == null) { // whoa, bad tmLog.error("Unable to find handle list for partition: " + partitionId); return null; } final Deque<Iv2InFlight> perPartDeque = readOnly ? partitionStuff.m_reads : partitionStuff.m_writes; Iterator<Iv2InFlight> iter = perPartDeque.iterator(); while (iter.hasNext()) { Iv2InFlight inFlight = iter.next(); if (inFlight.m_ciHandle > ciHandle) { // we've gone too far, this handle doesn't exist tmLog.error("CI clientData lookup for remove missing handle: " + ciHandle + ". Next expected client data handle is: " + inFlight.m_ciHandle); break; } else if (inFlight.m_ciHandle == ciHandle) { m_acg.reduceBackpressure(inFlight.m_messageSize); m_outstandingTxns--; iter.remove(); return inFlight; } } tmLog.error("Unable to find Client data to remove client interface handle: " + ciHandle); return null; } /** Return a map of ConnectionId::(adminmode, txn count) */ long getOutstandingTxns() { return m_outstandingTxns; } /** * When a connection goes away, free all resources held by that connection * This opens a small window of opportunity for mischief in that work may * still be outstanding in the cluster, but once the client goes away so does * does the mapping to the resources allocated to it. */ void freeOutstandingTxns() { assert(!shouldCheckThreadIdAssertion() || m_expectedThreadId == Thread.currentThread().getId()); for (PartitionData pd : m_partitionStuff.values()) { for (Iv2InFlight inflight : pd.m_reads) { m_outstandingTxns--; m_acg.reduceBackpressure(inflight.m_messageSize); } for (Iv2InFlight inflight : pd.m_writes) { m_outstandingTxns--; m_acg.reduceBackpressure(inflight.m_messageSize); } } for (Iv2InFlight inflight : m_shortCircuitReads.values()) { m_outstandingTxns--; m_acg.reduceBackpressure(inflight.m_messageSize); } } List<Iv2InFlight> removeHandlesForPartitionAndInitiator(Integer partitionId, Long initiatorHSId) { assert(!shouldCheckThreadIdAssertion() || m_expectedThreadId == Thread.currentThread().getId()); List<Iv2InFlight> retval = new ArrayList<Iv2InFlight>(); if (!m_partitionStuff.containsKey(partitionId)) return retval; /* * First clear the pending reads */ PartitionData partitionStuff = m_partitionStuff.get(partitionId); Deque<Iv2InFlight> inFlight = partitionStuff.m_reads; Iterator<Iv2InFlight> i = inFlight.iterator(); while (i.hasNext()) { Iv2InFlight entry = i.next(); if (entry.m_initiatorHSId != initiatorHSId) { i.remove(); retval.add(entry); m_outstandingTxns--; m_acg.reduceBackpressure(entry.m_messageSize); } } /* * MP short circuit reads can be remote, which necessitate repair */ if (partitionId == MpInitiator.MP_INIT_PID) { Iterator<Map.Entry<Long, Iv2InFlight>> itr = m_shortCircuitReads.entrySet().iterator(); while (itr.hasNext()) { Map.Entry<Long, Iv2InFlight> e = itr.next(); Iv2InFlight entry = e.getValue(); if (entry.m_initiatorHSId != initiatorHSId) { itr.remove(); retval.add(entry); m_outstandingTxns--; m_acg.reduceBackpressure(entry.m_messageSize); } } } /* * Then clear the pending writes */ inFlight = partitionStuff.m_writes; i = inFlight.iterator(); while (i.hasNext()) { Iv2InFlight entry = i.next(); if (entry.m_initiatorHSId != initiatorHSId) { i.remove(); retval.add(entry); m_outstandingTxns--; m_acg.reduceBackpressure(entry.m_messageSize); } } return retval; } // Coward's way out...the thread-safe override of this class will return false for this, // which will enable us to keep the thread ID assertions in all of the method calls and // not bomb when using the thread-safe version. boolean shouldCheckThreadIdAssertion() { return true; } public void setWantsTopologyUpdates(boolean wantsTopologyUpdates) { m_wantsTopologyUpdates = wantsTopologyUpdates; } public boolean wantsTopologyUpdates() { return m_wantsTopologyUpdates; } }