/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.export; import static com.google_voltpatches.common.base.Preconditions.checkNotNull; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.lang.management.ManagementFactory; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.concurrent.Callable; import java.util.concurrent.LinkedTransferQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import org.apache.hadoop_voltpatches.util.PureJavaCrc32; import org.json_voltpatches.JSONArray; import org.json_voltpatches.JSONException; import org.json_voltpatches.JSONObject; import org.json_voltpatches.JSONStringer; import org.voltcore.logging.Level; import org.voltcore.logging.VoltLogger; import org.voltcore.messaging.BinaryPayloadMessage; import org.voltcore.messaging.Mailbox; import org.voltcore.utils.CoreUtils; import org.voltcore.utils.DBBPool; import org.voltcore.utils.DBBPool.BBContainer; import org.voltcore.utils.Pair; import org.voltdb.VoltDB; import org.voltdb.VoltType; import org.voltdb.catalog.CatalogMap; import org.voltdb.catalog.Column; import org.voltdb.export.AdvertisedDataSource.ExportFormat; import org.voltdb.utils.CatalogUtil; import org.voltdb.utils.VoltFile; import com.google_voltpatches.common.base.Preconditions; import com.google_voltpatches.common.base.Throwables; import com.google_voltpatches.common.collect.ImmutableList; import com.google_voltpatches.common.io.Files; import com.google_voltpatches.common.util.concurrent.Futures; import com.google_voltpatches.common.util.concurrent.ListenableFuture; import com.google_voltpatches.common.util.concurrent.ListeningExecutorService; import com.google_voltpatches.common.util.concurrent.SettableFuture; /** * Allows an ExportDataProcessor to access underlying table queues */ public class ExportDataSource implements Comparable<ExportDataSource> { /** * Processors also log using this facility. */ private static final VoltLogger exportLog = new VoltLogger("EXPORT"); private final String m_database; private final String m_tableName; private String m_partitionColumnName = ""; private final String m_signature; private final byte [] m_signatureBytes; private final long m_generation; private final int m_partitionId; private final ExportFormat m_format; public final ArrayList<String> m_columnNames = new ArrayList<String>(); public final ArrayList<Integer> m_columnTypes = new ArrayList<Integer>(); public final ArrayList<Integer> m_columnLengths = new ArrayList<Integer>(); private long m_firstUnpolledUso = 0; private final StreamBlockQueue m_committedBuffers; private boolean m_endOfStream = false; private Runnable m_onDrain; private Runnable m_onMastership; private SettableFuture<BBContainer> m_pollFuture; private final AtomicReference<Pair<Mailbox, ImmutableList<Long>>> m_ackMailboxRefs = new AtomicReference<Pair<Mailbox,ImmutableList<Long>>>(Pair.of((Mailbox)null, ImmutableList.<Long>builder().build())); private final Semaphore m_bufferPushPermits = new Semaphore(16); private final int m_nullArrayLength; private long m_lastReleaseOffset = 0; private long m_lastAckUSO = 0; //This is for testing only. public static boolean m_dontActivateForTest = false; //Set if connector "replicated" property is set to true private boolean m_runEveryWhere = false; private boolean m_isMaster = false; private boolean m_replicaRunning = false; //This is released when all mailboxes are set. private final Semaphore m_allowAcceptingMastership = new Semaphore(0); private volatile boolean m_closed = false; private volatile AtomicBoolean m_mastershipAccepted = new AtomicBoolean(false); private volatile boolean m_replicaMastershipRequested = false; private volatile ListeningExecutorService m_executor; private final Integer m_executorLock = new Integer(0); private final LinkedTransferQueue<RunnableWithES> m_queuedActions = new LinkedTransferQueue<>(); private RunnableWithES m_firstAction = null; // Record the stacktrace of when this data source calls drain to help debug a race condition. private volatile Exception m_drainTraceForDebug = null; /** * Create a new data source. * @param db * @param tableName * @param isReplicated * @param partitionId * @param HSId * @param tableId * @param catalogMap */ public ExportDataSource( final Runnable onDrain, String db, String tableName, int partitionId, String signature, long generation, CatalogMap<Column> catalogMap, Column partitionColumn, String overflowPath ) throws IOException { checkNotNull( onDrain, "onDrain runnable is null"); m_format = ExportFormat.FOURDOTFOUR; m_generation = generation; m_onDrain = new Runnable() { @Override public void run() { try { onDrain.run(); } finally { m_onDrain = null; forwardAckToOtherReplicas(Long.MIN_VALUE); } } }; m_database = db; m_tableName = tableName; m_signature = signature; m_signatureBytes = m_signature.getBytes(StandardCharsets.UTF_8); PureJavaCrc32 crc = new PureJavaCrc32(); crc.update(m_signatureBytes); String nonce = m_tableName + "_" + crc.getValue() + "_" + partitionId; m_committedBuffers = new StreamBlockQueue(overflowPath, nonce); /* * This is not the catalog relativeIndex(). This ID incorporates * a catalog version and a table id so that it is constant across * catalog updates that add or drop tables. */ m_partitionId = partitionId; // Add the Export meta-data columns to the schema followed by the // catalog columns for this table. m_columnNames.add("VOLT_TRANSACTION_ID"); m_columnTypes.add(((int)VoltType.BIGINT.getValue())); m_columnLengths.add(8); m_columnNames.add("VOLT_EXPORT_TIMESTAMP"); m_columnTypes.add(((int)VoltType.BIGINT.getValue())); m_columnLengths.add(8); m_columnNames.add("VOLT_EXPORT_SEQUENCE_NUMBER"); m_columnTypes.add(((int)VoltType.BIGINT.getValue())); m_columnLengths.add(8); m_columnNames.add("VOLT_PARTITION_ID"); m_columnTypes.add(((int)VoltType.BIGINT.getValue())); m_columnLengths.add(8); m_columnNames.add("VOLT_SITE_ID"); m_columnTypes.add(((int)VoltType.BIGINT.getValue())); m_columnLengths.add(8); m_columnNames.add("VOLT_EXPORT_OPERATION"); m_columnTypes.add(((int)VoltType.TINYINT.getValue())); m_columnLengths.add(1); for (Column c : CatalogUtil.getSortedCatalogItems(catalogMap, "index")) { m_columnNames.add(c.getName()); m_columnTypes.add(c.getType()); m_columnLengths.add(c.getSize()); } if (partitionColumn != null) { m_partitionColumnName = partitionColumn.getName(); } File adFile = new VoltFile(overflowPath, nonce + ".ad"); exportLog.info("Creating ad for " + nonce); byte jsonBytes[] = null; try { JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.keySymbolValuePair("database", m_database); writeAdvertisementTo(stringer); stringer.endObject(); JSONObject jsObj = new JSONObject(stringer.toString()); jsonBytes = jsObj.toString(4).getBytes(StandardCharsets.UTF_8); } catch (JSONException e) { exportLog.error("Failed to Write ad file for " + nonce); Throwables.propagate(e); } try (FileOutputStream fos = new FileOutputStream(adFile)) { fos.write(jsonBytes); fos.getFD().sync(); } // compute the number of bytes necessary to hold one bit per // schema column m_nullArrayLength = ((m_columnTypes.size() + 7) & -8) >> 3; // This is not being loaded from file, so activate immediately if (!m_dontActivateForTest) { activate(); } } public ExportDataSource(final Runnable onDrain, File adFile, boolean isContinueingGeneration) throws IOException { /* * Certainly no more data coming if this is coming off of disk */ m_onDrain = new Runnable() { @Override public void run() { try { onDrain.run(); } finally { m_onDrain = null; forwardAckToOtherReplicas(Long.MIN_VALUE); } } }; String overflowPath = adFile.getParent(); byte data[] = Files.toByteArray(adFile); long hsid = -1; try { JSONObject jsObj = new JSONObject(new String(data, StandardCharsets.UTF_8)); long version = jsObj.getLong("adVersion"); if (version != 0) { throw new IOException("Unsupported ad file version " + version); } try { hsid = jsObj.getLong("hsId"); exportLog.info("Found old for export data source file ignoring m_HSId"); } catch (JSONException jex) { hsid = -1; } m_database = jsObj.getString("database"); m_generation = jsObj.getLong("generation"); m_partitionId = jsObj.getInt("partitionId"); m_signature = jsObj.getString("signature"); m_signatureBytes = m_signature.getBytes(StandardCharsets.UTF_8); m_tableName = jsObj.getString("tableName"); JSONArray columns = jsObj.getJSONArray("columns"); for (int ii = 0; ii < columns.length(); ii++) { JSONObject column = columns.getJSONObject(ii); m_columnNames.add(column.getString("name")); int columnType = column.getInt("type"); m_columnTypes.add(columnType); m_columnLengths.add(column.getInt("length")); } if (jsObj.has("format")) { m_format = ExportFormat.valueOf(jsObj.getString("format")); } else { m_format = ExportFormat.FOURDOTFOUR; } try { m_partitionColumnName = jsObj.getString("partitionColumnName"); } catch (Exception ex) { //Ignore these if we have a OLD ad file these may not exist. } } catch (JSONException e) { throw new IOException(e); } String nonce; PureJavaCrc32 crc = new PureJavaCrc32(); crc.update(m_signatureBytes); if (hsid == -1) { nonce = m_tableName + "_" + crc.getValue() + "_" + m_partitionId; } else { nonce = m_tableName + "_" + crc.getValue() + "_" + hsid + "_" + m_partitionId; } //If on disk generation matches catalog generation we dont do end of stream as it will be appended to. m_endOfStream = !isContinueingGeneration; m_committedBuffers = new StreamBlockQueue(overflowPath, nonce); // compute the number of bytes necessary to hold one bit per // schema column m_nullArrayLength = ((m_columnTypes.size() + 7) & -8) >> 3; } public void activate() { setupExecutor(); } public synchronized void updateAckMailboxes(final Pair<Mailbox, ImmutableList<Long>> ackMailboxes) { m_ackMailboxRefs.set( ackMailboxes); } private synchronized void releaseExportBytes(long releaseOffset) throws IOException { // if released offset is in an already-released past, just return success if (!m_committedBuffers.isEmpty() && releaseOffset < m_committedBuffers.peek().uso()) { return; } long lastUso = m_firstUnpolledUso; while (!m_committedBuffers.isEmpty() && releaseOffset >= m_committedBuffers.peek().uso()) { StreamBlock sb = m_committedBuffers.peek(); if (releaseOffset >= sb.uso() + sb.totalUso()) { m_committedBuffers.pop(); try { lastUso = sb.uso() + sb.totalUso(); } finally { sb.discard(); } } else if (releaseOffset >= sb.uso()) { sb.releaseUso(releaseOffset); lastUso = releaseOffset; break; } } m_lastReleaseOffset = releaseOffset; m_firstUnpolledUso = Math.max(m_firstUnpolledUso, lastUso); } public String getDatabase() { return m_database; } public String getTableName() { return m_tableName; } public String getSignature() { return m_signature; } public final int getPartitionId() { return m_partitionId; } public String getPartitionColumnName() { return m_partitionColumnName; } public final void writeAdvertisementTo(JSONStringer stringer) throws JSONException { stringer.keySymbolValuePair("adVersion", 0); stringer.keySymbolValuePair("generation", m_generation); stringer.keySymbolValuePair("partitionId", getPartitionId()); stringer.keySymbolValuePair("signature", m_signature); stringer.keySymbolValuePair("tableName", getTableName()); stringer.keySymbolValuePair("startTime", ManagementFactory.getRuntimeMXBean().getStartTime()); stringer.key("columns").array(); for (int ii=0; ii < m_columnNames.size(); ++ii) { stringer.object(); stringer.keySymbolValuePair("name", m_columnNames.get(ii)); stringer.keySymbolValuePair("type", m_columnTypes.get(ii)); stringer.keySymbolValuePair("length", m_columnLengths.get(ii)); stringer.endObject(); } stringer.endArray(); stringer.keySymbolValuePair("format", ExportFormat.FOURDOTFOUR.toString()); stringer.keySymbolValuePair("partitionColumnName", m_partitionColumnName); } /** * Compare two ExportDataSources for equivalence. This currently does not * compare column names, but it should once column add/drop is allowed. * This comparison is performed to decide if a datasource in a new catalog * needs to be passed to a proccessor. */ @Override public int compareTo(ExportDataSource o) { int result; result = m_database.compareTo(o.m_database); if (result != 0) { return result; } result = m_tableName.compareTo(o.m_tableName); if (result != 0) { return result; } result = (m_partitionId - o.m_partitionId); if (result != 0) { return result; } // does not verify replicated / unreplicated. // does not verify column names / schema return 0; } /** * Make sure equal objects compareTo as 0. */ @Override public boolean equals(Object o) { if (!(o instanceof ExportDataSource)) return false; return compareTo((ExportDataSource)o) == 0; } @Override public int hashCode() { // based on implementation of compareTo int result = 0; result += m_database.hashCode(); result += m_tableName.hashCode(); result += m_partitionId; // does not factor in replicated / unreplicated. // does not factor in column names / schema return result; } public long sizeInBytes() { try { ListeningExecutorService es = getExecutorService(); if (es==null) { return m_committedBuffers.sizeInBytes(); } else { return es.submit(new Callable<Long>() { @Override public Long call() throws Exception { return m_committedBuffers.sizeInBytes(); } }).get(); } } catch (RejectedExecutionException e) { return 0; } catch (IOException e){ // IOException is expected if the committed buffer was closed when stats are requested. assert e.getMessage().contains("has been closed") : e.getMessage(); exportLog.warn("IOException thrown while querying ExportDataSource.sizeInBytes(): " + e.getMessage()); return 0; } catch (Throwable t) { Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } } private void pushExportBufferImpl( long uso, ByteBuffer buffer, boolean sync, boolean endOfStream, boolean poll) throws Exception { final java.util.concurrent.atomic.AtomicBoolean deleted = new java.util.concurrent.atomic.AtomicBoolean(false); if (endOfStream) { assert(!m_endOfStream); assert(buffer == null); assert(!sync); m_endOfStream = endOfStream; if (m_committedBuffers.isEmpty()) { exportLog.info("Pushed EOS buffer with 0 bytes remaining"); if (m_pollFuture != null) { m_pollFuture.set(null); m_pollFuture = null; } if (m_onDrain != null) { m_drainTraceForDebug = new Exception("Push USO " + uso + " endOfStream " + endOfStream + " poll " + poll); m_onDrain.run(); } } else { exportLog.info("EOS for " + m_tableName + " partition " + m_partitionId + " with first unpolled uso " + m_firstUnpolledUso + " and remaining bytes " + m_committedBuffers.sizeInBytes()); } return; } assert(!m_endOfStream); if (buffer != null) { //There will be 8 bytes of no data that we can ignore, it is header space for storing //the USO in stream block if (buffer.capacity() > 8) { final BBContainer cont = DBBPool.wrapBB(buffer); if (m_lastReleaseOffset > 0 && m_lastReleaseOffset >= (uso + (buffer.capacity() - 8))) { //What ack from future is known? if (exportLog.isDebugEnabled()) { exportLog.debug("Dropping already acked USO: " + m_lastReleaseOffset + " Buffer info: " + uso + " Size: " + buffer.capacity()); } cont.discard(); return; } try { m_committedBuffers.offer(new StreamBlock( new BBContainer(buffer) { @Override public void discard() { checkDoubleFree(); cont.discard(); deleted.set(true); } }, uso, false)); } catch (IOException e) { VoltDB.crashLocalVoltDB("Unable to write to export overflow.", true, e); } } else { /* * TupleStreamWrapper::setBytesUsed propagates the USO by sending * over an empty stream block. The block will be deleted * on the native side when this method returns */ exportLog.info("Syncing first unpolled USO to " + uso + " for table " + m_tableName + " partition " + m_partitionId); m_firstUnpolledUso = uso; } } if (sync) { try { //Don't do a real sync, just write the in memory buffers //to a file. @Quiesce or blocking snapshot will do the sync m_committedBuffers.sync(true); } catch (IOException e) { VoltDB.crashLocalVoltDB("Unable to write to export overflow.", true, e); } } if (poll) { pollImpl(m_pollFuture); } } public void pushExportBuffer( final long uso, final ByteBuffer buffer, final boolean sync, final boolean endOfStream) { try { m_bufferPushPermits.acquire(); } catch (InterruptedException e) { Throwables.propagate(e); } ListeningExecutorService es = getExecutorService(); if (es == null) { //If we have not activated lets get the buffer in overflow and dont poll try { pushExportBufferImpl(uso, buffer, sync, endOfStream, false); } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error pushing export buffer", true, t); } finally { m_bufferPushPermits.release(); } return; } if (es.isShutdown()) { m_bufferPushPermits.release(); return; } try { es.execute((new Runnable() { @Override public void run() { try { if (!es.isShutdown()) { //Since we are part of active generation we poll too pushExportBufferImpl(uso, buffer, sync, endOfStream, true /* poll */); } } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error pushing export buffer", true, t); } finally { m_bufferPushPermits.release(); } } })); } catch (RejectedExecutionException rej) { m_bufferPushPermits.release(); //We are shutting down very much rolling generation so dont passup for error reporting. exportLog.info("Error pushing export buffer: ", rej); } } public long getGeneration() { return m_generation; } public ListenableFuture<?> truncateExportToTxnId(final long txnId) { RunnableWithES runnable = new RunnableWithES("truncateExportToTxnId") { @Override public void run() { try { m_committedBuffers.truncateToTxnId(txnId, m_nullArrayLength); if (m_committedBuffers.isEmpty() && m_endOfStream) { if (m_pollFuture != null) { m_pollFuture.set(null); m_pollFuture = null; } if (m_onDrain != null) { m_drainTraceForDebug = new Exception("Truncation txnId " + txnId); m_onDrain.run(); } } } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error while trying to truncate export to txnid " + txnId, true, t); } } }; //This is a setup task when stashed tasks are run this is run first. return stashOrSubmitTask(runnable, false, true); } private class SyncRunnable implements Runnable { private final boolean m_nofsync; SyncRunnable(final boolean nofsync) { this.m_nofsync = nofsync; } @Override public void run() { try { m_committedBuffers.sync(m_nofsync); } catch (IOException e) { exportLog.error("failed to sync export overflow", e); } } } public ListenableFuture<?> sync(final boolean nofsync) { RunnableWithES runnable = new RunnableWithES("sync") { @Override public void run() { new SyncRunnable(nofsync).run(); } }; return stashOrSubmitTask(runnable, false, false); } public boolean isClosed() { return m_closed; } public ListenableFuture<?> closeAndDelete() { m_closed = true; RunnableWithES runnable = new RunnableWithES("closeAndDelete") { @Override public void run() { try { m_committedBuffers.closeAndDelete(); m_ackMailboxRefs.set(null); } catch(IOException e) { exportLog.rateLimitedLog(60, Level.WARN, e, "Error closing commit buffers"); } finally { getLocalExecutorService().shutdown(); } } }; return stashOrSubmitTask(runnable, false, false); } public ListenableFuture<?> close() { m_closed = true; //If we are waiting at this allow to break out when close comes in. m_allowAcceptingMastership.release(); RunnableWithES runnable = new RunnableWithES("close") { @Override public void run() { try { m_committedBuffers.close(); m_ackMailboxRefs.set(null); } catch (IOException e) { exportLog.error(e.getMessage(), e); } finally { getLocalExecutorService().shutdown(); } } }; return stashOrSubmitTask(runnable, false, false); } public ListenableFuture<BBContainer> poll() { final SettableFuture<BBContainer> fut = SettableFuture.create(); RunnableWithES runnable = new RunnableWithES("poll") { @Override public void run() { try { /* * The poll is blocking through the future, shouldn't * call poll a second time until a response has been given * which nulls out the field */ if (m_pollFuture != null) { fut.setException(new RuntimeException("Should not poll more than once")); return; } if (!getLocalExecutorService().isShutdown()) { pollImpl(fut); } } catch (Exception e) { exportLog.error("Exception polling export buffer", e); } catch (Error e) { VoltDB.crashLocalVoltDB("Error polling export buffer", true, e); } } }; stashOrSubmitTask(runnable, true, false); return fut; } //If replica we poll from lowest of ack rcvd or last poll point. private long getFirstUnpolledUso() { if (m_isMaster) { return m_firstUnpolledUso; } return Math.min(m_lastAckUSO, m_firstUnpolledUso); } private void pollImpl(SettableFuture<BBContainer> fut) { if (fut == null) { return; } try { StreamBlock first_unpolled_block = null; if (m_endOfStream && m_committedBuffers.isEmpty()) { //Returning null indicates end of stream try { fut.set(null); } catch (RejectedExecutionException reex) { //We are closing source. } if (m_onDrain != null) { m_drainTraceForDebug = new Exception(); m_onDrain.run(); } return; } //Assemble a list of blocks to delete so that they can be deleted //outside of the m_committedBuffers critical section ArrayList<StreamBlock> blocksToDelete = new ArrayList<StreamBlock>(); //Inside this critical section do the work to find out //what block should be returned by the next poll. //Copying and sending the data will take place outside the critical section try { Iterator<StreamBlock> iter = m_committedBuffers.iterator(); long fuso = getFirstUnpolledUso(); while (iter.hasNext()) { StreamBlock block = iter.next(); // find the first block that has unpolled data if (fuso < block.uso() + block.totalUso()) { first_unpolled_block = block; m_firstUnpolledUso = (block.uso() + block.totalUso()); break; } else { blocksToDelete.add(block); iter.remove(); } } } catch (RuntimeException e) { if (e.getCause() instanceof IOException) { VoltDB.crashLocalVoltDB("Error attempting to find unpolled export data", true, e); } else { throw e; } } finally { //Try hard not to leak memory for (StreamBlock sb : blocksToDelete) { sb.discard(); } } //If there are no unpolled blocks return the firstUnpolledUSO with no data if (first_unpolled_block == null) { m_pollFuture = fut; } else { final AckingContainer ackingContainer = new AckingContainer(first_unpolled_block.unreleasedContainer(), first_unpolled_block.uso() + first_unpolled_block.totalUso()); try { fut.set(ackingContainer); } catch (RejectedExecutionException reex) { //We are closing source. ackingContainer.discard(); } m_pollFuture = null; if (m_drainTraceForDebug != null) { //Making this an ERROR. Looks like this is happening when ackImpl initiates drains and pollImpl is submitted to execute. exportLog.error("Rolling generation " + m_generation + " before it is fully drained. " + "Drain was called from " + Throwables.getStackTraceAsString(m_drainTraceForDebug)); } } } catch (Throwable t) { fut.setException(t); } } class AckingContainer extends BBContainer { final long m_uso; final BBContainer m_backingCont; public AckingContainer(BBContainer cont, long uso) { super(cont.b()); m_uso = uso; m_backingCont = cont; } @Override public void discard() { checkDoubleFree(); RunnableWithES runnable = new RunnableWithES("discard") { @Override public void run() { try { m_backingCont.discard(); try { if (!getLocalExecutorService().isShutdown()) { ackImpl(m_uso); } } finally { forwardAckToOtherReplicas(m_uso); } } catch (Exception e) { exportLog.error("Error acking export buffer", e); } catch (Error e) { VoltDB.crashLocalVoltDB("Error acking export buffer", true, e); } } }; stashOrSubmitTask(runnable, true, false); } } private void forwardAckToOtherReplicas(long uso) { if (m_runEveryWhere && m_replicaRunning) { //we dont forward if we are running as replica in replicated export return; } Pair<Mailbox, ImmutableList<Long>> p = m_ackMailboxRefs.get(); Mailbox mbx = p.getFirst(); if (mbx != null && p.getSecond().size() > 0) { // partition:int(4) + length:int(4) + // signaturesBytes.length + ackUSO:long(8) + 2 bytes for runEverywhere or not. final int msgLen = 4 + 4 + m_signatureBytes.length + 8 + 2; ByteBuffer buf = ByteBuffer.allocate(msgLen); buf.putInt(m_partitionId); buf.putInt(m_signatureBytes.length); buf.put(m_signatureBytes); buf.putLong(uso); buf.putShort((m_runEveryWhere ? (short )1 : (short )0)); BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], buf.array()); for( Long siteId: p.getSecond()) { mbx.send(siteId, bpm); } } } public void ack(final long uso, boolean runEveryWhere) { // If I am not master and run everywhere connector and I get ack to start replicating....do so and become a exporting replica. if (m_runEveryWhere && !m_isMaster && runEveryWhere) { //These are single threaded so no need to lock. m_lastAckUSO = uso; m_replicaMastershipRequested = true; if (!m_replicaRunning) { m_isMaster = false; m_replicaRunning = acceptMastership(); //If we didnt accept mastership we will depend on next ack to accept. if (m_replicaRunning) { exportLog.info("Export generation " + getGeneration() + " accepting mastership for " + getTableName() + " partition " + getPartitionId() + " as replica"); } } return; } //In replicated only master will be doing this. RunnableWithES runnable = new RunnableWithES("ack") { @Override public void run() { try { if (!getLocalExecutorService().isShutdown()) { ackImpl(uso); } } catch (Exception e) { exportLog.error("Error acking export buffer", e); } catch (Error e) { VoltDB.crashLocalVoltDB("Error acking export buffer", true, e); } } }; stashOrSubmitTask(runnable, true, false); } private void ackImpl(long uso) { if (uso == Long.MIN_VALUE && m_onDrain != null) { m_drainTraceForDebug = new Exception("Acking USO " + uso); m_onDrain.run(); return; } //Process the ack if any and add blocks to the delete list or move the released USO pointer if (uso > 0) { try { releaseExportBytes(uso); } catch (IOException e) { VoltDB.crashLocalVoltDB("Error attempting to release export bytes", true, e); return; } } } /** * Returns if replica was running. * @return */ public boolean setMaster() { exportLog.info("Setting master for partition: " + getPartitionId() + " Table " + getTableName() + " Replica running " + m_replicaRunning); m_isMaster = true; boolean rval = m_replicaRunning; m_replicaRunning = false; return rval; } //Is this a run everywhere source public boolean isRunEveryWhere() { return m_runEveryWhere; } /** * Trigger an execution of the mastership runnable by the associated * executor service */ public synchronized boolean acceptMastership() { if (m_onMastership == null) { exportLog.info("Mastership Runnable not yet set " + getGeneration() + " Table " + getTableName() + " partition " + getPartitionId()); return false; } if (m_mastershipAccepted.get()) { exportLog.info("Export generation " + getGeneration() + " Table " + getTableName() + " mastership already accepted for partition " + getPartitionId()); return true; } exportLog.info("Accepting mastership for export generation " + getGeneration() + " Table " + getTableName() + " partition " + getPartitionId()); RunnableWithES runnable = new RunnableWithES("acceptMastership") { @Override public void run() { try { if (!getLocalExecutorService().isShutdown() || !m_closed) { exportLog.info("Export generation " + getGeneration() + " Table " + getTableName() + " accepting mastership for partition " + getPartitionId()); if (m_onMastership != null) { if (m_mastershipAccepted.compareAndSet(false, true)) { m_onMastership.run(); } } } } catch (Exception e) { exportLog.error("Error in accepting mastership", e); } } }; stashOrSubmitTask(runnable, true, false); return m_mastershipAccepted.get(); } /** * set the runnable task that is to be executed on mastership designation * @param toBeRunOnMastership a {@link @Runnable} task */ public void setOnMastership(Runnable toBeRunOnMastership) { Preconditions.checkNotNull(toBeRunOnMastership, "mastership runnable is null"); m_onMastership = toBeRunOnMastership; if (m_replicaMastershipRequested) { acceptMastership(); } } public ExportFormat getExportFormat() { return m_format; } //Set it from client. public void setRunEveryWhere(boolean runEveryWhere) { m_runEveryWhere = runEveryWhere; } private ListenableFuture<?> stashOrSubmitTask(RunnableWithES runnable, final boolean callExecute, final boolean setupTask) { if (m_executor==null) { synchronized (m_executorLock) { if (m_executor==null) { // Bound the queue. It shouldn't get to this high value. // Log an error so that we know if it does get to the high value. if (m_queuedActions.size() > 50) { StringBuilder builder = new StringBuilder(); builder.append("Export task queue is filled up to: " + m_queuedActions.size()); builder.append(". Not queueing anymore events beyond 50 for generation " + m_generation); builder.append(" and table " + m_tableName + ". The queue contains the following tasks:\n"); for (RunnableWithES queuedR : m_queuedActions) { builder.append(queuedR.getTaskName() + "\t"); } exportLog.warn(builder.toString()); return Futures.immediateFuture(null); } if (setupTask) { m_firstAction = runnable; } else { m_queuedActions.add(runnable); } return Futures.immediateFuture(null); } } } // If we got here executor is not null and this generation is active runnable.setExecutorService(m_executor); if (m_executor.isShutdown()) { return Futures.immediateFuture(null); } if (callExecute) { m_executor.execute(runnable); return Futures.immediateFuture(null); } else { return m_executor.submit(runnable); } } public void setupExecutor() { if (m_executor!=null) { return; } synchronized(m_executorLock) { if (m_executor==null) { ListeningExecutorService es = CoreUtils.getListeningExecutorService( "ExportDataSource gen " + m_generation + " table " + m_tableName + " partition " + m_partitionId, 1); //If we have a truncate task do that first. if (m_firstAction != null) { exportLog.info("Submitting truncate task for ExportDataSource gen " + m_generation + " table " + m_tableName + " partition " + m_partitionId); es.submit(m_firstAction); } if (m_queuedActions.size()>0) { for (RunnableWithES queuedR : m_queuedActions) { queuedR.setExecutorService(es); es.submit(queuedR); } m_queuedActions.clear(); } m_executor = es; } } } public ListeningExecutorService getExecutorService() { return m_executor; } private abstract class RunnableWithES implements Runnable { private final String m_taskName; private ListeningExecutorService m_executorService; public RunnableWithES(String taskName){ m_taskName = taskName; } public void setExecutorService(ListeningExecutorService executorService) { m_executorService = executorService; } public ListeningExecutorService getLocalExecutorService() { return m_executorService; } public String getTaskName() { return m_taskName; } } }