/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeoutException;

import javax.management.MBeanServer;
import javax.management.ObjectName;

import static com.google.common.base.Charsets.UTF_8;

import org.apache.commons.lang.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.gms.ApplicationState;
import org.apache.cassandra.gms.FailureDetector;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.*;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;

import org.cliffc.high_scale_lib.NonBlockingHashSet;

/**
 * For each endpoint for which we have hints, there is a row in the system hints CF.
 * The key for this row is ByteBuffer.wrap(string), e.g. "127.0.0.1".
 *
 * SuperColumns in that row are keys for which we have hinted data.
 * Subcolumn names within that supercolumn are keyspace+CF, concatenated with SEPARATOR.
 * Subcolumn values are always empty; instead, we store the row data "normally"
 * in the application table it belongs in.
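 *
 * For example, a hint row might look like this (a sketch of the layout described
 * above, not the literal serialized bytes; names are hypothetical):
 *
 *   "127.0.0.1" : {                        // row key: the down endpoint
 *       <hinted row key> : {               // supercolumn: key of the hinted row
 *           "Keyspace1-Standard1" : ""     // subcolumn: keyspace + SEPARATOR + CF, empty value
 *       }
 *   }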
 *
 * When FailureDetector signals that a node that was down is back up, we read its
 * hints row to see which rows we need to forward data for, then read each row in its
 * entirety and send it over.
 *
 * deliverHints is also exposed to JMX so it can be run manually if FD ever misses
 * its cue somehow.
 *
 * HHM never deletes the row from application tables; usually (but not for CL.ANY!)
 * the row belongs on this node, as well.  Instead, we rely on cleanup compactions
 * to remove data that doesn't belong.  (Cleanup compactions may be started manually
 * -- on a per-node basis -- with "nodeprobe cleanup".)
 *
 * TODO this keeps our hint rows from growing excessively large by offloading the
 * message data into application tables.  But this means that cleanup compactions
 * will nuke HH data.  Probably better would be to store the RowMutation messages
 * in a HHData (non-super) CF, modifying the above to store a UUID value in the
 * HH subcolumn value, which we use as a key to a [standard] HHData system CF
 * that would contain the message bytes.
 */
public class HintedHandOffManager implements HintedHandOffManagerMBean
{
    public static final HintedHandOffManager instance = new HintedHandOffManager();
    public static final String HINTS_CF = "HintsColumnFamily";

    private static final Logger logger_ = LoggerFactory.getLogger(HintedHandOffManager.class);
    private static final int PAGE_SIZE = 10000;
    private static final String SEPARATOR = "-";
    private static final int LARGE_NUMBER = 65536; // 64k nodes ought to be enough for anybody.

    private final NonBlockingHashSet<InetAddress> queuedDeliveries = new NonBlockingHashSet<InetAddress>();
    private final ExecutorService executor_ =
        new JMXEnabledThreadPoolExecutor("HintedHandoff", DatabaseDescriptor.getCompactionThreadPriority());

    public HintedHandOffManager()
    {
        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
        try
        {
            mbs.registerMBean(this, new ObjectName("org.apache.cassandra.db:type=HintedHandoffManager"));
        }
        catch (Exception e)
        {
            throw new RuntimeException(e);
        }
    }

    public void registerMBean()
    {
        logger_.debug("Created HHOM instance, registered MBean.");
    }

    private static boolean sendMessage(InetAddress endpoint, String tableName, String cfName, ByteBuffer key) throws IOException
    {
        if (!Gossiper.instance.isKnownEndpoint(endpoint))
        {
            logger_.warn("Hints found for endpoint " + endpoint + " which is not part of the gossip network.  Discarding.");
            return true;
        }
        if (!FailureDetector.instance.isAlive(endpoint))
        {
            return false;
        }

        Table table = Table.open(tableName);
        DecoratedKey dkey = StorageService.getPartitioner().decorateKey(key);
        ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
        ByteBuffer startColumn = ByteBufferUtil.EMPTY_BYTE_BUFFER;
        while (true)
        {
            QueryFilter filter = QueryFilter.getSliceFilter(dkey,
                                                            new QueryPath(cfs.getColumnFamilyName()),
                                                            startColumn,
                                                            ByteBufferUtil.EMPTY_BYTE_BUFFER,
                                                            false,
                                                            PAGE_SIZE);
            ColumnFamily cf = cfs.getColumnFamily(filter);
            if (pagingFinished(cf, startColumn))
                break;
            if (cf.getColumnNames().isEmpty())
            {
                logger_.debug("Nothing to hand off for {}", dkey);
                break;
            }

            startColumn = cf.getColumnNames().last();
            RowMutation rm = new RowMutation(tableName, key);
            rm.add(cf);
            Message message = rm.makeRowMutationMessage();
            IWriteResponseHandler responseHandler = WriteResponseHandler.create(endpoint);
            MessagingService.instance().sendRR(message, endpoint, responseHandler);
            try
            {
                responseHandler.get();
            }
            catch (TimeoutException e)
            {
                return false;
            }

            try
            {
                Thread.sleep(DatabaseDescriptor.getHintedHandoffThrottleDelay());
            }
            catch (InterruptedException e)
            {
                throw new AssertionError(e);
            }
        }
        return true;
    }

    private static void deleteHintKey(ByteBuffer endpointAddress, ByteBuffer key, ByteBuffer tableCF, long timestamp) throws IOException
    {
        RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, endpointAddress);
        rm.delete(new QueryPath(HINTS_CF, key, tableCF), timestamp);
        rm.apply();
    }

    public void deleteHintsForEndpoint(final String ipOrHostname)
    {
        try
        {
            InetAddress endpoint = InetAddress.getByName(ipOrHostname);
            deleteHintsForEndpoint(endpoint);
        }
        catch (UnknownHostException e)
        {
            logger_.warn("Unable to find " + ipOrHostname + "; is it a hostname or IP address of a node?", e);
            throw new RuntimeException(e);
        }
    }

    public void deleteHintsForEndpoint(final InetAddress endpoint)
    {
        final String ipaddr = endpoint.getHostAddress();
        final ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
        final RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, ByteBufferUtil.bytes(ipaddr));
        rm.delete(new QueryPath(HINTS_CF), System.currentTimeMillis());

        // execute asynchronously to avoid blocking caller (which may be processing gossip)
        Runnable runnable = new Runnable()
        {
            public void run()
            {
                try
                {
                    logger_.info("Deleting any stored hints for " + ipaddr);
                    rm.apply();
                    hintStore.forceFlush();
                    CompactionManager.instance.submitMajor(hintStore, 0, Integer.MAX_VALUE);
                }
                catch (Exception e)
                {
                    logger_.warn("Could not delete hints for " + ipaddr + ": " + e);
                }
            }
        };
        StorageService.scheduledTasks.execute(runnable);
    }

    private static boolean pagingFinished(ColumnFamily hintColumnFamily, ByteBuffer startColumn)
    {
        // done if no hints found or the start column (same as last column processed in previous iteration) is the only one
        return hintColumnFamily == null
               || (hintColumnFamily.getSortedColumns().size() == 1 && hintColumnFamily.getColumn(startColumn) != null);
    }
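    // A worked example of the combined-name encoding handled by the two helpers below
    // (the keyspace and CF names are hypothetical):
    //
    //     makeCombinedName("Keyspace1", "Standard1")  ->  UTF-8 bytes of "Keyspace1-Standard1"
    //     getTableAndCFNames(<those bytes>)           ->  { "Keyspace1", "Standard1" }
    //
    // getTableAndCFNames splits on the *last* SEPARATOR, so the encoding stays unambiguous
    // as long as column family names never contain SEPARATOR.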
    public static ByteBuffer makeCombinedName(String tableName, String columnFamily)
    {
        byte[] withsep = ArrayUtils.addAll(tableName.getBytes(UTF_8), SEPARATOR.getBytes(UTF_8));
        return ByteBuffer.wrap(ArrayUtils.addAll(withsep, columnFamily.getBytes(UTF_8)));
    }

    private static String[] getTableAndCFNames(ByteBuffer joined)
    {
        int index = ByteBufferUtil.lastIndexOf(joined, SEPARATOR.getBytes(UTF_8)[0], joined.limit());
        if (index == -1 || index < (joined.position() + 1))
            throw new RuntimeException("Corrupted hint name " + ByteBufferUtil.bytesToHex(joined));
        try
        {
            return new String[] { ByteBufferUtil.string(joined, joined.position(), index - joined.position()),
                                  ByteBufferUtil.string(joined, index + 1, joined.limit() - (index + 1)) };
        }
        catch (CharacterCodingException e)
        {
            throw new RuntimeException(e);
        }
    }
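    /**
     * Blocks until the gossiped schema version of the given endpoint matches our own,
     * sleeping in one-second increments up to 2 * RING_DELAY before giving up.
     * Returns the number of milliseconds spent waiting for version agreement (0 if the
     * versions already matched), which the caller uses to decide whether it still needs
     * to stagger delivery itself.
     */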
    private int waitForSchemaAgreement(InetAddress endpoint) throws InterruptedException
    {
        Gossiper gossiper = Gossiper.instance;
        int waited = 0;
        // first, wait for schema to be gossiped.
        while (gossiper.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.SCHEMA) == null)
        {
            Thread.sleep(1000);
            waited += 1000;
            if (waited > 2 * StorageService.RING_DELAY)
                throw new RuntimeException("Didn't receive gossiped schema from " + endpoint + " in " + 2 * StorageService.RING_DELAY + "ms");
        }
        waited = 0;
        // then wait for the correct schema version.
        while (!gossiper.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.SCHEMA).value.equals(
                gossiper.getEndpointStateForEndpoint(FBUtilities.getLocalAddress()).getApplicationState(ApplicationState.SCHEMA).value))
        {
            Thread.sleep(1000);
            waited += 1000;
            if (waited > 2 * StorageService.RING_DELAY)
                throw new RuntimeException("Could not reach schema agreement with " + endpoint + " in " + 2 * StorageService.RING_DELAY + "ms");
        }
        logger_.debug("schema for {} matches local schema", endpoint);
        return waited;
    }

    private void deliverHintsToEndpoint(InetAddress endpoint) throws IOException, DigestMismatchException, InvalidRequestException, TimeoutException, InterruptedException
    {
        try
        {
            logger_.debug("Checking remote schema before delivering hints");
            int waited = waitForSchemaAgreement(endpoint);
            // sleep a random amount to stagger handoff delivery from different replicas.
            // (if we had to wait, then gossiper randomness took care of that for us already.)
            if (waited == 0)
            {
                int sleep = new Random().nextInt(60000);
                logger_.debug("Sleeping {}ms to stagger hint delivery", sleep);
                Thread.sleep(sleep);
            }
            if (!Gossiper.instance.getEndpointStateForEndpoint(endpoint).isAlive())
            {
                logger_.info("Endpoint {} died before hint delivery, aborting", endpoint);
                return;
            }
        }
        finally
        {
            queuedDeliveries.remove(endpoint);
        }

        logger_.info("Started hinted handoff for endpoint " + endpoint);

        // 1. Get the key of the endpoint we need to handoff
        // 2. For each column, read the list of rows: subcolumns are KS + SEPARATOR + CF
        // 3. Delete the subcolumn if the write was successful
        // 4. Force a flush
        // 5. Do major compaction to clean up all deletes etc.
        ByteBuffer endpointAsUTF8 = ByteBufferUtil.bytes(endpoint.getHostAddress()); // keys have to be UTF8 to make OPP happy
        DecoratedKey epkey = StorageService.getPartitioner().decorateKey(endpointAsUTF8);
        int rowsReplayed = 0;
        ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
        ByteBuffer startColumn = ByteBufferUtil.EMPTY_BYTE_BUFFER;
        delivery:
        while (true)
        {
            QueryFilter filter = QueryFilter.getSliceFilter(epkey,
                                                            new QueryPath(HINTS_CF),
                                                            startColumn,
                                                            ByteBufferUtil.EMPTY_BYTE_BUFFER,
                                                            false,
                                                            PAGE_SIZE);
            ColumnFamily hintColumnFamily = ColumnFamilyStore.removeDeleted(hintStore.getColumnFamily(filter), Integer.MAX_VALUE);
            if (pagingFinished(hintColumnFamily, startColumn))
                break;
            for (IColumn keyColumn : hintColumnFamily.getSortedColumns())
            {
                startColumn = keyColumn.name();
                Collection<IColumn> tableCFs = keyColumn.getSubColumns();
                for (IColumn tableCF : tableCFs)
                {
                    String[] parts = getTableAndCFNames(tableCF.name());
                    if (sendMessage(endpoint, parts[0], parts[1], keyColumn.name()))
                    {
                        deleteHintKey(endpointAsUTF8, keyColumn.name(), tableCF.name(), tableCF.timestamp());
                        rowsReplayed++;
                    }
                    else
                    {
                        logger_.info("Could not complete hinted handoff to " + endpoint);
                        break delivery;
                    }
                }
            }
        }

        if (rowsReplayed > 0)
        {
            hintStore.forceFlush();
            try
            {
                CompactionManager.instance.submitMajor(hintStore, 0, Integer.MAX_VALUE).get();
            }
            catch (Exception e)
            {
                throw new RuntimeException(e);
            }
        }

        logger_.info(String.format("Finished hinted handoff of %s rows to endpoint %s", rowsReplayed, endpoint));
    }

    /** called when a keyspace is dropped or renamed.  newTable == null in the case of a drop. */
    public static void renameHints(String oldTable, String newTable) throws IOException
    {
        DecoratedKey oldTableKey = StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(oldTable));
        // we're basically going to fetch, drop and add the scf for the old and new table.  we need to do it piecemeal
        // though, since there could be GB of data.
        ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
        ByteBuffer startCol = ByteBufferUtil.EMPTY_BYTE_BUFFER;
        long now = System.currentTimeMillis();
        while (true)
        {
            QueryFilter filter = QueryFilter.getSliceFilter(oldTableKey,
                                                            new QueryPath(HINTS_CF),
                                                            startCol,
                                                            ByteBufferUtil.EMPTY_BYTE_BUFFER,
                                                            false,
                                                            PAGE_SIZE);
            ColumnFamily cf = ColumnFamilyStore.removeDeleted(hintStore.getColumnFamily(filter), Integer.MAX_VALUE);
            if (pagingFinished(cf, startCol))
                break;
            if (newTable != null)
            {
                RowMutation insert = new RowMutation(Table.SYSTEM_TABLE, ByteBufferUtil.bytes(newTable));
                insert.add(cf);
                insert.apply();
            }
            RowMutation drop = new RowMutation(Table.SYSTEM_TABLE, oldTableKey.key);
            for (ByteBuffer key : cf.getColumnNames())
            {
                drop.delete(new QueryPath(HINTS_CF, key), now);
                startCol = key;
            }
            drop.apply();
        }
    }

    /*
     * This method is used to deliver hints to a particular endpoint.
     * When we learn that some endpoint is back up we deliver the data
     * to it via an event-driven mechanism.
     */
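    // For example, a manual trigger (hypothetical address) for when the FailureDetector
    // misses the up-notification:
    //
    //     HintedHandOffManager.instance.deliverHints("10.0.0.1");
    //
    // Duplicate requests are cheap: if a delivery task for this endpoint is already queued,
    // deliverHints returns without scheduling another one.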
    public void deliverHints(final InetAddress to)
    {
        if (!queuedDeliveries.add(to))
            return;

        Runnable r = new WrappedRunnable()
        {
            public void runMayThrow() throws Exception
            {
                deliverHintsToEndpoint(to);
            }
        };
        executor_.execute(r);
    }

    public void deliverHints(String to) throws UnknownHostException
    {
        deliverHints(InetAddress.getByName(to));
    }

    public List<String> listEndpointsPendingHints()
    {
        List<Row> rows = getHintsSlice(1);

        // Extract the keys as strings to be reported.
        LinkedList<String> result = new LinkedList<String>();
        for (Row r : rows)
        {
            if (r.cf != null) // ignore removed rows
                result.addFirst(new String(r.key.key.array()));
        }
        return result;
    }

    public Map<String, Integer> countPendingHints()
    {
        List<Row> rows = getHintsSlice(Integer.MAX_VALUE);

        Map<String, Integer> result = new HashMap<String, Integer>();
        for (Row r : rows)
        {
            if (r.cf != null) // ignore removed rows
                result.put(new String(r.key.key.array()), r.cf.getColumnCount());
        }
        return result;
    }

    private List<Row> getHintsSlice(int columnCount)
    {
        // ColumnParent for the hints CF...
        ColumnParent parent = new ColumnParent(HINTS_CF);

        // ... taking up to columnCount columns per row...
        SlicePredicate predicate = new SlicePredicate();
        SliceRange sliceRange = new SliceRange();
        sliceRange.setStart(new byte[0]).setFinish(new byte[0]);
        sliceRange.setCount(columnCount);
        predicate.setSlice_range(sliceRange);

        // ... over the full key range, from "" to "".
        IPartitioner partitioner = StorageService.getPartitioner();
        ByteBuffer empty = ByteBufferUtil.EMPTY_BYTE_BUFFER;
        Range range = new Range(partitioner.getToken(empty), partitioner.getToken(empty));

        // Get a bunch of rows!
        List<Row> rows;
        try
        {
            rows = StorageProxy.getRangeSlice(new RangeSliceCommand("system", parent, predicate, range, LARGE_NUMBER), ConsistencyLevel.ONE);
        }
        catch (Exception e)
        {
            logger_.info("HintsCF getHintsSlice timed out.");
            throw new RuntimeException(e);
        }
        return rows;
    }
}