/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.service; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOError; import java.io.IOException; import java.util.*; import org.apache.cassandra.db.*; import java.net.InetAddress; import java.nio.ByteBuffer; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.utils.FBUtilities; import org.cliffc.high_scale_lib.NonBlockingHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Turns ReadResponse messages into Row objects, resolving to the most recent * version and setting up read repairs as necessary. */ public class ReadResponseResolver implements IResponseResolver<Row> { private static Logger logger_ = LoggerFactory.getLogger(ReadResponseResolver.class); private final String table; private final Map<Message, ReadResponse> results = new NonBlockingHashMap<Message, ReadResponse>(); public ReadResponseResolver(String table) { this.table = table; } /* * This method for resolving read data should look at the timestamps of each * of the columns that are read and should pick up columns with the latest * timestamp. For those columns where the timestamp is not the latest a * repair request should be scheduled. * */ public Row resolve(Collection<Message> responses) throws DigestMismatchException, IOException { if (logger_.isDebugEnabled()) logger_.debug("resolving " + responses.size() + " responses"); long startTime = System.currentTimeMillis(); List<ColumnFamily> versions = new ArrayList<ColumnFamily>(responses.size()); List<InetAddress> endpoints = new ArrayList<InetAddress>(responses.size()); DecoratedKey key = null; ByteBuffer digest = FBUtilities.EMPTY_BYTE_BUFFER; boolean isDigestQuery = false; /* * Populate the list of rows from each of the messages * Check to see if there is a digest query. If a digest * query exists then we need to compare the digest with * the digest of the data that is received. */ for (Message message : responses) { ReadResponse result = results.get(message); if (result == null) continue; // arrived after quorum already achieved if (result.isDigestQuery()) { digest = result.digest(); isDigestQuery = true; } else { versions.add(result.row().cf); endpoints.add(message.getFrom()); key = result.row().key; } } // If there was a digest query compare it with all the data digests // If there is a mismatch then throw an exception so that read repair can happen. if (isDigestQuery) { for (ColumnFamily cf : versions) { if (!ColumnFamily.digest(cf).equals(digest)) { /* Wrap the key as the context in this exception */ String s = String.format("Mismatch for key %s (%s vs %s)", key, FBUtilities.bytesToHex(ColumnFamily.digest(cf)), FBUtilities.bytesToHex(digest)); throw new DigestMismatchException(s); } } if (logger_.isDebugEnabled()) logger_.debug("digests verified"); } ColumnFamily resolved; if (versions.size() > 1) { resolved = resolveSuperset(versions); if (logger_.isDebugEnabled()) logger_.debug("versions merged"); maybeScheduleRepairs(resolved, table, key, versions, endpoints); } else { resolved = versions.get(0); } if (logger_.isDebugEnabled()) logger_.debug("resolve: " + (System.currentTimeMillis() - startTime) + " ms."); return new Row(key, resolved); } /** * For each row version, compare with resolved (the superset of all row versions); * if it is missing anything, send a mutation to the endpoint it come from. */ public static void maybeScheduleRepairs(ColumnFamily resolved, String table, DecoratedKey key, List<ColumnFamily> versions, List<InetAddress> endpoints) { for (int i = 0; i < versions.size(); i++) { ColumnFamily diffCf = ColumnFamily.diff(versions.get(i), resolved); if (diffCf == null) // no repair needs to happen continue; // create and send the row mutation message based on the diff RowMutation rowMutation = new RowMutation(table, key.key); rowMutation.add(diffCf); RowMutationMessage rowMutationMessage = new RowMutationMessage(rowMutation); Message repairMessage; try { repairMessage = rowMutationMessage.makeRowMutationMessage(StorageService.Verb.READ_REPAIR); } catch (IOException e) { throw new IOError(e); } MessagingService.instance.sendOneWay(repairMessage, endpoints.get(i)); } } static ColumnFamily resolveSuperset(List<ColumnFamily> versions) { assert versions.size() > 0; ColumnFamily resolved = null; for (ColumnFamily cf : versions) { if (cf != null) { resolved = cf.cloneMe(); break; } } if (resolved == null) return null; for (ColumnFamily cf : versions) { resolved.resolve(cf); } return resolved; } public void preprocess(Message message) { byte[] body = message.getMessageBody(); ByteArrayInputStream bufIn = new ByteArrayInputStream(body); try { ReadResponse result = ReadResponse.serializer().deserialize(new DataInputStream(bufIn)); results.put(message, result); } catch (IOException e) { throw new IOError(e); } } /** hack so ConsistencyChecker doesn't have to serialize/deserialize an extra real Message */ public void injectPreProcessed(Message message, ReadResponse result) { results.put(message, result); } public boolean isDataPresent(Collection<Message> responses) { for (Message message : responses) { ReadResponse result = results.get(message); if (result == null) continue; // arrived concurrently if (!result.isDigestQuery()) return true; } return false; } }