/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.service; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOError; import java.io.IOException; import java.net.InetAddress; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamily; import org.apache.cassandra.db.ReadResponse; import org.apache.cassandra.db.Row; import org.apache.cassandra.db.RowMutation; import org.apache.cassandra.db.RowMutationMessage; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.cliffc.high_scale_lib.NonBlockingHashMap; /** * Turns ReadResponse messages into Row objects, resolving to the most recent * version and setting up read repairs as necessary. */ public class ReadResponseResolver extends SimpleReadResponseResolver implements IResponseResolver<Row> { public interface ScheduleRepairListener { void listenRepair(ColumnFamily resolved, String table, String key, List<ColumnFamily> versions, List<InetAddress> endPoints, int versionIndex, ColumnFamily diffCf); } public static ScheduleRepairListener scheduleRepairListener; private final int responseCount; private final Map<InetAddress, ReadResponse> results = new NonBlockingHashMap<InetAddress, ReadResponse>(); public ReadResponseResolver(String table, String key, int responseCount) { super(table,key); assert 1 <= responseCount && responseCount <= DatabaseDescriptor.getReplicationFactor(table) : "invalid response count " + responseCount; this.responseCount = responseCount; } /* (non-Javadoc) * @see org.apache.cassandra.service.IResponseResolver#resolve(org.apache.cassandra.net.Message) */ @Override public Row resolve(Message message) throws IOException { ReadResponse result = parseResponse(message); return result.row(); } /* * This method for resolving read data should look at the timestamps of each * of the columns that are read and should pick up columns with the latest * timestamp. For those columns where the timestamp is not the latest a * repair request should be scheduled. * */ public Row resolve(Collection<Message> responses) throws DigestMismatchException, IOException { if (logger_.isDebugEnabled()) logger_.debug("resolving " + responses.size() + " responses"); long startTime = System.currentTimeMillis(); List<ColumnFamily> versions = new ArrayList<ColumnFamily>(responses.size()); List<InetAddress> endPoints = new ArrayList<InetAddress>(responses.size()); byte[] digest = null; /* * Populate the list of rows from each of the messages * Check to see if there is a digest query. If a digest * query exists then we need to compare the digest with * the digest of the data that is received. */ for (Message message : responses) { ReadResponse result = results.get(message.getFrom()); if (result == null) continue; // arrived after quorum already achieved if (result.isDigestQuery()) { if (digest == null) { digest = result.digest(); } else { byte[] digest2 = result.digest(); if (!Arrays.equals(digest, digest2)) throw new DigestMismatchException(key, digest, digest2); } } else { versions.add(result.row().cf); endPoints.add(message.getFrom()); } } // If there was a digest query compare it with all the data digests // If there is a mismatch then throw an exception so that read repair can happen. if (digest != null) { for (ColumnFamily cf : versions) { byte[] digest2 = ColumnFamily.digest(cf); if (!Arrays.equals(digest, digest2)) throw new DigestMismatchException(key, digest, digest2); } if (logger_.isDebugEnabled()) logger_.debug("digests verified"); } Row resolved = resolve(versions, endPoints); if (logger_.isDebugEnabled()) logger_.debug("resolve: " + (System.currentTimeMillis() - startTime) + " ms."); return resolved; } /** * For each row version, compare with resolved (the superset of all row versions); * if it is missing anything, send a mutation to the endpoint it come from. */ public static void maybeScheduleRepairs(ColumnFamily resolved, String table, String key, List<ColumnFamily> versions, List<InetAddress> endPoints) { for (int i = 0; i < versions.size(); i++) { ColumnFamily diffCf = ColumnFamily.diff(versions.get(i), resolved); if (diffCf == null) // no repair needs to happen continue; if (scheduleRepairListener != null) { scheduleRepairListener.listenRepair(resolved, table, key, versions, endPoints, i, diffCf); } // create and send the row mutation message based on the diff RowMutation rowMutation = new RowMutation(table, key); rowMutation.add(diffCf); RowMutationMessage rowMutationMessage = new RowMutationMessage(rowMutation); Message repairMessage; try { repairMessage = rowMutationMessage.makeRowMutationMessage(StorageService.Verb.READ_REPAIR); } catch (IOException e) { throw new IOError(e); } MessagingService.instance.sendOneWay(repairMessage, endPoints.get(i)); StorageProxy.countReadRepair(); } } static ColumnFamily resolveSuperset(List<ColumnFamily> versions) { assert versions.size() > 0; ColumnFamily resolved = null; for (ColumnFamily cf : versions) { if (cf != null) { resolved = cf.cloneMe(); break; } } if (resolved == null) return null; for (ColumnFamily cf : versions) { resolved.resolve(cf); } return resolved; } public void preprocess(Message message) { try { ReadResponse result = parseResponse(message); results.put(message.getFrom(), result); } catch (IOException e) { throw new IOError(e); } } /** hack so ConsistencyChecker doesn't have to serialize/deserialize an extra real Message */ public void injectPreProcessed(Message message, ReadResponse result) { results.put(message.getFrom(), result); } /** * @param endpoint * @param readResponse */ public void injectPreProcessed(InetAddress endpoint, ReadResponse readResponse) { results.put(endpoint, readResponse); } public boolean isDataPresent(Collection<Message> responses) { int digests = 0; int data = 0; for (Message message : responses) { ReadResponse result = results.get(message.getFrom()); if (result == null) continue; // arrived concurrently if (result.isDigestQuery()) digests++; else data++; } return data > 0 && (data + digests >= responseCount); } }