package org.apache.cassandra.hadoop;
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.cassandra.client.RingCache;
import static org.apache.cassandra.io.SerDeUtils.copy;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.Clock;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.SuperColumn;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.thrift.transport.TSocket;

/**
 * The <code>ColumnFamilyRecordWriter</code> maps the output &lt;key, value&gt;
 * pairs to a Cassandra column family. In particular, it applies all mutations
 * in the value, which it associates with the key, and in turn the responsible
 * endpoint.
 *
 * <p>
 * Note that, given that round trips to the server are fairly expensive, it
 * merely batches the mutations in-memory and periodically sends the batched
 * mutations to the server in one shot.
 * </p>
 *
 * <p>
 * Furthermore, this writer groups the mutations by the endpoint responsible for
 * the rows being affected. This allows the mutations to be executed in parallel,
 * directly to a responsible endpoint.
 * </p>
 *
 * @author Karthick Sankarachary
 * @see ColumnFamilyOutputFormat
 * @see OutputFormat
 */
final class ColumnFamilyRecordWriter
    extends RecordWriter<ByteBuffer,List<org.apache.cassandra.avro.Mutation>>
    implements org.apache.hadoop.mapred.RecordWriter<ByteBuffer,List<org.apache.cassandra.avro.Mutation>>
{
    // The configuration this writer is associated with.
    private final Configuration conf;

    // The batched set of mutations grouped by endpoints.
    private Map<InetAddress,Map<byte[],Map<String,List<Mutation>>>> mutationsByEndpoint;
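    // Illustrative note (added sketch, not part of the original source): the map above nests
    // endpoint -> row key -> column family name -> list of mutations, which is exactly the
    // per-endpoint shape that Cassandra.Client#batch_mutate(Map, ConsistencyLevel) consumes
    // at flush time. For a single row with one column, the cached structure looks roughly like
    //
    //   { 10.0.0.1 -> { rowKeyBytes -> { "MyColumnFamily" -> [ mutation ] } } }
    //
    // where "MyColumnFamily" stands in for whatever ConfigHelper.getOutputColumnFamily(conf)
    // returns and 10.0.0.1 is a made-up endpoint address.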
    // The ring cache that describes the token ranges each node in the ring is
    // responsible for. This is what allows us to group the mutations by
    // the endpoints they should be targeted at. The targeted endpoint essentially
    // acts as the primary replica for the rows being affected by the mutations.
    private final RingCache ringCache;

    // The number of mutations currently held in the mutations cache.
    private long batchSize = 0L;

    // The maximum number of mutations to hold in the mutations cache.
    private final long batchThreshold;

    /**
     * Upon construction, obtain the map that this writer will use to collect
     * mutations, and the ring cache for the given keyspace.
     *
     * @param context the task attempt context
     * @throws IOException
     */
    ColumnFamilyRecordWriter(TaskAttemptContext context) throws IOException
    {
        this(context.getConfiguration());
    }

    ColumnFamilyRecordWriter(Configuration conf) throws IOException
    {
        this.conf = conf;
        this.mutationsByEndpoint = new HashMap<InetAddress,Map<byte[],Map<String,List<Mutation>>>>();
        this.ringCache = new RingCache(ConfigHelper.getOutputKeyspace(conf),
                                       ConfigHelper.getPartitioner(conf),
                                       ConfigHelper.getInitialAddress(conf),
                                       ConfigHelper.getRpcPort(conf));
        this.batchThreshold = conf.getLong(ColumnFamilyOutputFormat.BATCH_THRESHOLD, Long.MAX_VALUE);
    }

    /**
     * Return the endpoint responsible for the given key. The selected endpoint
     * is the one whose token range contains the given key.
     *
     * @param key the key being mutated
     * @return the endpoint responsible for that key
     */
    protected InetAddress getEndpoint(byte[] key)
    {
        return ringCache.getEndpoint(key).iterator().next();
    }

    /**
     * Writes a key/value pair, not to the Cassandra server, but into an
     * in-memory cache (viz. {@link #mutationsByEndpoint}).
     *
     * <p>
     * If the key is to be associated with a valid value, a mutation is created
     * for it with the given column family and columns. In the event the value
     * in the column is missing (i.e., null), then it is marked for
     * {@link Deletion}. Similarly, if the entire value for a key is missing
     * (i.e., null), then the entire key is marked for {@link Deletion}.
     * </p>
     *
     * @param keybuff the key to write.
     * @param value the value to write.
     * @throws IOException
     */
    @Override
    public synchronized void write(ByteBuffer keybuff, List<org.apache.cassandra.avro.Mutation> value) throws IOException
    {
        maybeFlush();
        byte[] key = copy(keybuff);
        InetAddress endpoint = getEndpoint(key);

        Map<byte[], Map<String, List<Mutation>>> mutationsByKey = mutationsByEndpoint.get(endpoint);
        if (mutationsByKey == null)
        {
            mutationsByKey = new TreeMap<byte[], Map<String, List<Mutation>>>(FBUtilities.byteArrayComparator);
            mutationsByEndpoint.put(endpoint, mutationsByKey);
        }

        Map<String, List<Mutation>> cfMutation = mutationsByKey.get(key);
        if (cfMutation == null)
        {
            cfMutation = new HashMap<String, List<Mutation>>();
            mutationsByKey.put(key, cfMutation);
        }

        List<Mutation> mutationList = cfMutation.get(ConfigHelper.getOutputColumnFamily(conf));
        if (mutationList == null)
        {
            mutationList = new ArrayList<Mutation>();
            cfMutation.put(ConfigHelper.getOutputColumnFamily(conf), mutationList);
        }

        for (org.apache.cassandra.avro.Mutation amut : value)
            mutationList.add(avroToThrift(amut));
    }
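    // Illustrative sketch (added comment; the column name, value, and row key below are made up).
    // Callers, typically a reducer configured with ColumnFamilyOutputFormat, feed this method a
    // raw row key plus Avro mutations built along these lines:
    //
    //   org.apache.cassandra.avro.Column col = new org.apache.cassandra.avro.Column();
    //   col.name = ByteBuffer.wrap("count".getBytes());
    //   col.value = ByteBuffer.wrap("42".getBytes());
    //   col.clock = new org.apache.cassandra.avro.Clock();
    //   col.clock.timestamp = System.currentTimeMillis();
    //
    //   org.apache.cassandra.avro.ColumnOrSuperColumn cosc = new org.apache.cassandra.avro.ColumnOrSuperColumn();
    //   cosc.column = col;
    //   org.apache.cassandra.avro.Mutation mutation = new org.apache.cassandra.avro.Mutation();
    //   mutation.column_or_supercolumn = cosc;
    //
    //   // in a reducer: context.write(ByteBuffer.wrap(rowKey), Collections.singletonList(mutation));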
    /**
     * Deep copies the given Avro mutation into a new Thrift mutation.
     */
    private Mutation avroToThrift(org.apache.cassandra.avro.Mutation amut)
    {
        Mutation mutation = new Mutation();
        org.apache.cassandra.avro.ColumnOrSuperColumn acosc = amut.column_or_supercolumn;
        if (acosc != null)
        {
            // creation
            ColumnOrSuperColumn cosc = new ColumnOrSuperColumn();
            mutation.setColumn_or_supercolumn(cosc);
            if (acosc.column != null)
                // standard column
                cosc.setColumn(avroToThrift(acosc.column));
            else
            {
                // super column
                byte[] scolname = copy(acosc.super_column.name);
                List<Column> scolcols = new ArrayList<Column>((int)acosc.super_column.columns.size());
                for (org.apache.cassandra.avro.Column acol : acosc.super_column.columns)
                    scolcols.add(avroToThrift(acol));
                cosc.setSuper_column(new SuperColumn(scolname, scolcols));
            }
        }
        else
        {
            // deletion
            Deletion deletion = new Deletion(avroToThrift(amut.deletion.clock));
            mutation.setDeletion(deletion);
            org.apache.cassandra.avro.SlicePredicate apred = amut.deletion.predicate;
            if (amut.deletion.super_column != null)
                // super column
                deletion.setSuper_column(copy(amut.deletion.super_column));
            else if (apred.column_names != null)
            {
                // column names
                List<byte[]> colnames = new ArrayList<byte[]>((int)apred.column_names.size());
                for (ByteBuffer acolname : apred.column_names)
                    colnames.add(copy(acolname));
                deletion.setPredicate(new SlicePredicate().setColumn_names(colnames));
            }
            else
            {
                // range
                deletion.setPredicate(new SlicePredicate().setSlice_range(avroToThrift(apred.slice_range)));
            }
        }
        return mutation;
    }

    private SliceRange avroToThrift(org.apache.cassandra.avro.SliceRange asr)
    {
        return new SliceRange(copy(asr.start), copy(asr.finish), asr.reversed, asr.count);
    }

    private Column avroToThrift(org.apache.cassandra.avro.Column acol)
    {
        return new Column(copy(acol.name), copy(acol.value), avroToThrift(acol.clock));
    }

    private Clock avroToThrift(org.apache.cassandra.avro.Clock aclo)
    {
        return new Clock().setTimestamp(aclo.timestamp);
    }

    /**
     * Close this <code>RecordWriter</code> to future operations, but not before
     * flushing out the batched mutations.
     *
     * @param context the context of the task
     * @throws IOException
     */
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException
    {
        flush();
    }

    /** Implements the deprecated RecordWriter interface for streaming. */
    @Deprecated
    @Override
    public void close(org.apache.hadoop.mapred.Reporter reporter) throws IOException
    {
        flush();
    }

    /**
     * Flush the mutations cache, iff more than {@link #batchThreshold} mutations
     * have been cached.
     *
     * @throws IOException
     */
    private void maybeFlush() throws IOException
    {
        if (++batchSize > batchThreshold)
        {
            flush();
            batchSize = 0L;
        }
    }
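    // Illustrative note (added; the value 1024 is arbitrary): the threshold is read from the
    // job configuration in the constructor, so a job that wants smaller batches can set, e.g.,
    //
    //   conf.setLong(ColumnFamilyOutputFormat.BATCH_THRESHOLD, 1024L);
    //
    // With the default of Long.MAX_VALUE, maybeFlush() effectively never fires and all cached
    // mutations are held until close() calls flush().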
    /**
     * Send the batched mutations over to Cassandra, and then clear the
     * mutations cache.
     *
     * @throws IOException
     */
    protected synchronized void flush() throws IOException
    {
        ExecutorService executor = Executors.newCachedThreadPool();
        try
        {
            List<Future<?>> mutationFutures = new ArrayList<Future<?>>();
            for (Map.Entry<InetAddress, Map<byte[], Map<String, List<Mutation>>>> entry : mutationsByEndpoint.entrySet())
            {
                mutationFutures.add(executor.submit(new EndpointCallable(conf, entry.getKey(), entry.getValue())));
            }
            // wait until we have all the results back
            for (Future<?> mutationFuture : mutationFutures)
            {
                try
                {
                    mutationFuture.get();
                }
                catch (ExecutionException e)
                {
                    throw new IOException("Could not perform endpoint mutations", e.getCause());
                }
                catch (InterruptedException e)
                {
                    throw new AssertionError(e);
                }
            }
        }
        finally
        {
            executor.shutdownNow();
            mutationsByEndpoint.clear();
        }
    }

    /**
     * The <code>EndpointCallable</code> facilitates an asynchronous call to a
     * specific node in the ring that commands it to perform a batched set of
     * mutations. Needless to say, the given mutations are targeted at rows that
     * the selected endpoint is responsible for (i.e., is the primary replica
     * for).
     */
    public class EndpointCallable implements Callable<Void>
    {
        // The job configuration associated with this callable.
        private Configuration conf;

        // The endpoint of the primary replica for the rows being mutated.
        private InetAddress endpoint;

        // The mutations to be performed in the node referenced by {@link #endpoint}.
        private Map<byte[], Map<String, List<Mutation>>> mutations;

        /**
         * Constructs an {@link EndpointCallable} for the given endpoint and set
         * of mutations.
         *
         * @param conf      job configuration
         * @param endpoint  the endpoint wherein to execute the mutations
         * @param mutations the mutation map expected by
         *                  {@link Cassandra.Client#batch_mutate(Map, ConsistencyLevel)}
         */
        public EndpointCallable(Configuration conf, InetAddress endpoint, Map<byte[], Map<String, List<Mutation>>> mutations)
        {
            this.conf = conf;
            this.endpoint = endpoint;
            this.mutations = mutations;
        }

        /**
         * Perform the call to
         * {@link Cassandra.Client#batch_mutate(Map, ConsistencyLevel)}.
         */
        public Void call() throws Exception
        {
            TSocket socket = null;
            try
            {
                socket = new TSocket(endpoint.getHostName(), ConfigHelper.getRpcPort(conf));
                Cassandra.Client client = ColumnFamilyOutputFormat.createAuthenticatedClient(socket, conf);
                client.batch_mutate(mutations, ConsistencyLevel.ONE);
                return null;
            }
            finally
            {
                if (socket != null)
                    socket.close();
            }
        }
    }
}
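// Illustrative usage sketch (added comment, not part of the original source). A Hadoop job
// writes through this class by selecting ColumnFamilyOutputFormat and describing the target
// cluster via ConfigHelper. The setter names below are assumed to mirror the getters used
// above (getOutputKeyspace, getOutputColumnFamily, getPartitioner, getInitialAddress,
// getRpcPort); confirm them against the ConfigHelper in your tree. The keyspace, column
// family, host, and port values are made up.
//
//   Job job = new Job(conf, "example-output-job");
//   job.setOutputKeyClass(ByteBuffer.class);
//   job.setOutputValueClass(List.class);
//   job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
//
//   ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "Keyspace1", "Standard1");
//   ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
//   ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
//   ConfigHelper.setRpcPort(job.getConfiguration(), "9160");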