/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.hadoop2.multiquery;

import java.io.IOException;
import java.util.List;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.google.common.collect.Lists;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// TODO: Update javadoc.
/**
 * Hadoop InputFormat for Cassandra that uses the DataStax Java driver, rather than Cassandra's
 * Thrift API, to fetch data from tables.
 *
 * Allows map/reduce jobs to run against Cassandra rows fetched by one or more CQL queries.
 *
 * At a minimum, you need to set the keyspace and column family in your Hadoop job
 * Configuration. The ConfigHelper class is provided to make this simple:
 * ConfigHelper.setInputColumnFamily
 *
 * You can also configure:
 *
 * the number of CQL rows per page, via CQLConfigHelper.setInputCQLPageRowSize. The default
 * page row size is 1000. You should set it "as big as possible, but no bigger": it sets the
 * LIMIT for the CQL query, so it needs to be large enough to minimize network overhead, but
 * not so large that it causes out-of-memory problems.
 *
 * the column names of the SELECT CQL query, via CQLConfigHelper.setInputColumns. The default
 * is all columns.
 *
 * a user-defined WHERE clause, via CQLConfigHelper.setInputWhereClauses. The default is no
 * user-defined WHERE clause.
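 *
 * As a sketch, job setup might look like the following. The setter names shown here are
 * assumptions that mirror the getters this class actually calls
 * (getInputNativeTransportContactPoints, getInputNativeTransportPort, getInputCqlQueries);
 * check ConfigHelper for the real method names.
 *
 * <pre>{@code
 * Job job = Job.getInstance(new Configuration(), "my-cassandra-job");
 * Configuration conf = job.getConfiguration();
 * // Assumed setters corresponding to the getters used by this InputFormat:
 * ConfigHelper.setInputNativeTransportContactPoints(conf, "127.0.0.1");
 * ConfigHelper.setInputNativeTransportPort(conf, 9042);
 * // ...plus at least one CQL query, e.g. via an assumed ConfigHelper.setInputCqlQueries.
 * job.setInputFormatClass(MultiQueryCqlInputFormat.class);
 * }</pre>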
*/
public class MultiQueryCqlInputFormat extends InputFormat<Text, List<Row>> {

  private static final Logger LOG = LoggerFactory.getLogger(MultiQueryCqlInputFormat.class);

/**
   * Validate that all of the necessary configuration settings are present.
*
* @param conf Hadoop configuration.
* @param session Open C* session.
* @throws IOException if the configuration is invalid.
*/
protected void validateConfiguration(Configuration conf, Session session) throws IOException {
List<CqlQuerySpec> queries = ConfigHelper.getInputCqlQueries(conf);
    if (queries.isEmpty()) {
      throw new IOException("Must specify at least one input CQL query.");
}
// Check that all keyspaces and tables exist.
MultiQueryRecordReader.checkKeyspacesAndTablesExist(session, queries);
// Check that all tables have the same partition keys.
MultiQueryRecordReader.checkParitionKeysAreIdentical(session, queries);
    // TODO: Check that all queried columns exist.
    // TODO: Check that all specified clustering columns are the same across tables.
  }

/**
* {@inheritDoc}
*/
@Override
public RecordReader<Text, List<Row>> createRecordReader(
InputSplit inputSplit,
TaskAttemptContext context) {
return new MultiQueryRecordReader();
  }

/**
* {@inheritDoc}
*/
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
Configuration conf = context.getConfiguration();
return getSplitsFromConf(conf);
  }

/**
   * Internal method that we can call with just a Hadoop Configuration, which makes it useful
   * for unit testing.
*
* @param conf Hadoop configuration.
* @return A list of input splits for a MapReduce job.
   * @throws IOException if the configuration is invalid.
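   *
   * For example, a unit test can call this method directly, without standing up the full
   * MapReduce job context (a sketch; the configuration is assumed to carry contact points,
   * port, and queries as described in the class javadoc):
   *
   * <pre>{@code
   * Configuration conf = new Configuration();
   * // ... set contact points, port, and CQL queries as in the class javadoc ...
   * List<InputSplit> splits = new MultiQueryCqlInputFormat().getSplitsFromConf(conf);
   * }</pre>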
*/
public List<InputSplit> getSplitsFromConf(Configuration conf) throws IOException {
// Create a session with a custom load-balancing policy that will ensure that we send queries
// for system.local and system.peers to the same node.
Cluster cluster = Cluster
.builder()
.addContactPoints(ConfigHelper.getInputNativeTransportContactPoints(conf))
.withPort(ConfigHelper.getInputNativeTransportPort(conf))
.withLoadBalancingPolicy(new ConsistentHostOrderPolicy())
.build();
    Session session = cluster.connect();
    final List<InputSplit> inputSplitList = Lists.newArrayList();
    try {
      validateConfiguration(conf, session);

      // Get a list of all of the subsplits. A "subsplit" contains the following:
      // - A token range (corresponding to a virtual node in the C* cluster)
      // - A list of replica nodes for that token range
      final SubsplitCreator subsplitCreator = new SubsplitCreator(conf, session);
      final List<Subsplit> subsplitsFromTokens = subsplitCreator.createSubsplits();
      LOG.debug("Created {} subsplits from tokens", subsplitsFromTokens.size());

      // In this InputFormat, we allow the user to specify a desired number of InputSplits, and
      // we will likely have far more subsplits (vnodes) than that. Therefore, we combine
      // subsplits (ideally those that share the same replica nodes) until we reach the desired
      // InputSplit count.
      final SubsplitCombiner subsplitCombiner = new SubsplitCombiner(conf);

      // Use addAll because Java's generics will not let us cast the combiner's list of a
      // concrete InputSplit subclass to a List<InputSplit>.
      inputSplitList.addAll(subsplitCombiner.combineSubsplits(subsplitsFromTokens));
    } finally {
      // Close the cluster (and its sessions) even if validation or subsplit creation fails.
      cluster.close();
    }

    LOG.info("Created a total of {} InputSplits", inputSplitList.size());
    for (InputSplit inputSplit : inputSplitList) {
      LOG.debug("{}", inputSplit);
    }
    return inputSplitList;
}
}