/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.hadoop2.multiquery;

import java.io.IOException;
import java.util.List;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// TODO: Update javadoc.
/**
 * Hadoop InputFormat for Cassandra that does not use Cassandra's Thrift API.
 *
 * This InputFormat allows map/reduce against Cassandra rows within one ColumnFamily. It uses the
 * DataStax Java driver to fetch data from tables.
 *
 * At minimum, you need to set the keyspace and column family in your Hadoop job Configuration.
 * The ConfigHelper class is provided to make this simple:
 *   ConfigHelper.setInputColumnFamily
 *
 * You can also configure:
 *
 * the number of CQL rows per page:
 *   CQLConfigHelper.setInputCQLPageRowSize. The default page row size is 1000. You should set it
 *   to "as big as possible, but no bigger." It sets the LIMIT for the CQL query, so it needs to
 *   be big enough to minimize network overhead, but not so big that it causes out-of-memory
 *   problems.
 *
 * the column names of the select CQL query (the default is all columns):
 *   CQLConfigHelper.setInputColumns
 *
 * a user-defined where clause (the default is no user-defined where clause):
 *   CQLConfigHelper.setInputWhereClauses
 *
 * A sketch of a possible job setup appears in a comment at the bottom of this file.
 */
public class MultiQueryCqlInputFormat extends InputFormat<Text, List<Row>> {
  private static final Logger LOG = LoggerFactory.getLogger(MultiQueryCqlInputFormat.class);

  /**
   * Validate that all of the necessary configuration settings are present.
   *
   * @param conf Hadoop configuration.
   * @param session Open C* session.
   * @throws IOException if the configuration is invalid.
   */
  protected void validateConfiguration(Configuration conf, Session session) throws IOException {
    List<CqlQuerySpec> queries = ConfigHelper.getInputCqlQueries(conf);
    if (0 == queries.size()) {
      throw new IOException("Must specify a query!");
    }

    // Check that all keyspaces and tables exist.
    MultiQueryRecordReader.checkKeyspacesAndTablesExist(session, queries);

    // Check that all tables have the same partition keys.
    MultiQueryRecordReader.checkParitionKeysAreIdentical(session, queries);

    // Check that all queried columns exist (see checkColumnExists below for one possible sketch).
    // Check that all specified clustering columns are the same across tables.
  }
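
  /**
   * Sketch of the column-existence check noted in validateConfiguration above; it is not wired
   * into that method yet. It assumes the keyspace, table, and column names have already been
   * extracted from the configured queries, and uses the driver's schema metadata to verify that
   * the named column exists.
   *
   * @param session Open C* session.
   * @param keyspace Keyspace containing the table.
   * @param table Table that the query reads from.
   * @param column Column that the query selects.
   * @throws IOException if the keyspace, table, or column does not exist.
   */
  private static void checkColumnExists(
      Session session, String keyspace, String table, String column) throws IOException {
    final com.datastax.driver.core.KeyspaceMetadata keyspaceMetadata =
        session.getCluster().getMetadata().getKeyspace(keyspace);
    if (null == keyspaceMetadata) {
      throw new IOException(String.format("Keyspace %s does not exist.", keyspace));
    }
    final com.datastax.driver.core.TableMetadata tableMetadata = keyspaceMetadata.getTable(table);
    if (null == tableMetadata) {
      throw new IOException(String.format("Table %s.%s does not exist.", keyspace, table));
    }
    if (null == tableMetadata.getColumn(column)) {
      throw new IOException(String.format(
          "Column %s does not exist in table %s.%s.", column, keyspace, table));
    }
  }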
  /** {@inheritDoc} */
  @Override
  public RecordReader<Text, List<Row>> createRecordReader(
      InputSplit inputSplit, TaskAttemptContext context) {
    return new MultiQueryRecordReader();
  }

  /** {@inheritDoc} */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    return getSplitsFromConf(conf);
  }

  /**
   * Internal method that we can call with just a Hadoop Configuration - useful for unit testing.
   *
   * @param conf Hadoop configuration.
   * @return A list of input splits for a MapReduce job.
   * @throws java.io.IOException if the configuration is invalid or the cluster cannot be reached.
   */
  public List<InputSplit> getSplitsFromConf(Configuration conf) throws IOException {
    // Create a session with a custom load-balancing policy that will ensure that we send queries
    // for system.local and system.peers to the same node.
    Cluster cluster = Cluster
        .builder()
        .addContactPoints(ConfigHelper.getInputNativeTransportContactPoints(conf))
        .withPort(ConfigHelper.getInputNativeTransportPort(conf))
        .withLoadBalancingPolicy(new ConsistentHostOrderPolicy())
        .build();
    Session session = cluster.connect();

    validateConfiguration(conf, session);

    // Get a list of all of the subsplits. A "subsplit" contains the following:
    // - A token range (corresponding to a virtual node in the C* cluster)
    // - A list of replica nodes for that token range
    final SubsplitCreator subsplitCreator = new SubsplitCreator(conf, session);
    final List<Subsplit> subsplitsFromTokens = subsplitCreator.createSubsplits();
    LOG.debug(String.format("Created %d subsplits from tokens", subsplitsFromTokens.size()));

    // In this InputFormat, we allow the user to specify a desired number of InputSplits. We will
    // likely have far more subsplits (vnodes) than desired InputSplits. Therefore, we combine
    // subsplits (hopefully those that share the same replica nodes) until we get to our desired
    // InputSplit count.
    final SubsplitCombiner subsplitCombiner = new SubsplitCombiner(conf);

    List<InputSplit> inputSplitList = Lists.newArrayList();

    // Java is annoying here about casting a list.
    inputSplitList.addAll(subsplitCombiner.combineSubsplits(subsplitsFromTokens));
    cluster.close();

    LOG.info(String.format("Created a total of %d InputSplits", inputSplitList.size()));
    for (InputSplit inputSplit : inputSplitList) {
      LOG.debug(inputSplit.toString());
    }

    return inputSplitList;
  }
}
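
// Example job setup (a sketch only, not taken from this project's tests). The ConfigHelper
// setter names below are assumptions that mirror the getters used in getSplitsFromConf(); the
// actual setter names, and how CqlQuerySpec instances are built, may differ in this codebase.
//
//   Configuration conf = new Configuration();
//   ConfigHelper.setInputNativeTransportContactPoints(conf, "127.0.0.1");  // hypothetical setter
//   ConfigHelper.setInputNativeTransportPort(conf, 9042);                  // hypothetical setter
//   ConfigHelper.setInputCqlQueries(conf, querySpecs);                     // hypothetical setter
//
//   Job job = Job.getInstance(conf, "multi-query-cassandra-input");
//   job.setInputFormatClass(MultiQueryCqlInputFormat.class);
//   job.setMapperClass(MyMapper.class);  // MyMapper extends Mapper<Text, List<Row>, ?, ?>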