SubsplitCreator.java example

Explorer

cassandra-hadoop-2-master
- cassandra2-hadoop2-master
  - src
    - main
      - java
        org
        apache
        cassandra
        hadoop2
        multiquery
        ConfigHelper.java
        ConsistentHostOrderPolicy.java
        CqlQuerySpec.java
        MultiQueryCqlInputFormat.java
        MultiQueryRecordReader.java
        MultiRowIterator.java
        MultiqueryInputSplit.java
        Subsplit.java
        SubsplitCombiner.java
        SubsplitCreator.java
        TokenRange.java
        WhereClause.java
    - test
      - java
        org
        apache
        cassandra
        hadoop2
        multiquery
        BaseInputFormatTest.java
        SmokeIT.java
        TestInputFormat.java
        TestInputFormatWithBackgroundCassandra.java
        TestMultiRowIterator.java
        TestQuerySpec.java
        TestRecordReader.java

package org.apache.cassandra.hadoop2.multiquery;

import java.net.InetAddress;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Creates the initial list of {@link Subsplit}s for a Cassandra cluster.
 *
 * <p>Tokens and the hosts that own them are read from the {@code system.local} and
 * {@code system.peers} tables through the supplied {@link Session}.  Consecutive tokens (plus the
 * ring start and end tokens) are then paired up into token ranges, one subsplit per range.
 */
class SubsplitCreator {
  private static final Logger LOG = LoggerFactory.getLogger(SubsplitCreator.class);

  /** Hadoop configuration supplied by the caller; stored, but not currently read by this class. */
  private final Configuration mConf;

  /** Open Cassandra session used to query the system tables. */
  private final Session mSession;

  /**
   * Constructor for SubsplitCreator.
   *
   * @param conf Hadoop configuration containing information about the Cassandra cluster.
   * @param session An open Cassandra session whose cluster uses a
   *     {@link ConsistentHostOrderPolicy} as its load-balancing policy.
   * @throws NullPointerException if {@code session} is null.
   * @throws IllegalArgumentException if the session uses a different load-balancing policy.
   */
  public SubsplitCreator(Configuration conf, Session session) {
    Preconditions.checkNotNull(session, "session must not be null");
    mConf = conf;
    mSession = session;

    // Check that this session uses the load-balancing policy that we need.
    Preconditions.checkArgument(
        session.getCluster().getConfiguration().getPolicies().getLoadBalancingPolicy()
            instanceof ConsistentHostOrderPolicy,
        "Session must use a ConsistentHostOrderPolicy load-balancing policy");
  }

  /**
   * Create a list of subsplits for this Cassandra cluster.  Each subsplit is built from a host
   * name and a token range.
   *
   * @return The subsplits, ordered by ascending token range.
   */
  public List<Subsplit> createSubsplits() {
    // Create subsplits that initially contain a mapping from token ranges to primary hosts.
    List<Subsplit> subsplits = createInitialSubsplits();

    // TODO: Add replica nodes to the subsplits.

    return subsplits;
  }

  /**
   * Build one subsplit per pair of adjacent tokens, covering the whole ring from
   * {@code Subsplit.RING_START_TOKEN} to {@code Subsplit.RING_END_TOKEN}.
   *
   * @return The subsplits, ordered by ascending token range.
   * @throws IllegalArgumentException if no tokens were found, or if the ring bounds are not the
   *     smallest and largest values once added to the token list.
   */
  private List<Subsplit> createInitialSubsplits() {
    // Retrieve all of the tokens for each host from the system tables.
    Map<String, String> tokensToMasterNodes = getTokenToMasterNodeMapping();
    Preconditions.checkArgument(
        !tokensToMasterNodes.isEmpty(),
        "No tokens found in system.local or system.peers");

    // Go from a mapping between tokens and hosts to a mapping between token *ranges* and hosts.
    List<Long> sortedTokens = Lists.newArrayList();
    for (String tok : tokensToMasterNodes.keySet()) {
      sortedTokens.add(Long.parseLong(tok));
    }
    Collections.sort(sortedTokens);
    LOG.debug("Found {} total tokens", sortedTokens.size());
    LOG.debug("Minimum tokens is {}", sortedTokens.get(0));
    LOG.debug("Maximum tokens is {}", sortedTokens.get(sortedTokens.size() - 1));

    // We need to add the global min and global max token values so that we make sure that our
    // subsplits cover all of the data in the cluster.
    sortedTokens.add(Subsplit.RING_START_TOKEN);
    sortedTokens.add(Subsplit.RING_END_TOKEN);
    Collections.sort(sortedTokens);

    // Unbox explicitly so that these are numeric comparisons even if the ring constants are
    // declared as boxed Longs (== on two Long objects would compare references).
    final int lastIndex = sortedTokens.size() - 1;
    Preconditions.checkArgument(
        sortedTokens.get(0).longValue() == Subsplit.RING_START_TOKEN,
        "Smallest token %s is not the ring start token", sortedTokens.get(0));
    Preconditions.checkArgument(
        sortedTokens.get(lastIndex).longValue() == Subsplit.RING_END_TOKEN,
        "Largest token %s is not the ring end token", sortedTokens.get(lastIndex));

    // Loop through all of the pairs of tokens, creating subsplits for every pair.  Remember in
    // C* that the master node for a token gets all data between the *previous* token and the token
    // in question, so we assign ownership of a given subsplit to the node associated with the
    // second (greater) of the two tokens.
    List<Subsplit> subsplits = Lists.newArrayList();

    for (int tokenIndex = 0; tokenIndex < lastIndex; tokenIndex++) {
      long lowerBoundToken = sortedTokens.get(tokenIndex);

      String startToken = Long.toString(lowerBoundToken);
      String endToken = sortedTokens.get(tokenIndex + 1).toString();

      String hostForEndToken = tokensToMasterNodes.get(endToken);
      if (tokenIndex == lastIndex - 1) {
        // The final range ends at the artificial ring end token, which no host may own, so the
        // range is assigned to the owner of its start token (the largest real token).
        // NOTE(review): Cassandra normally gives the wrap-around range above the largest token
        // to the node with the *smallest* token -- confirm that this assignment is intended.
        Preconditions.checkArgument(
            null == hostForEndToken,
            "Ring end token %s is unexpectedly owned by host %s", endToken, hostForEndToken);
        hostForEndToken = tokensToMasterNodes.get(startToken);
      }
      Preconditions.checkNotNull(
          hostForEndToken,
          "No master node known for token range (%s, %s]", startToken, endToken);

      // Ownership for a given node looks like (previous token, my token], so we add 1 to the
      // start token, unless the start token is the first token in our entire ring.
      final String startTokenAdjustedForExclusive;
      if (tokenIndex > 0) {
        startTokenAdjustedForExclusive = Long.toString(lowerBoundToken + 1);
      } else {
        startTokenAdjustedForExclusive = startToken;
      }
      Subsplit subsplit = Subsplit.createFromHost(
          startTokenAdjustedForExclusive,
          endToken,
          hostForEndToken);
      subsplits.add(subsplit);
    }
    return subsplits;
  }

  /**
   * Read metadata from our Cassandra cluster to get the mapping from tokens to master nodes.
   *
   * @return A map from tokens (stored as strings) to the master nodes (stored as host names; the
   *     node the session reads {@code system.local} from is recorded as {@code "localhost"}).
   */
  private Map<String, String> getTokenToMasterNodeMapping() {
    Map<String, String> tokensToMasterNodes = Maps.newHashMap();

    // Query the `local` table to get the set of tokens for the local host.
    updateTokenListForLocalHost(mSession, tokensToMasterNodes);

    // Query the `peers` table to get mappings from tokens to all of the other hosts.
    updateTokenListForPeers(mSession, tokensToMasterNodes);

    return tokensToMasterNodes;
  }

  /**
   * Update our map of tokens to master nodes by getting a list of tokens owned by the local host.
   * The tokens are recorded under the host name {@code "localhost"}.
   *
   * @param session An open Cassandra session.
   * @param tokensToHosts The map from tokens to master nodes to update.
   * @throws IllegalArgumentException if {@code system.local} does not contain exactly one row.
   */
  private void updateTokenListForLocalHost(
      Session session,
      Map<String, String> tokensToHosts) {
    String queryString = "SELECT tokens FROM system.local;";
    ResultSet resultSet = session.execute(queryString);
    List<Row> results = resultSet.all();
    Preconditions.checkArgument(
        results.size() == 1,
        "Expected exactly one row in system.local, found %s", results.size());

    Set<String> tokens = results.get(0).getSet("tokens", String.class);

    updateTokenListForSingleNode("localhost", tokens, tokensToHosts);
  }

  /**
   * Update our map of tokens to master nodes by getting a list of tokens owned by peers.
   *
   * @param session An open Cassandra session.
   * @param tokensToHosts The map from tokens to master nodes to update.
   * @throws NullPointerException if a peer row has no {@code rpc_address}.
   * @throws IllegalArgumentException if a peer resolves to the host name {@code "localhost"}.
   */
  private void updateTokenListForPeers(
      Session session,
      Map<String, String> tokensToHosts) {

    String queryString = "SELECT rpc_address, tokens FROM system.peers;";
    ResultSet resultSet = session.execute(queryString);

    for (Row row : resultSet.all()) {
      Set<String> tokens = row.getSet("tokens", String.class);
      InetAddress rpcAddress = row.getInet("rpc_address");
      Preconditions.checkNotNull(rpcAddress, "Peer row in system.peers has no rpc_address");

      // getHostName() may perform a reverse name lookup; if that lookup fails it returns the
      // textual IP address instead.
      String hostName = rpcAddress.getHostName();

      // "localhost" is reserved for the tokens read from system.local.
      Preconditions.checkArgument(
          !hostName.equals("localhost"),
          "Peer %s unexpectedly resolved to localhost", rpcAddress);
      updateTokenListForSingleNode(hostName, tokens, tokensToHosts);
    }
  }

  /**
   * Given a host and a list of tokens for which the host is the master node, update our map of
   * tokens to master nodes.
   *
   * @param hostName The name of the host.
   * @param tokens A list of tokens for which the host is the master node.
   * @param tokensToHosts The map from tokens to master nodes to update.
   * @throws IllegalArgumentException if one of the tokens is already present in the map.
   */
  private void updateTokenListForSingleNode(
      String hostName,
      Set<String> tokens,
      Map<String, String> tokensToHosts) {

    LOG.debug("Got {} tokens for host {}", tokens.size(), hostName);

    // For every token, create an entry in the map from that token to the host.
    for (String token : tokens) {
      Preconditions.checkArgument(
          !tokensToHosts.containsKey(token),
          "Token %s is already owned by host %s", token, tokensToHosts.get(token));
      tokensToHosts.put(token, hostName);
    }
  }
}