package me.prettyprint.cassandra.service; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicLong; import com.google.common.collect.Lists; import me.prettyprint.hector.api.Keyspace; import me.prettyprint.hector.api.Serializer; import me.prettyprint.hector.api.beans.Row; import me.prettyprint.hector.api.beans.Rows; import me.prettyprint.hector.api.exceptions.HectorException; import me.prettyprint.hector.api.factory.HFactory; import me.prettyprint.hector.api.query.MultigetSliceQuery; import me.prettyprint.hector.api.query.QueryResult; /** * Iterates over the MultigetSliceQuery result set, refreshing until all * qualifying rows are retrieved based on input keys.  This iterator is * optimized for parallelism with the help of maxThreadCount option provided. If * maxThreadCount is not provided, it calls Cassandra with the * set(maxRowCountPerQuery) of row keys at a time unless all keys are queried. * E.g., maxRowCountPerQuery is 100 and maxThreadCount 5, it calls Cassandra 5 * times using 5 threads/ parallelism for total of 500 keys. You can also * configure it not to use Threads and call Cassandra 5 time sequentially * instead of parallelism by not setting maxThreadCount or setting it 0 * * * @author vchella * @param <K> * The type of the row key * @param <N> * Column name type * @param <V> * Column value type */ public class MultigetSliceIterator<K, N, V> implements Iterator<Row<K, N, V>> { /** * DEFAULT constant variable to store and use when maxColumnCountPerRow is * not specified */ private static final int DEFAULT_MAXCOL_COUNT = 100; /** * DEFAULT constant variable to store and use when maxRowCountPerQuery is * not specified */ private static final int DEFAULT_MAXROW_COUNT_PERQUERY = 0; /** * DEFAULT constant variable to store and use when maxThreadCount is not * specified */ private static final int DEFAULT_MAX_THREAD_COUNT = 0; /** * "Row" Iterator to hold the result of MultigetSliceQuery */ private Iterator<Row<K, N, V>> iterator; /** * "Start" key predicate to retrieve a list of columns in the range of * start-finish Either start and or finish can be null which will toggle the * underlying predicate to use an empty byte[] */ private N start; /** * "End" key predicate to retrieve a list of columns in the range of * start-finish Either start and or finish can be null which will toggle the * underlying predicate to use an empty byte[] */ private N finish; /** * Sets the return order of the columns to be reversed. NOTE: this is * slightly less efficient than reading in comparator order. */ private boolean reversed; /** * private internal variable to hold the current index in list of rows */ private int rowKeysIndex = 0; /** * Generic List to hold row keys "List<K>" which can be used in MultigetSliceQuery */ private List<List<K>> rowKeysList = new LinkedList<List<K>>(); /** * Generic List to hold "K" (keys) which are passed to * MultigetSliceIterator */ private List<K> rowKeys = new LinkedList<K>(); /** * internal variable to hold maxRowCountPerQuery which is passed to * MultigetSliceIterator. When it is defaulted to 0, all keys will be * queried at once. This setting gives the flexibility to limit the result * size to be in allowed limit of Thrift library. Use this if size of rows * which will be returned is greater than default * thrift_max_message_length_in_mb (16 MB) */ private int maxRowCountPerQuery = DEFAULT_MAXROW_COUNT_PERQUERY; /** * keyspace to be queried */ private Keyspace keyspace; /** * keySerializer to be used in query */ private Serializer<K> keySerializer; /** * nameSerializer to be used in query */ private Serializer<N> nameSerializer; /** * valueSerializer to be used in query */ private Serializer<V> valueSerializer; /** * keyspace to be queried */ private String columnFamily; /** * How long the operation took to execute in MICRO-seconds. */ private AtomicLong totalExecutionTimeMicro = new AtomicLong(0); /** * How long the operation took to execute in NANO-seconds. */ private AtomicLong totalExecutionTimeNano = new AtomicLong(0); /** * internal variable to hold maxThreadCount which is passed. It is defaulted * to 0 if none specified */ private int maxThreads = DEFAULT_MAX_THREAD_COUNT; /** * internal variable to hold maxColumnCountPerRow which is passed. It is * defaulted to 100. */ private int maxColumnCount = DEFAULT_MAXCOL_COUNT; /** * internal variable to hold thread count which is calculated through * prepareKeysForParallelism(). */ private int threadCount = DEFAULT_MAX_THREAD_COUNT; /** * internal variable to hold RowCountPerQuery, * calculated based on numThreads allowed and numKeys provided */ private int numKeysPerThread; /** * List of Hosts used for execution. This Map is synchronized for thread safety */ private Map<String,CassandraHost> m_hostsUsed = Collections.synchronizedMap(new HashMap<String, CassandraHost>()); /** * List<Rows<K,N,V>> to hold the result. This collection is synchronized for thread safety */ private List<Rows<K, N, V>> queryResult = Collections.synchronizedList(new LinkedList<Rows<K, N, V>>()); /** * Constructor with the required parameters. Below are default parameter values * int maxThreadCount = 0; //Disable parallelism * int maxRowCountPerQuery = 0; // Query all keys at a time. * int maxColumnCountPerRow = 100;// Limit columns count to 100 in each row * * @param reversed * @param maxColsCountPerQuery * @param maxColCount * @param maxRowCountPerQuery * @param keyspace * @param keySerializer * @param nameSerializer * @param valueSerializer */ public MultigetSliceIterator(boolean reversed, Keyspace keyspace, Serializer<K> keySerializer, Serializer<N> nameSerializer, Serializer<V> valueSerializer, String columnFamily, List<K> rowKeys, N start, N finish) { this(reversed, keyspace, keySerializer, nameSerializer, valueSerializer, columnFamily, rowKeys, start, finish, DEFAULT_MAX_THREAD_COUNT, DEFAULT_MAXROW_COUNT_PERQUERY, DEFAULT_MAXCOL_COUNT); } /** * Constructor with the required parameters Constructor with the required * parameters. Below are default parameter values * int maxThreadCount 0; //Disable parallelism * int maxRowCountPerQuery = 0; // Query all keys at a time. * * @param reversed * @param maxColsCountPerQuery * @param maxColCount * @param maxRowCountPerQuery * @param keyspace * @param keySerializer * @param nameSerializer * @param valueSerializer */ public MultigetSliceIterator(boolean reversed, Keyspace keyspace, Serializer<K> keySerializer, Serializer<N> nameSerializer, Serializer<V> valueSerializer, String columnFamily, List<K> rowKeys, N start, N finish, int maxColumnCountPerRow) { this(reversed, keyspace, keySerializer, nameSerializer, valueSerializer, columnFamily, rowKeys, start, finish, DEFAULT_MAX_THREAD_COUNT, DEFAULT_MAXROW_COUNT_PERQUERY, maxColumnCountPerRow); } /** * Constructor with the required parameters. Below are default parameter values * int maxThreadCount = 0; //Disable parallelism * * @param reversed * @param maxColsCountPerQuery * @param maxColCount * @param maxRowCountPerQuery * @param keyspace * @param keySerializer * @param nameSerializer * @param valueSerializer */ public MultigetSliceIterator(boolean reversed, int maxRowCountPerQuery, Keyspace keyspace, Serializer<K> keySerializer, Serializer<N> nameSerializer, Serializer<V> valueSerializer, String columnFamily, List<K> rowKeys, N start, N finish, int maxColumnCountPerRow) { this(reversed, keyspace, keySerializer, nameSerializer, valueSerializer, columnFamily, rowKeys, start, finish, DEFAULT_MAX_THREAD_COUNT, maxRowCountPerQuery, maxColumnCountPerRow); } /** * Constructor with the required parameters. Below are default parameter values * int maxThreadCount = 0; //Disable parallelism * int maxColumnCountPerRow = 100;// Limit columns count to 100 in each row * * @param reversed * @param maxColsCountPerQuery * @param maxColCount * @param maxRowCountPerQuery * @param keyspace * @param keySerializer * @param nameSerializer * @param valueSerializer */ public MultigetSliceIterator(boolean reversed, int maxRowCountPerQuery, Keyspace keyspace, Serializer<K> keySerializer, Serializer<N> nameSerializer, Serializer<V> valueSerializer, String columnFamily, List<K> rowKeys, N start, N finish) { this(reversed, keyspace, keySerializer, nameSerializer, valueSerializer, columnFamily, rowKeys, start, finish, DEFAULT_MAX_THREAD_COUNT, maxRowCountPerQuery, DEFAULT_MAXCOL_COUNT); } /** * Constructor with all required parameters. No default values will be used * * @param reversed * @param maxColsCountPerQuery * @param maxColCount * @param maxRowCountPerQuery * @param keyspace * @param keySerializer * @param nameSerializer * @param valueSerializer */ public MultigetSliceIterator(boolean reversed, Keyspace keyspace, Serializer<K> keySerializer, Serializer<N> nameSerializer, Serializer<V> valueSerializer, String columnFamily, List<K> rowKeys, N start, N finish, int maxThreadCount, int maxRowCountPerQuery, int maxColumnCountPerRow) { this.reversed = reversed; this.maxRowCountPerQuery = maxRowCountPerQuery; this.keyspace = keyspace; this.keySerializer = keySerializer; this.nameSerializer = nameSerializer; this.valueSerializer = valueSerializer; this.columnFamily = columnFamily; this.start = start; this.finish = finish; this.rowKeys = rowKeys; this.maxColumnCount = maxColumnCountPerRow; this.maxThreads = maxThreadCount; this.rowKeysList = prepareKeysForParallelism(); } /** * This method prepares keys for execution, determines whether to use * parallelism or not to query Cassandra, executes the query and collects the result */ private void runQuery() { if(this.rowKeysList != null && this.rowKeysList.size() > 0) { // Check if there are rowkeys to query Cassandra if (threadCount > 1) { // When thread count greater than 1 enables parallelism, use threads to query Cassandra // multiple times ExecutorService executor = Executors .newFixedThreadPool(threadCount); List<Future<?>> futures = new LinkedList<Future<?>>(); for (final List<K> param : this.rowKeysList) { Future<?> future = executor.submit(new Runnable() { public void run() { // Query Cassandra with the input keys provided runMultigetSliceQuery(param); } }); futures.add(future); } for (Future<?> f : futures) {// iterate through thread results try { f.get(); // wait for thread to complete } catch (InterruptedException e) { throw new HectorException("Failed to retrieve rows from Cassandra.",e); } catch (ExecutionException e) { throw new HectorException("Failed to retrieve rows from Cassandra.",e); } } // Safe to shutdown the threadpool and release the resources executor.shutdown(); // set the rowKeysIndex to size of input keys so as no further calls // will be made to Cassandra. // This ensures iterator.hasNext() returns false when all keys are // queried rowKeysIndex = this.rowKeysList.size(); } else {// When thread count less than or equal to 1 (0 or negative) disables // parallelism, set of(maxRowCountPerQuery) keys queries // Cassandra at a time runMultigetSliceQuery(this.rowKeysList.get(rowKeysIndex)); // Increment the rowKeyIndex instead of setting it to this.rowKeysList.size(); rowKeysIndex++; } } ArrayList<Row<K, N, V>> resultList = new ArrayList<Row<K, N, V>>(queryResult.size()); synchronized (queryResult) { // Ensure that runMultigetSliceQuery() method call updates global // variable queryResult with query result (if exists) if (queryResult != null && queryResult.size() > 0) { for (Rows<K, N, V> rows : queryResult) { if (rows != null && rows.getCount() > 0) { for (Row<K, N, V> row : rows) { // prepare List<Row<K, N, V>> to return // the iterator of <Row<K,N,V>> to the caller resultList.add(row); } } } } } // assign global iterator with the result of multigetSliceQuery iterator = resultList.iterator(); } /** * Execute MultigetSliceQuery with the set of (maxRowCountPerQuery) keys * provided and keep the result in global variable queryResult (List<Rows<K, * N, V>>) * * @param param */ private void runMultigetSliceQuery(final List<K> param) { MultigetSliceQuery<K, N, V> multigetSliceQuery = HFactory .createMultigetSliceQuery(keyspace, keySerializer, nameSerializer, valueSerializer); multigetSliceQuery.setColumnFamily(columnFamily); multigetSliceQuery.setKeys(param); multigetSliceQuery.setRange(start, finish, reversed, maxColumnCount); QueryResult<Rows<K, N, V>> result = multigetSliceQuery.execute(); queryResult.add(result.get()); // Add current query execution time to internal variable // totalExecutionTimeMicro. When parallelism is enabled, this value might not be correct // always due to available system resources and thread implementation totalExecutionTimeMicro.addAndGet(result.getExecutionTimeMicro()); // Add current query execution time to internal variable // totalExecutionTimeNano. When parallelism is enabled, this value might not be correct // always due to available system resources and thread implementation totalExecutionTimeNano.addAndGet(result.getExecutionTimeNano()); //Add host used to the list m_hostsUsed.put(result.getHostUsed().getIp(), result.getHostUsed()); } @Override public boolean hasNext() { if (iterator == null) {// if iterator is null, call runQuery runQuery(); } else if (!iterator.hasNext() && rowKeysIndex < this.rowKeysList.size()) { // only need to do another query if all keys were not queried retrieved runQuery(); } return iterator.hasNext(); } @Override public Row<K, N, V> next() { return iterator.next(); } @Override public void remove() { iterator.remove(); } /** * How long the operation took to execute in MICRO-seconds. When parallelism * is enabled, this value might not be correct always due to available system resources and thread implementation * * @return the totalExecutionTimeMicro */ public long getTotalExecutionTimeMicro() { return this.totalExecutionTimeMicro.get(); } /** * How long the operation took to execute in NANO-seconds. When parallelism * is enabled, this value might not be correct always due to available system resources and thread implementation * * @return the totalExecutionTimeNano */ public long getTotalExecutionTimeNano() { return this.totalExecutionTimeNano.get(); } /** * The {@link CassandraHost} on which this operation * was successful */ public String getHostsUsed() { String hostsUsed= new String(); StringBuilder strBldr = new StringBuilder(); Set<Entry<String,CassandraHost>> se= m_hostsUsed.entrySet(); for (Entry<String, CassandraHost> entry : se) { strBldr.append(entry.getValue().toString()); strBldr.append(';'); } if(se.size()>0 && strBldr.length()>0) { hostsUsed=strBldr.substring(0, strBldr.length()-1); } return hostsUsed; } /** * Number of threads used to call Cassandra * @return Thread count used */ public int getThreadCountUsed() { return threadCount; } /** * Returns the RowCount per query used in this operation * @return */ public int getRowCountPerQueryUsed() { return numKeysPerThread; } /** * prepare row Keys For Parallelism by considering maxRowCountPerQuery, * m_maxThreads and numKeys * * @param m_maxThreads * @return */ private List<List<K>> prepareKeysForParallelism() { // Calculate the number of row keys to be queried at a time. // Consider whether parallelism is enabled or not. // When numThreads is calculated, the int truncation // causes one fewer thread to be used if numKeys isn't evenly divided // into maxRowCountPerQuery. Thus, // the keys are divided among threads/ calls evenly with each thread // getting up to maxRowCountPerQuery List<List<K>> returnKeys = new LinkedList<List<K>>(); int numKeys = rowKeys.size(); // Calculate how many thread are required based on input # keys and each // time rowkeys limit of maxRowCountPerQuery int numThreads = 1; if(maxRowCountPerQuery>0){ numThreads=(int) Math.ceil((numKeys / (double) maxRowCountPerQuery)); numThreads= Math.max(numThreads, 1); } // if number of threads required is more than the maximum limit of // allowable threads then cap the number of threads threadCount=Math.min(numThreads, maxThreads); // We get the ceiling of numKeys/numThreads in order to spread out the // row keys evenly. // e.g. if numKeys=101 and maxRowCountPerQuery=50, it makes set of 34 // keys to be queried at a time. numThreads= Ceil(101/50)=>3 numKeysPerThread = (int) Math .ceil(numKeys / (double) numThreads); // Default it to 1, so that all keys will be passed once, instead of breaking it numKeysPerThread=Math.max(numKeysPerThread, 1); //Check if there are any rowkeys if(this.rowKeys!=null && this.rowKeys.size()>0) { // split keys into subsets based on the above calculation returnKeys = Lists.partition(rowKeys, numKeysPerThread); } return returnKeys; } }