/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.usergrid.persistence.core.astyanax; import java.util.*; import com.google.common.base.Optional; import org.apache.usergrid.persistence.core.shard.SmartShard; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.netflix.astyanax.Keyspace; import com.netflix.astyanax.connectionpool.exceptions.ConnectionException; import com.netflix.astyanax.model.Column; import com.netflix.astyanax.model.ColumnFamily; import com.netflix.astyanax.model.ColumnList; import com.netflix.astyanax.model.ConsistencyLevel; import com.netflix.astyanax.model.Rows; import com.netflix.astyanax.query.RowSliceQuery; import com.netflix.astyanax.util.RangeBuilder; /** * * */ public class MultiRowShardColumnIterator<R, C, T> implements Iterator<T> { private static final Logger logger = LoggerFactory.getLogger( MultiRowShardColumnIterator.class ); private final int pageSize; private final ColumnFamily<R, C> cf; private final ColumnParser<C, T> columnParser; private final ColumnSearch<T> columnSearch; private final Comparator<T> comparator; private final Keyspace keyspace; private final ConsistencyLevel consistencyLevel; private T startColumn; private boolean moreToReturn; private Iterator<T> currentColumnIterator; private Iterator<SmartShard> currentShardIterator; private List<SmartShard> rowKeysWithShardEnd; private SmartShard currentShard; private List<T> resultsTracking; // use for de-duping results that are possible during shard transition private int skipSize = 0; // used for determining if we've skipped a whole page during shard transition private boolean ascending = false; private Optional<Long> lastTimestamp; public MultiRowShardColumnIterator( final Keyspace keyspace, final ColumnFamily<R, C> cf, final ConsistencyLevel consistencyLevel, final ColumnParser<C, T> columnParser, final ColumnSearch<T> columnSearch, final Comparator<T> comparator, final int pageSize, final List<SmartShard> rowKeysWithShardEnd, final boolean ascending, final Optional<Long> lastTimestamp) { this.cf = cf; this.pageSize = pageSize; this.columnParser = columnParser; this.columnSearch = columnSearch; this.comparator = comparator; this.keyspace = keyspace; this.consistencyLevel = consistencyLevel; this.moreToReturn = true; this.rowKeysWithShardEnd = rowKeysWithShardEnd; this.resultsTracking = new ArrayList<>(); this.ascending = ascending; this.lastTimestamp = lastTimestamp; } @Override public boolean hasNext() { // if column iterator is null, initialize with first call to advance() // advance if we know there more columns exist in the current shard but we've exhausted this page fetch from c* if ( currentColumnIterator == null || ( !currentColumnIterator.hasNext() && moreToReturn ) ) { advance(); } // when there are no more columns, nothing reported to return, but more shards available, go to the next shard if( currentColumnIterator != null && !currentColumnIterator.hasNext() && !moreToReturn && currentShardIterator.hasNext()){ if(logger.isTraceEnabled()){ logger.trace("Advancing shard iterator"); logger.trace("Shard before advance: {}", currentShard); } // advance to the next shard currentShard = currentShardIterator.next(); // handle marked deleted shards while ( currentShard.isDeleted() && currentShardIterator.hasNext()){ if(logger.isTraceEnabled()) { logger.trace("Shard is marked deleted, advancing to next - {}", currentShard); } currentShard = currentShardIterator.next(); } // if the last shard is deleted, return false, there is no next to seek if ( currentShard.isDeleted() && !currentShardIterator.hasNext()){ if(logger.isTraceEnabled()) { logger.trace("Shard is marked deleted, and there are no more shards - {}", currentShard); } return false; } if(logger.isTraceEnabled()){ logger.trace("Shard after advance: {}", currentShard); } advance(); } return currentColumnIterator.hasNext(); } @Override public T next() { if ( !hasNext() ) { throw new NoSuchElementException( "No new element exists" ); } final T next = currentColumnIterator.next(); return next; } @Override public void remove() { throw new UnsupportedOperationException( "Remove is unsupported this is a read only iterator" ); } public void advance() { if (logger.isTraceEnabled()) logger.trace( "Advancing multi row column iterator" ); /** * If the edge is present, we need to being seeking from this */ final boolean skipFirstColumn = startColumn != null; final int selectSize = skipFirstColumn ? pageSize + 1 : pageSize; final RangeBuilder rangeBuilder = new RangeBuilder(); SmartShard startShard = null; if(currentShardIterator == null){ // create a copy that we use to search for our 'starting shard' final List<SmartShard> shards = new ArrayList<>(rowKeysWithShardEnd); // flip the order of our shards if ascending if(ascending){ Collections.reverse(rowKeysWithShardEnd); } if(lastTimestamp.isPresent()) { //always seek from 0 to find out where our cursor last should fall Collections.reverse(shards); for ( SmartShard shard : shards){ if ( lastTimestamp.get().compareTo(shard.getShardIndex()) > 0) { startShard = shard; } } } currentShardIterator = rowKeysWithShardEnd.iterator(); } if(currentShard == null){ if(logger.isTraceEnabled()){ logger.trace("currentShard: {}", currentShard); } currentShard = currentShardIterator.next(); if (startShard != null){ while(!currentShard.equals(startShard)){ currentShard = currentShardIterator.next(); } } // skip over shards that are marked deleted while ( currentShard.isDeleted() && currentShardIterator.hasNext() ){ if(logger.isTraceEnabled()){ logger.trace("Shard is marked deleted - {}", currentShard); } currentShard = currentShardIterator.next(); } if(logger.isTraceEnabled()){ logger.trace("all shards when starting: {}", rowKeysWithShardEnd); logger.trace("initializing iterator with shard: {}", currentShard); } } // initial request, build the range with no start and no end if ( startColumn == null && currentShard.getShardEnd() == null ){ columnSearch.buildRange( rangeBuilder ); if(logger.isTraceEnabled()){ logger.trace("initial search (no start or shard end)"); } } // if there's only a startColumn set the range start startColumn always else if ( startColumn != null && currentShard.getShardEnd() == null ){ columnSearch.buildRange( rangeBuilder, startColumn, null ); if(logger.isTraceEnabled()){ logger.trace("search (no shard end) with start: {}", startColumn); } } // if there's only a shardEnd, set the start/end according based on the search order else if ( startColumn == null && currentShard.getShardEnd() != null ){ T shardEnd = (T) currentShard.getShardEnd(); // if we have a shardEnd and it's not an ascending search, use the shardEnd as a start if(!ascending) { columnSearch.buildRange(rangeBuilder, shardEnd, null); if(logger.isTraceEnabled()){ logger.trace("search descending with start: {}", shardEnd); } } // if we have a shardEnd and it is an ascending search, use the shardEnd as the end else{ columnSearch.buildRange( rangeBuilder, null, shardEnd ); if(logger.isTraceEnabled()){ logger.trace("search ascending with end: {}", shardEnd); } } } // if there's both a startColumn and a shardEnd, decide which should be used as start/end based on search order else if ( startColumn != null && currentShard.getShardEnd() != null) { T shardEnd = (T) currentShard.getShardEnd(); // if the search is not ascending, set the start to be the older edge if(!ascending){ T searchStart = comparator.compare(shardEnd, startColumn) > 0 ? shardEnd : startColumn; columnSearch.buildRange( rangeBuilder, searchStart, null); if(logger.isTraceEnabled()){ logger.trace("search descending with start: {} in shard", searchStart, currentShard); } } // if the search is ascending, then always use the startColumn for the start and shardEnd for the range end else{ columnSearch.buildRange( rangeBuilder, startColumn , shardEnd); if(logger.isTraceEnabled()){ logger.trace("search with start: {}, end: {}", startColumn, shardEnd); } } } rangeBuilder.setLimit( selectSize ); if (logger.isTraceEnabled()) logger.trace( "Executing cassandra query with shard {}", currentShard ); /** * Get our list of slices */ final RowSliceQuery<R, C> query = keyspace.prepareQuery( cf ).setConsistencyLevel( consistencyLevel ).getKeySlice( (R) currentShard.getRowKey() ) .withColumnRange( rangeBuilder.build() ); final Rows<R, C> result; try { result = query.execute().getResult(); } catch ( ConnectionException e ) { throw new RuntimeException( "Unable to connect to casandra", e ); } final List<T> mergedResults; skipSize = 0; mergedResults = processResults( result, selectSize ); if(logger.isTraceEnabled()){ logger.trace("skipped amount: {}", skipSize); } final int size = mergedResults.size(); if(logger.isTraceEnabled()){ logger.trace("current shard: {}, retrieved size: {}", currentShard, size); logger.trace("selectSize={}, size={}, ", selectSize, size); } moreToReturn = size == selectSize; if(selectSize == 1001 && mergedResults.size() == 1000){ moreToReturn = true; } // if a whole page is skipped OR the result size equals the the difference of what's skipped, // it is likely during a shard transition and we should assume there is more to read if( skipSize == selectSize || skipSize == selectSize - 1 || size == selectSize - skipSize || size == (selectSize -1) - skipSize ){ moreToReturn = true; } //we have a first column to to check if( size > 0) { final T firstResult = mergedResults.get( 0 ); //The search has either told us to skip the first element, or it matches our last, therefore we disregard it if(columnSearch.skipFirst( firstResult ) || (skipFirstColumn && comparator.compare( startColumn, firstResult ) == 0)){ if(logger.isTraceEnabled()){ logger.trace("removing an entry"); } mergedResults.remove( 0 ); } } // set the start column for the enxt query if(moreToReturn && mergedResults.size() > 0){ startColumn = mergedResults.get( mergedResults.size() - 1 ); } currentColumnIterator = mergedResults.iterator(); //force an advance of this iterator when there are still shards to read but result set on current shard is 0 if(size == 0 && currentShardIterator.hasNext()){ hasNext(); } if(logger.isTraceEnabled()){ logger.trace("currentColumnIterator.hasNext()={}, " + "moreToReturn={}, currentShardIterator.hasNext()={}", currentColumnIterator.hasNext(), moreToReturn, currentShardIterator.hasNext()); } } /** * Process the result set and filter any duplicates that may have already been seen in previous shards. During * a shard transition, there could be the same columns in multiple shards (rows). This will also allow for * filtering the startColumn (the seek starting point) when paging a row in Cassandra. * * @param result * @return */ private List<T> processResults(final Rows<R, C> result, final int maxSize ) { final List<T> mergedResults = new ArrayList<>(maxSize); for ( final R key : result.getKeys() ) { final ColumnList<C> columns = result.getRow( key ).getColumns(); for (final Column<C> column :columns ) { final T returnedValue = columnParser.parseColumn( column ); // use an O(log n) search, same as a tree, but with fast access to indexes for later operations int searchIndex = Collections.binarySearch( resultsTracking, returnedValue, comparator ); //we've already seen the column, filter it out as we might be in a shard transition or our start column if(searchIndex > -1){ if(logger.isTraceEnabled()){ logger.trace("skipping column as it was already retrieved before"); } skipSize++; continue; } resultsTracking.add(returnedValue); mergedResults.add(returnedValue ); } if (logger.isTraceEnabled()) logger.trace( "Candidate result set size is {}", mergedResults.size() ); } return mergedResults; } }