/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.usergrid.persistence.graph.serialization.impl.shard.impl;


import java.util.Collections;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.usergrid.persistence.core.consistency.TimeService;
import org.apache.usergrid.persistence.core.scope.ApplicationScope;
import org.apache.usergrid.persistence.core.util.ValidationUtils;
import org.apache.usergrid.persistence.graph.GraphFig;
import org.apache.usergrid.persistence.graph.MarkedEdge;
import org.apache.usergrid.persistence.graph.SearchByEdgeType;
import org.apache.usergrid.persistence.graph.exception.GraphRuntimeException;
import org.apache.usergrid.persistence.graph.serialization.impl.shard.*;
import org.apache.usergrid.persistence.graph.serialization.util.GraphValidation;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.inject.Inject;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.connectionpool.exceptions.ConnectionException;
import com.netflix.astyanax.util.TimeUUIDUtils;


/**
 * Implementation of the node shard monitor and allocation
 */
public class NodeShardAllocationImpl implements NodeShardAllocation {

    private static final Logger logger = LoggerFactory.getLogger( NodeShardAllocationImpl.class );

    private final EdgeShardSerialization edgeShardSerialization;
    private final EdgeColumnFamilies edgeColumnFamilies;
    private final ShardedEdgeSerialization shardedEdgeSerialization;
    private final TimeService timeService;
    private final GraphFig graphFig;
    private final ShardGroupCompaction shardGroupCompaction;
    private final NodeShardCache nodeShardCache;


    @Inject
    public NodeShardAllocationImpl( final EdgeShardSerialization edgeShardSerialization,
                                    final EdgeColumnFamilies edgeColumnFamilies,
                                    final ShardedEdgeSerialization shardedEdgeSerialization,
                                    final TimeService timeService, final GraphFig graphFig,
                                    final ShardGroupCompaction shardGroupCompaction,
                                    final NodeShardCache nodeShardCache ) {
        this.edgeShardSerialization = edgeShardSerialization;
        this.edgeColumnFamilies = edgeColumnFamilies;
        this.shardedEdgeSerialization = shardedEdgeSerialization;
        this.timeService = timeService;
        this.graphFig = graphFig;
        this.shardGroupCompaction = shardGroupCompaction;
        this.nodeShardCache = nodeShardCache;
    }


    @Override
    public Iterator<ShardEntryGroup> getShards( final ApplicationScope scope,
                                                final DirectedEdgeMeta directedEdgeMeta ) {

        ValidationUtils.validateApplicationScope( scope );
        GraphValidation.validateDirectedEdgeMeta( directedEdgeMeta );

        Iterator<Shard> existingShards;

        //it's a new node, so we don't need to check Cassandra; no shards can exist for it yet
        if ( isNewNode( directedEdgeMeta ) ) {
            existingShards = Collections.singleton( Shard.MIN_SHARD ).iterator();
        }
        else {
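            //existing node: read any shard metadata previously persisted for this edge meta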
            existingShards = edgeShardSerialization.getShardMetaData( scope, Optional.absent(), directedEdgeMeta );

            /**
             * We didn't get anything out of Cassandra, so we need to create the minimum shard
             */
            if ( existingShards == null || !existingShards.hasNext() ) {

                final MutationBatch batch =
                        edgeShardSerialization.writeShardMeta( scope, Shard.MIN_SHARD, directedEdgeMeta );
                try {
                    batch.execute();
                }
                catch ( ConnectionException e ) {
                    throw new RuntimeException( "Unable to connect to Cassandra", e );
                }

                existingShards = Collections.singleton( Shard.MIN_SHARD ).iterator();
            }
        }

        return new ShardEntryGroupIterator( existingShards, graphFig.getShardMinDelta(), shardGroupCompaction, scope,
                directedEdgeMeta );
    }


    @Override
    public boolean auditShard( final ApplicationScope scope, final ShardEntryGroup shardEntryGroup,
                               final DirectedEdgeMeta directedEdgeMeta ) {

        ValidationUtils.validateApplicationScope( scope );
        GraphValidation.validateShardEntryGroup( shardEntryGroup );
        GraphValidation.validateDirectedEdgeMeta( directedEdgeMeta );

        Preconditions.checkNotNull( shardEntryGroup, "shardEntryGroup cannot be null" );

        /**
         * Nothing to do: the group is still compacting, so we don't create a new shard
         */
        if ( shardEntryGroup.isCompactionPending() ) {
            if ( logger.isTraceEnabled() ) {
                logger.trace( "Shard entry group {} is compacting, not auditing", shardEntryGroup );
            }
            return false;
        }

        //we can't allocate: we currently have more than 1 write shard, and we need to compact first
        if ( shardEntryGroup.entrySize() != 1 ) {
            if ( logger.isTraceEnabled() ) {
                logger.trace( "Shard entry group {} does not have 1 entry, not allocating", shardEntryGroup );
            }
            return false;
        }

        /**
         * Check the min shard in our system
         */
        final Shard shard = shardEntryGroup.getMinShard();

        final long minTime = getMinTime();

        if ( shard.getCreatedTime() >= minTime ) {
            if ( logger.isTraceEnabled() ) {
                logger.trace( "Shard {} in shard entry group {} was created after the minimum created time of {}. "
                        + "Not allocating", shard, shardEntryGroup, minTime );
            }
            return false;
        }

        /**
         * Check whether this shard holds enough edges to warrant an allocation
         */
        final long shardSize = graphFig.getShardSize();

        /**
         * We want to allocate a new shard as close to the max value as possible. This way, if we're filling up a
         * shard rapidly, we split it near the head of the values. Further checks on this group will result in more
         * splits, similar to building a tree and splitting each node.
         *
         * This also means the lower shard can be re-split later if it is still too large. The pivot selection below
         * truncates to a split point below our current max, approximately the pivot we would reach if we split from
         * the lower bound and moved forward. Doing this stops the current shard from expanding and avoids a state
         * where we can never compact down to the correct shard size.
         */

        /**
         * Allocate the shard
         */
        final Iterator<MarkedEdge> edges = directedEdgeMeta
                .loadEdges( shardedEdgeSerialization, edgeColumnFamilies, scope, Collections.singletonList( shard ),
                        0, SearchByEdgeType.Order.ASCENDING );

        if ( !edges.hasNext() ) {
            if ( logger.isTraceEnabled() ) {
                logger.trace( "Tried to allocate a new shard for edge meta data {}, but no max value could be found "
                        + "in that row", directedEdgeMeta );
            }
            return false;
        }

        MarkedEdge marked = null;

        /**
         * Advance to the pivot point we should use. Once the group is compacted, we can split again.
         * We either take the first edge (unlikely) or our total count minus the shard size.
         * If that difference is negative, we're approaching our max count for this shard, so the first
         * element will suffice.
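         *
         * As a concrete illustration (with a hypothetical shard size of 10,000): for a row holding 25,000 edges,
         * the loop below keeps the edges at positions 10,000 and 20,000, and the last one kept (position 20,000)
         * becomes the pivot, so at most one shard's worth of edges remains above it.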
         */
        long edgeCount = 0;

        for ( long i = 1; edges.hasNext(); i++ ) {

            //we hit a pivot position; keep this edge, since it could be the last pivot we encounter
            if ( i % shardSize == 0 ) {
                marked = edges.next();
            }
            else {
                edges.next();
            }

            edgeCount++;
        }

        /**
         * Sanity check in case we audit before we have a full shard
         */
        if ( marked == null ) {
            if ( logger.isTraceEnabled() ) {
                logger.trace( "Shard {} in shard group {} not full, not splitting. Edge count: {}", shard,
                        shardEntryGroup, edgeCount );
            }
            return false;
        }

        final long createTimestamp = timeService.getCurrentTime();

        final Shard newShard = new Shard( marked.getTimestamp(), createTimestamp, false );

        if ( logger.isTraceEnabled() ) {
            logger.trace( "Allocating new shard {} for edge meta {}", newShard, directedEdgeMeta );
        }

        final MutationBatch batch = this.edgeShardSerialization.writeShardMeta( scope, newShard, directedEdgeMeta );

        try {
            batch.execute();

            if ( logger.isTraceEnabled() ) {
                logger.trace( "Clearing shard cache" );
            }

            //invalidate the shard cache so we can be sure that all read shards are up to date
            nodeShardCache.invalidate( scope, directedEdgeMeta );
        }
        catch ( ConnectionException e ) {
            throw new RuntimeException( "Unable to connect to Cassandra", e );
        }

        return true;
    }


    @Override
    public long getMinTime() {

        final long minimumAllowed = ( long ) ( 2.5 * graphFig.getShardCacheTimeout() );

        final long minDelta = graphFig.getShardMinDelta();

        if ( minDelta < minimumAllowed ) {
            throw new GraphRuntimeException( String.format(
                    "You must configure the property %s to be >= 2.5 x %s. Otherwise you risk losing data",
                    GraphFig.SHARD_MIN_DELTA, GraphFig.SHARD_CACHE_TIMEOUT ) );
        }

        return timeService.getCurrentTime() - minDelta;
    }


    /**
     * Return true if the node was created within our cache timeout. If so, we don't need to check Cassandra;
     * we know no shards can exist for it yet.
     */
    private boolean isNewNode( DirectedEdgeMeta directedEdgeMeta ) {

        //The timeout is in milliseconds, while a time UUID tick is 100 nanoseconds (1/10000 of a milli), so we
        //rely on TimeUUIDUtils to convert the UUID timestamp into milliseconds before comparing
        final long timeoutDelta = graphFig.getShardCacheTimeout();

        final long timeNow = timeService.getCurrentTime();

        boolean isNew = true;

        for ( DirectedEdgeMeta.NodeMeta node : directedEdgeMeta.getNodes() ) {

            //short circuit if a previous node was old, or if this id is not a type 1 (time-based) UUID
            if ( !isNew || node.getId().getUuid().version() != 1 ) {
                return false;
            }

            final long uuidTime = TimeUUIDUtils.getTimeFromUUID( node.getId().getUuid() );

            final long newExpirationTimeout = uuidTime + timeoutDelta;

            //the node's creation time plus the timeout is still in the future, so treat it as new
            isNew = isNew && newExpirationTimeout > timeNow;
        }

        return isNew;
    }
}