/* * Copyright 2013 Jive Software, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.jivesoftware.os.amza.service; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.jivesoftware.os.amza.api.DeltaOverCapacityException; import com.jivesoftware.os.amza.api.FailedToAchieveQuorumException; import com.jivesoftware.os.amza.api.partition.Consistency; import com.jivesoftware.os.amza.api.partition.PartitionName; import com.jivesoftware.os.amza.api.partition.PartitionProperties; import com.jivesoftware.os.amza.api.ring.RingMember; import com.jivesoftware.os.amza.api.scan.RowsChanged; import com.jivesoftware.os.amza.api.stream.ClientUpdates; import com.jivesoftware.os.amza.api.stream.KeyValueStream; import com.jivesoftware.os.amza.api.stream.PrefixedKeyRanges; import com.jivesoftware.os.amza.api.stream.TxKeyValueStream; import com.jivesoftware.os.amza.api.stream.TxKeyValueStream.TxResult; import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys; import com.jivesoftware.os.amza.api.take.Highwaters; import com.jivesoftware.os.amza.api.take.TakeResult; import com.jivesoftware.os.amza.api.wal.WALHighwater; import com.jivesoftware.os.amza.api.wal.WALUpdated; import com.jivesoftware.os.amza.service.partition.VersionedPartitionProvider; import com.jivesoftware.os.amza.service.replication.PartitionStripeProvider; import com.jivesoftware.os.amza.service.stats.AmzaStats; import com.jivesoftware.os.amza.service.take.HighwaterStorage; import com.jivesoftware.os.amza.service.take.TakeCoordinator; import com.jivesoftware.os.aquarium.LivelyEndState; import com.jivesoftware.os.aquarium.State; import com.jivesoftware.os.jive.utils.ordered.id.OrderIdProvider; import com.jivesoftware.os.mlogger.core.MetricLogger; import com.jivesoftware.os.mlogger.core.MetricLoggerFactory; import java.util.Set; public class StripedPartition implements Partition { private static final MetricLogger LOG = MetricLoggerFactory.getLogger(); private final AmzaStats amzaStats; private final OrderIdProvider orderIdProvider; private final VersionedPartitionProvider versionedPartitionProvider; private final WALUpdated walUpdated; private final RingMember ringMember; private final PartitionName partitionName; private final PartitionStripeProvider partitionStripeProvider; private final AckWaters ackWaters; private final AmzaRingStoreReader ringReader; private final TakeFullySystemReady systemReady; private final TakeCoordinator takeCoordinator; public StripedPartition(AmzaStats amzaStats, OrderIdProvider orderIdProvider, VersionedPartitionProvider versionedPartitionProvider, WALUpdated walUpdated, RingMember ringMember, PartitionName partitionName, PartitionStripeProvider partitionStripeProvider, AckWaters ackWaters, AmzaRingStoreReader ringReader, TakeFullySystemReady systemReady, TakeCoordinator takeCoordinator) { this.amzaStats = amzaStats; this.orderIdProvider = orderIdProvider; this.versionedPartitionProvider = versionedPartitionProvider; this.walUpdated = walUpdated; this.ringMember = ringMember; this.partitionName = partitionName; this.partitionStripeProvider = partitionStripeProvider; this.ackWaters = ackWaters; this.ringReader = ringReader; this.systemReady = systemReady; this.takeCoordinator = takeCoordinator; } public PartitionName getPartitionName() { return partitionName; } @Override public void commit(Consistency consistency, byte[] prefix, ClientUpdates updates, long timeoutInMillis) throws Exception { long end = System.currentTimeMillis() + timeoutInMillis; systemReady.await(timeoutInMillis); if (System.currentTimeMillis() > end) { throw new FailedToAchieveQuorumException("Timed out waiting for system ready"); } PartitionProperties properties = versionedPartitionProvider.getProperties(partitionName); if (properties.requireConsistency && !properties.consistency.supportsWrites(consistency)) { throw new FailedToAchieveQuorumException("This partition has a minimum consistency of " + properties.consistency + " which does not support writes at consistency " + consistency); } Set<RingMember> neighbors = ringReader.getNeighboringRingMembers(partitionName.getRingName(), 0); int takeQuorum = consistency.quorum(neighbors.size()); if (neighbors.size() < takeQuorum) { throw new FailedToAchieveQuorumException("There are an insufficent number of nodes to achieve desired take quorum:" + takeQuorum); } while (true) { try { long currentTime = System.currentTimeMillis(); long version = orderIdProvider.nextId(); partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { long leadershipToken = -1; if (takeQuorum > 0) { LivelyEndState livelyEndState = versionedAquarium.getLivelyEndState(); if (consistency.requiresLeader() && (!livelyEndState.isOnline() || livelyEndState.getCurrentState() != State.leader)) { throw new FailedToAchieveQuorumException("Leader has changed."); } leadershipToken = livelyEndState.getLeaderWaterline().getTimestamp(); } RowsChanged commit = txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { return partitionStripe.commit(highwaterStorage, versionedAquarium, true, Optional.absent(), true, prefix, (highwaters, stream) -> updates.updates((key, value, valueTimestamp, valueTombstone) -> { long timestamp = valueTimestamp > 0 ? valueTimestamp : currentTime; return stream.row(-1L, key, value, timestamp, valueTombstone, version); }), walUpdated); }); if (takeQuorum > 0) { long timeToWait = Math.max(0, end - System.currentTimeMillis()); LOG.debug("Awaiting quorum for {} ms", timeToWait); int takenBy = 0; if (timeToWait > 0) { takenBy = ackWaters.await(versionedAquarium.getVersionedPartitionName(), commit.getLargestCommittedTxId(), neighbors, takeQuorum, timeToWait, leadershipToken, takeCoordinator); } if (takenBy < takeQuorum) { throw new FailedToAchieveQuorumException( "Timed out attempting to achieve desired take quorum:" + takeQuorum + " got:" + takenBy); } } //TODO necessary? aquarium.tapTheGlass(); amzaStats.direct(partitionName, commit.getApply().size(), commit.getSmallestCommittedTxId()); return null; }); break; } catch (DeltaOverCapacityException e) { long timeRemaining = end - System.currentTimeMillis(); if (timeRemaining <= 0) { throw e; } Thread.sleep(Math.min(timeRemaining, 1000L)); //TODO magic number } } long fsyncWaitInMillis = Math.max(end - System.currentTimeMillis(), 0); if (fsyncWaitInMillis > 0) { partitionStripeProvider.flush(partitionName, properties.durability, fsyncWaitInMillis); } else { throw new FailedToAchieveQuorumException("Timed out before commit achieved durability:" + properties.durability); } } private void checkReadConsistencySupport(Consistency consistency) throws Exception { PartitionProperties properties = versionedPartitionProvider.getProperties(partitionName); if (properties.requireConsistency && !properties.consistency.supportsReads(consistency)) { throw new FailedToAchieveQuorumException("This partition has a minimum consistency of " + properties.consistency + " which does not support reads at consistency " + consistency); } } @Override public boolean get(Consistency consistency, byte[] prefix, boolean requiresOnline, UnprefixedWALKeys keys, KeyValueStream stream) throws Exception { systemReady.await(0); checkReadConsistencySupport(consistency); return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { return partitionStripe.get(versionedAquarium, prefix, requiresOnline, keys, stream); }); }); } @Override public boolean scan(PrefixedKeyRanges ranges, boolean hydrateValues, boolean requiresOnline, KeyValueStream stream) throws Exception { systemReady.await(0); return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { return ranges.consume((fromPrefix, fromKey, toPrefix, toKey) -> { if (fromKey == null && toKey == null) { partitionStripe.rowScan(versionedAquarium, stream, hydrateValues, requiresOnline); } else { partitionStripe.rangeScan(versionedAquarium, fromPrefix, fromKey, toPrefix, toKey, hydrateValues, requiresOnline, stream); } return true; }); }); }); } @Override public TakeResult takeFromTransactionId(long txId, boolean requiresOnline, Highwaters highwaters, TxKeyValueStream stream) throws Exception { systemReady.await(0); return takeFromTransactionIdInternal(false, null, txId, requiresOnline, highwaters, stream); } @Override public TakeResult takePrefixFromTransactionId(byte[] prefix, long txId, boolean requiresOnline, Highwaters highwaters, TxKeyValueStream stream) throws Exception { Preconditions.checkNotNull(prefix, "Must specify a prefix"); systemReady.await(0); return takeFromTransactionIdInternal(true, prefix, txId, requiresOnline, highwaters, stream); } private TakeResult takeFromTransactionIdInternal(boolean usePrefix, byte[] takePrefix, long txId, boolean requiresOnline, Highwaters highwaters, TxKeyValueStream stream) throws Exception { return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { long[] lastTxId = { -1 }; TxResult[] done = new TxResult[1]; TxKeyValueStream txKeyValueStream = (rowTxId, prefix, key, value, valueTimestamp, valueTombstone, valueVersion) -> { if (done[0] != null && rowTxId > lastTxId[0]) { // streamed to end of txId return done[0]; } if (done[0] != null) { if (done[0].isAccepted()) { // ignore result; lastTxId is unchanged stream.stream(rowTxId, prefix, key, value, valueTimestamp, valueTombstone, valueVersion); } } else { TxResult result = stream.stream(rowTxId, prefix, key, value, valueTimestamp, valueTombstone, valueVersion); if (result.isAccepted()) { if (rowTxId > lastTxId[0]) { lastTxId[0] = rowTxId; } } if (!result.wantsMore()) { if (result.isAccepted()) { // stream to end of txId done[0] = result; } else { // reject entire txId return result; } } } return TxResult.MORE; }; WALHighwater highwater = txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { if (usePrefix) { return partitionStripe.takeFromTransactionId(versionedAquarium, takePrefix, txId, requiresOnline, highwaterStorage, highwaters, txKeyValueStream); } else { return partitionStripe.takeFromTransactionId(amzaStats.takeIoStats, versionedAquarium, txId, requiresOnline, highwaterStorage, highwaters, txKeyValueStream); } }); return new TakeResult(ringMember, lastTxId[0], highwater); }); } @Override public long count() throws Exception { return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { return partitionStripe.count(versionedAquarium); }); }); } @Override public long approximateCount() throws Exception { return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { return partitionStripe.approximateCount(versionedAquarium); }); }); } /*@Override public long highestTxId() throws Exception { return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> { long highestTxId = partitionStripe.highestTxId(versionedAquarium.getVersionedPartitionName()); return highestTxId == HighwaterStorage.LOCAL_NONE ? -1 : highestTxId; }); }); }*/ @Override public LivelyEndState livelyEndState() throws Exception { return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> { return versionedAquarium.getLivelyEndState(); }); } }