/*
* Copyright 2013 Jive Software, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.jivesoftware.os.amza.service;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.jivesoftware.os.amza.api.AmzaInterner;
import com.jivesoftware.os.amza.api.partition.PartitionName;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.ring.RingHost;
import com.jivesoftware.os.amza.api.ring.RingMember;
import com.jivesoftware.os.amza.api.scan.RowChanges;
import com.jivesoftware.os.amza.api.wal.WALUpdated;
import com.jivesoftware.os.amza.service.TakeFullySystemReady.SystemRingSizeProvider;
import com.jivesoftware.os.amza.service.filer.DirectByteBufferFactory;
import com.jivesoftware.os.amza.service.replication.AmzaAquariumProvider;
import com.jivesoftware.os.amza.service.replication.AmzaAquariumProvider.AmzaLivelinessStorage;
import com.jivesoftware.os.amza.service.replication.AsyncStripeFlusher;
import com.jivesoftware.os.amza.service.replication.PartitionBackedHighwaterStorage;
import com.jivesoftware.os.amza.service.replication.PartitionComposter;
import com.jivesoftware.os.amza.service.replication.PartitionStripeProvider;
import com.jivesoftware.os.amza.service.replication.PartitionTombstoneCompactor;
import com.jivesoftware.os.amza.service.replication.RowChangeTaker;
import com.jivesoftware.os.amza.service.replication.StorageVersionProvider;
import com.jivesoftware.os.amza.service.replication.StripedPartitionCommitChanges;
import com.jivesoftware.os.amza.service.replication.SystemPartitionCommitChanges;
import com.jivesoftware.os.amza.service.replication.TakeFailureListener;
import com.jivesoftware.os.amza.service.ring.CacheId;
import com.jivesoftware.os.amza.service.ring.RingSet;
import com.jivesoftware.os.amza.service.ring.RingTopology;
import com.jivesoftware.os.amza.service.stats.AmzaStats;
import com.jivesoftware.os.amza.service.storage.PartitionCreator;
import com.jivesoftware.os.amza.service.storage.PartitionIndex;
import com.jivesoftware.os.amza.service.storage.PartitionPropertyMarshaller;
import com.jivesoftware.os.amza.service.storage.SystemWALStorage;
import com.jivesoftware.os.amza.service.storage.binary.BinaryHighwaterRowMarshaller;
import com.jivesoftware.os.amza.service.storage.binary.BinaryPrimaryRowMarshaller;
import com.jivesoftware.os.amza.service.storage.binary.BinaryRowIOProvider;
import com.jivesoftware.os.amza.service.storage.binary.MemoryBackedRowIOProvider;
import com.jivesoftware.os.amza.service.storage.binary.RowIOProvider;
import com.jivesoftware.os.amza.service.storage.delta.DeltaStripeWALStorage;
import com.jivesoftware.os.amza.service.storage.delta.DeltaWALFactory;
import com.jivesoftware.os.amza.service.take.AvailableRowsTaker;
import com.jivesoftware.os.amza.service.take.HighwaterStorage;
import com.jivesoftware.os.amza.service.take.RowsTakerFactory;
import com.jivesoftware.os.amza.service.take.TakeCoordinator;
import com.jivesoftware.os.aquarium.AquariumStats;
import com.jivesoftware.os.aquarium.Liveliness;
import com.jivesoftware.os.aquarium.Member;
import com.jivesoftware.os.aquarium.interfaces.AtQuorum;
import com.jivesoftware.os.jive.utils.collections.bah.ConcurrentBAHash;
import com.jivesoftware.os.jive.utils.ordered.id.ConstantWriterIdProvider;
import com.jivesoftware.os.jive.utils.ordered.id.IdPacker;
import com.jivesoftware.os.jive.utils.ordered.id.OrderIdProviderImpl;
import com.jivesoftware.os.jive.utils.ordered.id.TimestampedOrderIdProvider;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import com.jivesoftware.os.routing.bird.health.api.HealthTimer;
import com.jivesoftware.os.routing.bird.health.checkers.SickThreads;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
public class AmzaServiceInitializer {
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
/**
 * Mutable holder of the tuning values read by {@code AmzaServiceInitializer.initialize(...)}.
 * Every field is public and carries the default shown; callers overwrite fields before
 * handing the instance to {@code initialize}.
 */
public static class AmzaServiceConfig {

    // ---- storage layout -------------------------------------------------------------

    /** Working directories; {@code initialize} creates one stripe per entry. No default, must be set. */
    public String[] workingDirectories;

    /** Interval handed to every {@code AsyncStripeFlusher} created by {@code initialize}. */
    public long asyncFsyncIntervalMillis = 1_000L;

    /** Size of the "row-taker" thread pool. */
    public int numberOfTakerThreads = 8;

    /** Not read by {@code initialize} in this file; presumably consumed elsewhere. */
    public int systemRingSize = -1;

    /** Number of on-ready callbacks that split the "load highest txIds" work by partition hash. */
    public int systemReadyInitConcurrencyLevel = 8;

    /** Passed to each {@code DeltaWALFactory}. */
    public int corruptionParanoiaFactor = 10;

    // ---- row IO ---------------------------------------------------------------------

    /** Passed to the memory-backed and the persistent binary row IO providers. */
    public int updatesBetweenLeaps = 4 * 1_024;

    /** Passed to the memory-backed and the persistent binary row IO providers. */
    public int maxLeaps = 64;

    /** Initial segment size for the memory-backed row IO provider (1 MiB). */
    public long initialBufferSegmentSize = 1L << 20;

    /** Maximum segment size for the memory-backed row IO provider (1 GiB). */
    public long maxBufferSegmentSize = 1L << 30;

    // ---- delta stripes --------------------------------------------------------------

    /** Passed to each {@code DeltaStripeWALStorage} as its max-updates-before-compaction value. */
    public int maxUpdatesBeforeDeltaStripeCompaction = 1_000_000;

    /** Passed to {@code PartitionStripeProvider}. */
    public int deltaStripeCompactionIntervalInMillis = 60_000;

    /** Passed to each {@code DeltaStripeWALStorage}. */
    public int deltaMaxValueSizeInIndex = 8;

    /** Passed to each {@code DeltaStripeWALStorage}. */
    public boolean deltaUseHighwaterTxId = false;

    /** Size of each "merge-deltas-N" pool; values {@code <= 0} fall back to the processor count. */
    public int deltaMergeThreads = -1;

    // ---- acks, waits and fsync ------------------------------------------------------

    /** Passed to {@code AckWaters}. */
    public int ackWatersStripingLevel = 1_024;

    /** Passed to {@code AckWaters}. */
    public boolean ackWatersVerboseLogTimeouts = false;

    /** Passed to the {@code AwaitNotify} used for partition-online waits. */
    public int awaitOnlineStripingLevel = 1_024;

    /** Passed to {@code SystemWALStorage} and {@code StripedPartitionCommitChanges}. */
    public boolean hardFsync = false;

    /** Passed to {@code PartitionBackedHighwaterStorage}. */
    public long flushHighwatersAfterNUpdates = 10_000L;

    /** Passed to the persistent and the delta binary row IO providers. */
    public boolean useMemMap = true;

    // ---- taking ---------------------------------------------------------------------

    /** Passed to {@code TakeCoordinator}. */
    public long takeCyaIntervalInMillis = 1_000L;

    /** Passed to {@code TakeCoordinator}. */
    public long takeSlowThresholdInMillis = 60_000L;

    /** Passed to {@code RowChangeTaker}. */
    public long takeLongPollTimeoutMillis = 10_000L;

    /** Passed to {@code TakeCoordinator}. */
    public long takeSystemReofferDeltaMillis = 100L;

    /** Passed to {@code TakeCoordinator}. */
    public long takeReofferDeltaMillis = 1_000L;

    /** Passed to {@code TakeCoordinator}. */
    public long takeReofferMaxElectionsPerHeartbeat = 1_000_000L;

    /** Passed to {@code TakeCoordinator}. */
    public long hangupAvailableRowsAfterUnresponsiveMillis = 60_000L;

    /** Passed to {@code RowChangeTaker}. */
    public long pongIntervalMillis = 10_000L;

    /** Passed to {@code RowChangeTaker}. */
    public long rowsTakerLimit = 1L << 16;

    // ---- aquarium -------------------------------------------------------------------

    /** Passed to {@code Liveliness}. */
    public long aquariumLeaderDeadAfterMillis = 60_000L;

    /** Passed to {@code AmzaAquariumProvider}. */
    public long aquariumLivelinessFeedEveryMillis = 500L;

    // ---- compaction and rebalancing -------------------------------------------------

    /** Passed to {@code IndexedWALStorageProvider}. */
    public int tombstoneCompactionFactor = 2;

    /** Passed to {@code PartitionTombstoneCompactor}. */
    public long checkIfCompactionIsNeededIntervalInMillis = 60_000L;

    /** Passed to {@code PartitionTombstoneCompactor}; defaults to one hour. */
    public long rebalanceableEveryNMillis = TimeUnit.HOURS.toMillis(1);

    /**
     * Passed to {@code IndexedWALStorageProvider}; half of this value is also handed to
     * {@code StorageVersionProvider}. Defaults to 1 GiB.
     */
    public long rebalanceIfImbalanceGreaterThanNBytes = 1L << 30;

    // ---- miscellaneous --------------------------------------------------------------

    /** Not read by {@code initialize} in this file; presumably consumed elsewhere. */
    public long interruptBlockingReadsIfLingersForNMillis = 60_000L;

    /** Passed to {@code AmzaRingStoreWriter}. */
    public boolean rackDistributionEnabled = true;

    /** Not read by {@code initialize} in this file; presumably consumed elsewhere. */
    public long discoveryIntervalMillis = 30_000L;
}
/**
 * Hook that {@code initialize} invokes exactly once, right after it has created the
 * {@code WALIndexProviderRegistry} and before any WAL storage provider is built.
 */
@FunctionalInterface
public interface IndexProviderRegistryCallback {

    /**
     * @param workingIndexDirectories the "index" sub-directory of each configured working directory
     * @param indexProviderRegistry the registry created from the two row IO providers
     * @param ephemeralRowIOProvider the memory-backed row IO provider
     * @param persistentRowIOProvider the binary (file-backed) row IO provider
     * @param numberOfStripes the number of configured working directories
     * @throws Exception any failure; it propagates out of {@code initialize}
     */
    void call(
        File[] workingIndexDirectories,
        WALIndexProviderRegistry indexProviderRegistry,
        RowIOProvider ephemeralRowIOProvider,
        RowIOProvider persistentRowIOProvider,
        int numberOfStripes
    ) throws Exception;
}
/**
 * Supplier of the executor pools that {@code initialize} hands to the components it builds.
 */
@FunctionalInterface
public interface AmzaThreadPoolProvider {

    /**
     * Provides an executor for one named purpose.
     *
     * @param threadCount the number of threads requested by the caller
     * @param name a label for the pool (for example {@code "row-taker"})
     * @return the executor to use for that purpose
     */
    ExecutorService allocateThreadPool(
        int threadCount,
        String name
    );
}
/**
 * Builds every collaborator of an {@link AmzaService}, wires them together and returns the
 * assembled service. Objects are constructed in dependency order, so later components receive
 * the earlier ones as constructor arguments.
 * <p>
 * Side effects visible in this method: for each stripe a {@code delta-wal-<i>} directory and a
 * {@code version} file are created when missing, an exclusive file lock is taken on each version
 * file (and not released here), thread pools are requested from {@code amzaThreadPoolProvider},
 * watchers are registered on the system partitions, and callbacks are registered with the
 * system-ready tracker.
 *
 * @param config tuning values; {@code config.workingDirectories} must be non-null because its
 * length determines the number of stripes
 * @param amzaInterner passed through to the components built here
 * @param aquariumStats passed to {@code Liveliness} and {@code AmzaAquariumProvider}
 * @param amzaSystemStats stats object passed to the system-level components
 * @param amzaStats stats object passed to the striped components
 * @param quorumLatency passed to {@code AckWaters}
 * @param systemRingSizeProvider used by {@code TakeFullySystemReady} and by the liveliness quorum check
 * @param sickThreads passed through to the components built here
 * @param sickPartitions passed through to the components built here
 * @param primaryRowMarshaller passed through to the storage and taker components
 * @param highwaterRowMarshaller passed through to the storage and taker components
 * @param ringMember ring member passed to the components built here; also converted to the root aquarium member
 * @param ringHost passed to {@code RowChangeTaker}
 * @param blacklistRingMembers copied into an immutable set for the ring store reader
 * @param orderIdProvider generates new stripe versions and the startup version, and is passed through
 * @param idPacker passed to {@code TakeCoordinator}
 * @param partitionPropertyMarshaller passed to {@code PartitionCreator}
 * @param indexProviderRegistryCallback invoked once with the index directories, the registry and both row IO providers
 * @param availableRowsTaker passed to {@code RowChangeTaker}
 * @param systemRowsTakerFactory {@code create()} is called once; the result goes to {@code RowChangeTaker}
 * @param rowsTakerFactory {@code create()} is called once; the result goes to {@code RowChangeTaker}
 * @param takeFailureListener passed to {@code RowChangeTaker}
 * @param allRowChanges passed to both partition watchers and to {@code PartitionCreator}
 * @param amzaThreadPoolProvider source of the "partition-loader", "merge-deltas-N", "compact-deltas",
 * "stripe-flusher" and "row-taker" pools
 * @return the assembled service
 * @throws IllegalStateException if a delta WAL directory or a version file cannot be created
 * @throws Exception any other failure raised while reading/writing version files or by a collaborator
 */
public AmzaService initialize(AmzaServiceConfig config,
AmzaInterner amzaInterner,
AquariumStats aquariumStats,
AmzaStats amzaSystemStats,
AmzaStats amzaStats,
HealthTimer quorumLatency,
SystemRingSizeProvider systemRingSizeProvider,
SickThreads sickThreads,
SickPartitions sickPartitions,
BinaryPrimaryRowMarshaller primaryRowMarshaller,
BinaryHighwaterRowMarshaller highwaterRowMarshaller,
RingMember ringMember,
RingHost ringHost,
Set<RingMember> blacklistRingMembers,
TimestampedOrderIdProvider orderIdProvider,
IdPacker idPacker,
PartitionPropertyMarshaller partitionPropertyMarshaller,
IndexProviderRegistryCallback indexProviderRegistryCallback,
AvailableRowsTaker availableRowsTaker,
RowsTakerFactory systemRowsTakerFactory,
RowsTakerFactory rowsTakerFactory,
Optional<TakeFailureListener> takeFailureListener,
RowChanges allRowChanges,
AmzaThreadPoolProvider amzaThreadPoolProvider) throws Exception {
// Watcher for the system partitions; the boolean presumably marks it as the system watcher
// (the striped watcher further down is built with false) -- confirm against AmzaPartitionWatcher.
AmzaPartitionWatcher amzaSystemPartitionWatcher = new AmzaPartitionWatcher(true, allRowChanges);
// One stripe per configured working directory.
int numberOfStripes = config.workingDirectories.length;
//TODO configure
// Row IO: a memory-backed provider for ephemeral data and a binary provider for persistent data.
MemoryBackedRowIOProvider ephemeralRowIOProvider = new MemoryBackedRowIOProvider(
config.initialBufferSegmentSize,
config.maxBufferSegmentSize,
config.updatesBetweenLeaps,
config.maxLeaps,
new DirectByteBufferFactory());
BinaryRowIOProvider persistentRowIOProvider = new BinaryRowIOProvider(
config.updatesBetweenLeaps,
config.maxLeaps,
config.useMemMap);
// Each working directory gets a "wal" and an "index" sub-directory (File objects only; nothing is created here).
File[] workingWALDirectories = new File[config.workingDirectories.length];
File[] workingIndexDirectories = new File[config.workingDirectories.length];
for (int i = 0; i < workingWALDirectories.length; i++) {
workingWALDirectories[i] = new File(config.workingDirectories[i], "wal");
workingIndexDirectories[i] = new File(config.workingDirectories[i], "index");
}
// Hand the freshly created registry to the caller's callback before any storage provider is built.
WALIndexProviderRegistry indexProviderRegistry = new WALIndexProviderRegistry(ephemeralRowIOProvider, persistentRowIOProvider);
indexProviderRegistryCallback.call(workingIndexDirectories, indexProviderRegistry, ephemeralRowIOProvider, persistentRowIOProvider, numberOfStripes);
IndexedWALStorageProvider walStorageProvider = new IndexedWALStorageProvider(amzaStats,
workingWALDirectories,
numberOfStripes,
indexProviderRegistry,
primaryRowMarshaller,
highwaterRowMarshaller,
orderIdProvider,
sickPartitions,
config.tombstoneCompactionFactor,
config.rebalanceIfImbalanceGreaterThanNBytes);
// The processor count sizes the partition-loader pool, the ring caches and the default merge pool.
int numProc = Runtime.getRuntime().availableProcessors();
PartitionIndex partitionIndex = new PartitionIndex(amzaSystemStats,
amzaStats,
orderIdProvider,
walStorageProvider,
numProc,
amzaThreadPoolProvider.allocateThreadPool(numProc, "partition-loader"));
// The system flusher is created with index -1 and no flush callback.
AsyncStripeFlusher systemFlusher = new AsyncStripeFlusher(-1,
config.asyncFsyncIntervalMillis,
null);
SystemWALStorage systemWALStorage = new SystemWALStorage(
amzaSystemStats,
partitionIndex,
primaryRowMarshaller,
highwaterRowMarshaller,
amzaSystemPartitionWatcher,
systemFlusher,
config.hardFsync);
// Per stripe: ensure the delta WAL directory exists, then load or create its version file.
// The lock taken on each version file is kept in stripeLocks and passed to StorageVersionProvider;
// this method never releases it.
File[] walDirs = new File[numberOfStripes];
long[] stripeVersions = new long[numberOfStripes];
FileLock[] stripeLocks = new FileLock[numberOfStripes];
for (int i = 0; i < numberOfStripes; i++) {
// numberOfStripes equals workingDirectories.length, so the modulo leaves i unchanged.
walDirs[i] = new File(config.workingDirectories[i % config.workingDirectories.length], "delta-wal-" + i);
if (!walDirs[i].exists()) {
if (!walDirs[i].mkdirs()) {
throw new IllegalStateException("Please check your file permission. " + walDirs[i].getAbsolutePath());
}
}
File versionFile = new File(walDirs[i], "version");
if (versionFile.exists()) {
// Existing stripe: lock the file, then read the stored version (a single long).
stripeLocks[i] = FileChannel.open(versionFile.toPath(), StandardOpenOption.WRITE).lock();
try (FileInputStream fileInputStream = new FileInputStream(versionFile)) {
DataInput input = new DataInputStream(fileInputStream);
stripeVersions[i] = input.readLong();
LOG.info("Loaded stripeVersion:" + stripeVersions[i] + " for stripe:" + i + " from " + versionFile);
}
} else if (versionFile.createNewFile()) {
// New stripe: lock the file, then persist a freshly generated id as its version.
stripeLocks[i] = FileChannel.open(versionFile.toPath(), StandardOpenOption.WRITE).lock();
try (FileOutputStream fileOutputStream = new FileOutputStream(versionFile)) {
DataOutput output = new DataOutputStream(fileOutputStream);
stripeVersions[i] = orderIdProvider.nextId();
output.writeLong(stripeVersions[i]);
LOG.info("Created stripeVersion:" + stripeVersions[i] + " for stripe:" + i + " to " + versionFile);
}
} else {
throw new IllegalStateException("Please check your file permission. " + versionFile.getAbsolutePath());
}
}
// Caches shared by the ring store reader and the ring store writer created further down.
ConcurrentBAHash<CacheId<RingTopology>> ringsCache = new ConcurrentBAHash<>(13, true, numProc);
ConcurrentBAHash<CacheId<RingSet>> ringMemberRingNamesCache = new ConcurrentBAHash<>(13, true, numProc);
// walUpdated fans each notification out to the delegates in this list; the list starts empty and
// a delegate is added once the TakeCoordinator exists.
List<WALUpdated> walUpdateDelegates = Lists.newCopyOnWriteArrayList();
WALUpdated walUpdated = (versionedPartitionName, txId) -> {
for (WALUpdated delegate : walUpdateDelegates) {
delegate.updated(versionedPartitionName, txId);
}
};
PartitionCreator partitionCreator = new PartitionCreator(
orderIdProvider,
partitionPropertyMarshaller,
partitionIndex,
systemWALStorage,
walUpdated,
allRowChanges,
amzaInterner);
TakeFullySystemReady systemReady = new TakeFullySystemReady(systemRingSizeProvider, partitionCreator, sickPartitions, sickThreads);
// nodeCacheId is shared between the ring store reader and the ring store writer.
AtomicLong nodeCacheId = new AtomicLong(0);
AmzaRingStoreReader ringStoreReader = new AmzaRingStoreReader(systemReady,
amzaInterner,
ringMember,
ringsCache,
ringMemberRingNamesCache,
nodeCacheId,
ImmutableSet.copyOf(blacklistRingMembers));
AwaitNotify<PartitionName> awaitOnline = new AwaitNotify<>(config.awaitOnlineStripingLevel);
long maxUpdatesBeforeCompaction = config.maxUpdatesBeforeDeltaStripeCompaction;
AckWaters ackWaters = new AckWaters(amzaSystemStats, amzaStats, quorumLatency, config.ackWatersStripingLevel, config.ackWatersVerboseLogTimeouts);
HighwaterStorage highwaterStorage = new PartitionBackedHighwaterStorage(amzaSystemStats,
amzaStats,
amzaInterner,
orderIdProvider,
ringMember,
partitionCreator,
systemWALStorage,
walUpdated,
config.flushHighwatersAfterNUpdates,
numberOfStripes);
// Delta stripes: one DeltaStripeWALStorage per stripe, all sharing a dedicated row IO provider.
DeltaStripeWALStorage[] deltaStripeWALStorages = new DeltaStripeWALStorage[numberOfStripes];
BinaryRowIOProvider deltaRowIOProvider = new BinaryRowIOProvider(
-1,
0,
config.useMemMap);
// A non-positive deltaMergeThreads means "use the processor count".
int deltaMergeThreads = config.deltaMergeThreads;
if (deltaMergeThreads <= 0) {
deltaMergeThreads = numProc;
}
for (int i = 0; i < numberOfStripes; i++) {
DeltaWALFactory deltaWALFactory = new DeltaWALFactory(orderIdProvider, walDirs[i], deltaRowIOProvider, primaryRowMarshaller,
highwaterRowMarshaller, config.corruptionParanoiaFactor);
// Each stripe gets its own "merge-deltas-<i>" pool.
deltaStripeWALStorages[i] = new DeltaStripeWALStorage(
amzaInterner,
i,
amzaStats,
ackWaters,
sickThreads,
ringStoreReader,
highwaterStorage,
deltaWALFactory,
config.deltaMaxValueSizeInIndex,
config.deltaUseHighwaterTxId,
indexProviderRegistry,
maxUpdatesBeforeCompaction,
amzaThreadPoolProvider.allocateThreadPool(deltaMergeThreads, "merge-deltas-" + i));
}
// Derived as half of the rebalance threshold until it has its own config value.
long stripeMaxFreeWithinNBytes = config.rebalanceIfImbalanceGreaterThanNBytes / 2; //TODO config separately
StorageVersionProvider storageVersionProvider = new StorageVersionProvider(amzaInterner,
orderIdProvider,
ringMember,
systemWALStorage,
partitionCreator,
ringStoreReader,
workingIndexDirectories,
stripeVersions,
stripeLocks,
stripeMaxFreeWithinNBytes,
deltaStripeWALStorages,
walUpdated,
awaitOnline);
// Register watchers on the system partitions these two components are interested in.
amzaSystemPartitionWatcher.watch(PartitionCreator.PARTITION_VERSION_INDEX.getPartitionName(), storageVersionProvider);
amzaSystemPartitionWatcher.watch(PartitionCreator.REGION_PROPERTIES.getPartitionName(), partitionCreator);
TakeCoordinator takeCoordinator = new TakeCoordinator(systemWALStorage,
ringMember,
amzaSystemStats, amzaStats,
orderIdProvider,
idPacker,
partitionCreator,
config.takeCyaIntervalInMillis,
config.takeSlowThresholdInMillis,
config.takeSystemReofferDeltaMillis,
config.takeReofferDeltaMillis,
config.takeReofferMaxElectionsPerHeartbeat,
config.hangupAvailableRowsAfterUnresponsiveMillis);
// From here on every walUpdated notification is forwarded to the take coordinator;
// a null partition name fails fast via checkNotNull.
walUpdateDelegates.add((versionedPartitionName, txId) -> {
takeCoordinator.update(ringStoreReader, Preconditions.checkNotNull(versionedPartitionName), txId);
});
// Liveliness / aquarium wiring.
long startupVersion = orderIdProvider.nextId();
Member rootAquariumMember = ringMember.asAquariumMember();
AmzaLivelinessStorage livelinessStorage = new AmzaLivelinessStorage(systemWALStorage, orderIdProvider, walUpdated, rootAquariumMember, startupVersion);
// Quorum means a strict majority of the system ring; a ring size of zero or less never reaches quorum.
AtQuorum livelinessAtQuorm = count -> {
int ringSize = systemRingSizeProvider.get();
return ringSize > 0 && count > ringSize / 2;
};
Liveliness liveliness = new Liveliness(aquariumStats,
System::currentTimeMillis,
livelinessStorage,
rootAquariumMember,
livelinessAtQuorm,
config.aquariumLeaderDeadAfterMillis,
new AtomicLong(-1));
AmzaAquariumProvider aquariumProvider = new AmzaAquariumProvider(aquariumStats,
amzaInterner,
ringMember,
orderIdProvider,
ringStoreReader,
systemWALStorage,
storageVersionProvider,
partitionCreator,
takeCoordinator,
walUpdated,
liveliness,
config.aquariumLivelinessFeedEveryMillis,
awaitOnline,
sickThreads);
amzaSystemPartitionWatcher.watch(PartitionCreator.AQUARIUM_STATE_INDEX.getPartitionName(), aquariumProvider);
AmzaPartitionWatcher amzaStripedPartitionWatcher = new AmzaPartitionWatcher(false, allRowChanges);
// One flusher per stripe; its callback calls flush(true) on that stripe's delta WAL storage.
AsyncStripeFlusher[] stripeFlusher = new AsyncStripeFlusher[numberOfStripes];
for (int i = 0; i < numberOfStripes; i++) {
// Effectively-final copy of i for capture by the lambda.
int index = i;
stripeFlusher[i] = new AsyncStripeFlusher(index,
config.asyncFsyncIntervalMillis,
() -> {
deltaStripeWALStorages[index].flush(true);
return null;
});
}
// The "stripe-flusher" pool is requested with one thread more than there are stripes.
PartitionStripeProvider partitionStripeProvider = new PartitionStripeProvider(
amzaStats,
partitionCreator,
partitionIndex,
primaryRowMarshaller,
deltaStripeWALStorages,
highwaterStorage,
highwaterRowMarshaller,
ringMember,
ringStoreReader,
aquariumProvider,
storageVersionProvider,
takeCoordinator,
awaitOnline,
amzaStripedPartitionWatcher,
systemFlusher,
stripeFlusher,
config.deltaStripeCompactionIntervalInMillis,
amzaThreadPoolProvider.allocateThreadPool(deltaStripeWALStorages.length, "compact-deltas"),
amzaThreadPoolProvider.allocateThreadPool(deltaStripeWALStorages.length + 1, "stripe-flusher")
);
// The composter watches the region, partition-version and aquarium-state system partitions.
PartitionComposter partitionComposter = new PartitionComposter(amzaSystemStats, amzaStats, partitionIndex, partitionCreator, ringStoreReader,
partitionStripeProvider, storageVersionProvider, amzaInterner, numProc);
amzaSystemPartitionWatcher.watch(PartitionCreator.REGION_INDEX.getPartitionName(), partitionComposter);
amzaSystemPartitionWatcher.watch(PartitionCreator.PARTITION_VERSION_INDEX.getPartitionName(), partitionComposter);
amzaSystemPartitionWatcher.watch(PartitionCreator.AQUARIUM_STATE_INDEX.getPartitionName(), partitionComposter);
// The ring writer shares both caches and nodeCacheId with the ring reader created earlier.
AmzaRingStoreWriter amzaRingWriter = new AmzaRingStoreWriter(ringStoreReader,
systemWALStorage,
orderIdProvider,
walUpdated,
ringsCache,
ringMemberRingNamesCache,
nodeCacheId,
config.rackDistributionEnabled);
amzaSystemPartitionWatcher.watch(PartitionCreator.RING_INDEX.getPartitionName(), amzaRingWriter);
amzaSystemPartitionWatcher.watch(PartitionCreator.NODE_INDEX.getPartitionName(), amzaRingWriter);
// Register systemReadyInitConcurrencyLevel callbacks with systemReady. Each callback handles the
// member partitions whose hash bucket equals its index, reporting every partition's highest txId
// to the take coordinator. Per-partition failures are logged and do not abort the callback.
int systemReadyInitConcurrencyLevel = config.systemReadyInitConcurrencyLevel;
for (int i = 0; i < systemReadyInitConcurrencyLevel; i++) {
int index = i;
systemReady.onReady(() -> {
LOG.info("Loading highest txIds for index:{} after system ready...", index);
int count = 0;
for (PartitionName partitionName : partitionCreator.getMemberPartitions(ringStoreReader)) {
// Bucket by hash so each partition is processed by exactly one of the callbacks.
if (index == Math.abs(partitionName.hashCode() % systemReadyInitConcurrencyLevel)) {
count++;
try {
partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage1, versionedAquarium) -> {
return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> {
VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
long highestTxId = partitionStripe.highestTxId(versionedPartitionName);
// LOCAL_NONE means no txId is available; the partition is skipped with a warning.
if (highestTxId != HighwaterStorage.LOCAL_NONE) {
takeCoordinator.update(ringStoreReader, versionedPartitionName, highestTxId);
} else {
LOG.warn("Skipped system ready init for a partition, likely because it is only partially defined: {}",
versionedPartitionName);
}
return null;
});
});
} catch (PartitionIsDisposedException x) {
LOG.info("Skipped a partition because its disposed: {}", partitionName);
} catch (PropertiesNotPresentException x) {
LOG.warn("Skipped system ready init for a partition because its properties were missing: {}", partitionName);
} catch (Exception x) {
LOG.error("Failed system ready init for a partition, please fix: {}", new Object[] { partitionName }, x);
}
}
}
highwaterStorage.flushLocal(); // in case we repaired any highwater txIds
LOG.info("Finished loading {} highest txIds for index:{} after system ready!", count, index);
return null;
});
}
// The change taker receives its own OrderIdProviderImpl (constant writer id 1) rather than the shared orderIdProvider.
RowChangeTaker changeTaker = new RowChangeTaker(amzaSystemStats,
amzaStats,
numberOfStripes,
storageVersionProvider,
ringStoreReader,
systemReady,
ringHost,
systemRowsTakerFactory.create(),
rowsTakerFactory.create(),
partitionStripeProvider,
availableRowsTaker,
amzaThreadPoolProvider.allocateThreadPool(config.numberOfTakerThreads, "row-taker"),
new SystemPartitionCommitChanges(storageVersionProvider, systemWALStorage, highwaterStorage, walUpdated),
new StripedPartitionCommitChanges(partitionStripeProvider, config.hardFsync, walUpdated),
new OrderIdProviderImpl(new ConstantWriterIdProvider(1)),
takeFailureListener,
config.takeLongPollTimeoutMillis,
config.pongIntervalMillis,
config.rowsTakerLimit,
primaryRowMarshaller,
highwaterRowMarshaller);
PartitionTombstoneCompactor partitionCompactor = new PartitionTombstoneCompactor(amzaStats,
walStorageProvider,
partitionCreator,
partitionIndex,
storageVersionProvider,
config.checkIfCompactionIsNeededIntervalInMillis,
config.rebalanceableEveryNMillis,
numberOfStripes);
// Hand everything built above to the service.
return new AmzaService(orderIdProvider,
amzaSystemStats,
amzaStats,
numberOfStripes,
indexProviderRegistry,
storageVersionProvider,
ringStoreReader,
amzaRingWriter,
ackWaters,
systemWALStorage,
highwaterStorage,
takeCoordinator,
changeTaker,
partitionCompactor,
partitionComposter, // its all about being GREEN!!
partitionIndex,
partitionCreator,
partitionStripeProvider,
walUpdated,
amzaSystemPartitionWatcher,
amzaStripedPartitionWatcher,
aquariumProvider,
systemReady,
liveliness);
}
}