/*
* Copyright 2013 Jive Software, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.jivesoftware.os.amza.service.storage;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.jivesoftware.os.amza.api.AmzaInterner;
import com.jivesoftware.os.amza.api.TimestampedValue;
import com.jivesoftware.os.amza.api.filer.UIO;
import com.jivesoftware.os.amza.api.partition.Consistency;
import com.jivesoftware.os.amza.api.partition.Durability;
import com.jivesoftware.os.amza.api.partition.PartitionName;
import com.jivesoftware.os.amza.api.partition.PartitionProperties;
import com.jivesoftware.os.amza.api.partition.RingMembership;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.scan.RowChanges;
import com.jivesoftware.os.amza.api.scan.RowsChanged;
import com.jivesoftware.os.amza.api.stream.RowType;
import com.jivesoftware.os.amza.api.wal.WALKey;
import com.jivesoftware.os.amza.api.wal.WALUpdated;
import com.jivesoftware.os.amza.api.wal.WALValue;
import com.jivesoftware.os.amza.service.partition.VersionedPartitionProvider;
import com.jivesoftware.os.amza.service.replication.SystemStriper;
import com.jivesoftware.os.amza.service.ring.AmzaRingReader;
import com.jivesoftware.os.amza.service.storage.PartitionIndex.PartitionPropertiesStream;
import com.jivesoftware.os.jive.utils.ordered.id.OrderIdProvider;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
public class PartitionCreator implements RowChanges, VersionedPartitionProvider {
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
public static final VersionedPartitionName NODE_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "NODE_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName RING_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "RING_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName REGION_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "REGION_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName REGION_PROPERTIES = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "REGION_PROPERTIES".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName HIGHWATER_MARK_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "HIGHWATER_MARK_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName PARTITION_VERSION_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "PARTITION_VERSION_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName AQUARIUM_STATE_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "AQUARIUM_STATE_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
public static final VersionedPartitionName AQUARIUM_LIVELINESS_INDEX = new VersionedPartitionName(
new PartitionName(true, AmzaRingReader.SYSTEM_RING, "AQUARIUM_LIVELINESS_INDEX".getBytes()),
VersionedPartitionName.STATIC_VERSION);
private final OrderIdProvider orderIdProvider;
private final PartitionPropertyMarshaller partitionPropertyMarshaller;
private final PartitionIndex partitionIndex;
private final SystemWALStorage systemWALStorage;
private final WALUpdated walUpdated;
private final RowChanges rowChanges;
private final AmzaInterner amzaInterner;
private final ConcurrentMap<PartitionName, PartitionProperties> partitionProperties = Maps.newConcurrentMap();
private final AtomicLong partitionPropertiesVersion = new AtomicLong();
private static final PartitionProperties REPLICATED_PROPERTIES = new PartitionProperties(Durability.fsync_never,
0, 0, 0, 0, 0, 0, 0, 0,
true,
Consistency.none,
true,
true,
false,
RowType.primary,
"memory_persistent",
8,
null,
-1,
-1);
private static final PartitionProperties NON_REPLICATED_PROPERTIES = new PartitionProperties(Durability.fsync_never,
0, 0, 0, 0, 0, 0, 0, 0,
true,
Consistency.none,
true,
false,
false,
RowType.primary,
"memory_persistent",
8,
null,
Integer.MAX_VALUE,
-1);
private static final PartitionProperties AQUARIUM_PROPERTIES = new PartitionProperties(Durability.ephemeral,
0, 0, 0, 0, 0, 0, 0, 0,
false,
Consistency.none,
true,
true,
false,
RowType.primary,
"memory_ephemeral",
8,
null,
16,
4);
public static final Map<VersionedPartitionName, PartitionProperties> SYSTEM_PARTITIONS = ImmutableMap
.<VersionedPartitionName, PartitionProperties>builder()
.put(PartitionCreator.REGION_INDEX, REPLICATED_PROPERTIES)
.put(PartitionCreator.RING_INDEX, REPLICATED_PROPERTIES)
.put(PartitionCreator.NODE_INDEX, REPLICATED_PROPERTIES)
.put(PartitionCreator.PARTITION_VERSION_INDEX, REPLICATED_PROPERTIES)
.put(PartitionCreator.REGION_PROPERTIES, REPLICATED_PROPERTIES)
.put(PartitionCreator.HIGHWATER_MARK_INDEX, NON_REPLICATED_PROPERTIES)
.put(PartitionCreator.AQUARIUM_STATE_INDEX, AQUARIUM_PROPERTIES)
.put(PartitionCreator.AQUARIUM_LIVELINESS_INDEX, AQUARIUM_PROPERTIES)
.build();
public PartitionCreator(OrderIdProvider orderIdProvider,
PartitionPropertyMarshaller partitionPropertyMarshaller,
PartitionIndex partitionIndex,
SystemWALStorage systemWALStorage,
WALUpdated walUpdated,
RowChanges rowChanges,
AmzaInterner amzaInterner) {
this.orderIdProvider = orderIdProvider;
this.partitionPropertyMarshaller = partitionPropertyMarshaller;
this.partitionIndex = partitionIndex;
this.walUpdated = walUpdated;
this.systemWALStorage = systemWALStorage;
this.rowChanges = rowChanges;
this.amzaInterner = amzaInterner;
}
public void init(SystemStriper systemStriper) throws Exception {
for (Map.Entry<VersionedPartitionName, PartitionProperties> entry : SYSTEM_PARTITIONS.entrySet()) {
VersionedPartitionName versionedPartitionName = entry.getKey();
int systemStripe = systemStriper.getSystemStripe(versionedPartitionName.getPartitionName());
partitionIndex.get("init", versionedPartitionName, entry.getValue(), systemStripe);
}
}
@Override
public boolean hasPartition(PartitionName partitionName) throws Exception {
if (partitionName.isSystemPartition()) {
return true;
}
byte[] rawPartitionName = partitionName.toBytes();
TimestampedValue propertiesValue = systemWALStorage.getTimestampedValue(REGION_PROPERTIES, null, rawPartitionName);
if (propertiesValue != null) {
TimestampedValue indexValue = systemWALStorage.getTimestampedValue(REGION_INDEX, null, rawPartitionName);
if (indexValue != null) {
return true;
}
}
return false;
}
public boolean hasStore(String context, VersionedPartitionName versionedPartitionName, int stripeIndex) throws Exception {
PartitionProperties properties = getProperties(versionedPartitionName.getPartitionName());
return properties != null && partitionIndex.exists(context, versionedPartitionName, properties, stripeIndex);
}
public PartitionStore createStoreIfAbsent(String context, VersionedPartitionName versionedPartitionName, int stripe) throws Exception {
PartitionName partitionName = versionedPartitionName.getPartitionName();
Preconditions.checkArgument(!partitionName.isSystemPartition(), "You cannot create a system partition");
PartitionProperties properties = getProperties(partitionName);
if (properties == null) {
return null;
} else {
return partitionIndex.get(context, versionedPartitionName, properties, stripe);
}
}
public boolean createPartitionIfAbsent(PartitionName partitionName, PartitionProperties properties) throws Exception {
byte[] rawPartitionName = partitionName.toBytes();
TimestampedValue regionIndexValue = systemWALStorage.getTimestampedValue(REGION_INDEX, null, rawPartitionName);
long timestampAndVersion;
if (regionIndexValue != null) {
timestampAndVersion = regionIndexValue.getTimestampId();
} else {
timestampAndVersion = orderIdProvider.nextId();
RowsChanged changed = systemWALStorage.update(REGION_INDEX, null,
(highwater, scan) -> scan.row(-1, rawPartitionName, rawPartitionName, timestampAndVersion, false, timestampAndVersion),
walUpdated);
if (!changed.isEmpty()) {
rowChanges.changes(changed);
}
}
TimestampedValue propertiesValue = systemWALStorage.getTimestampedValue(REGION_PROPERTIES, null, rawPartitionName);
if (propertiesValue == null) {
return setPartitionProperties(partitionName, properties, timestampAndVersion);
} else {
return false;
}
}
public void updatePartitionPropertiesIfNecessary(PartitionName partitionName, PartitionProperties properties) throws Exception {
PartitionProperties got = getProperties(partitionName);
if (got.equals(properties)) {
return;
}
LOG.info("Updating partition properties for {}", partitionName);
byte[] rawPartitionName = partitionName.toBytes();
TimestampedValue regionIndexValue = systemWALStorage.getTimestampedValue(REGION_INDEX, null, rawPartitionName);
if (regionIndexValue == null) {
throw new IllegalArgumentException("Partition has not been initialized: " + partitionName);
}
long timestampAndVersion = orderIdProvider.nextId();
setPartitionProperties(partitionName, properties, timestampAndVersion);
}
public void updatePartitionProperties(PartitionName partitionName, PartitionProperties properties) throws Exception {
byte[] rawPartitionName = partitionName.toBytes();
TimestampedValue regionIndexValue = systemWALStorage.getTimestampedValue(REGION_INDEX, null, rawPartitionName);
if (regionIndexValue == null) {
throw new IllegalArgumentException("Partition has not been initialized: " + partitionName);
}
long timestampAndVersion = orderIdProvider.nextId();
setPartitionProperties(partitionName, properties, timestampAndVersion);
}
private boolean setPartitionProperties(PartitionName partitionName, PartitionProperties properties, long timestampAndVersion) throws Exception {
byte[] partitionNameBytes = partitionName.toBytes();
RowsChanged changed = systemWALStorage.update(REGION_PROPERTIES, null, (highwater, scan) -> {
return scan.row(-1, partitionNameBytes, partitionPropertyMarshaller.toBytes(properties), timestampAndVersion, false, timestampAndVersion);
}, walUpdated);
if (!changed.isEmpty()) {
rowChanges.changes(changed);
partitionProperties.put(partitionName, properties);
return true;
}
return false;
}
private void removeProperties(PartitionName partitionName) {
partitionProperties.remove(partitionName);
}
@Override
public PartitionProperties getProperties(PartitionName partitionName) {
return partitionProperties.computeIfAbsent(partitionName, (key) -> {
try {
if (partitionName.isSystemPartition()) {
return SYSTEM_PARTITIONS.get(new VersionedPartitionName(partitionName, VersionedPartitionName.STATIC_VERSION));
} else {
TimestampedValue rawPartitionProperties = systemWALStorage.getTimestampedValue(REGION_PROPERTIES, null, partitionName.toBytes());
if (rawPartitionProperties == null) {
return null;
}
return partitionPropertyMarshaller.fromBytes(rawPartitionProperties.getValue());
}
} catch (Exception e) {
throw new RuntimeException(e);
}
});
}
@Override
public VersionedPartitionProperties getVersionedProperties(PartitionName partitionName, VersionedPartitionProperties versionedPartitionProperties) {
long version = partitionPropertiesVersion.get();
if (versionedPartitionProperties != null && versionedPartitionProperties.version >= version) {
return versionedPartitionProperties;
}
return new VersionedPartitionProperties(version, getProperties(partitionName));
}
public PartitionStore get(String context, VersionedPartitionName versionedPartitionName, int stripeIndex) throws Exception {
PartitionProperties properties = getProperties(versionedPartitionName.getPartitionName());
return partitionIndex.get(context, versionedPartitionName, properties, stripeIndex);
}
@Override
public Iterable<PartitionName> getMemberPartitions(RingMembership ringMembership) throws Exception {
List<PartitionName> partitionNames = Lists.newArrayList();
systemWALStorage.rowScan(REGION_INDEX, (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
if (!valueTombstoned && valueTimestamp != -1 && value != null) {
PartitionName partitionName = amzaInterner.internPartitionName(key, 0, key.length);
if (ringMembership == null || ringMembership.isMemberOfRing(partitionName.getRingName(), 0)) {
partitionNames.add(partitionName);
}
}
return true;
}, true);
return partitionNames;
}
public void streamAllParitions(PartitionPropertiesStream partitionStream) throws Exception {
systemWALStorage.rowScan(REGION_PROPERTIES, (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
if (!valueTombstoned) {
PartitionName partitionName = amzaInterner.internPartitionName(key, 0, key.length);
PartitionProperties properties = partitionPropertyMarshaller.fromBytes(value);
if (!partitionStream.stream(partitionName, properties)) {
return false;
}
}
return true;
}, true);
}
public void markForDisposal(PartitionName partitionName) throws Exception {
byte[] rawPartitionName = partitionName.toBytes();
long timestampAndVersion = orderIdProvider.nextId();
byte[] disposalValue = new byte[rawPartitionName.length + 8];
System.arraycopy(rawPartitionName, 0, disposalValue, 0, rawPartitionName.length);
UIO.longBytes(timestampAndVersion, disposalValue, rawPartitionName.length);
RowsChanged changed = systemWALStorage.update(REGION_INDEX, null,
(highwater, scan) -> scan.row(-1, rawPartitionName, disposalValue, timestampAndVersion, false, timestampAndVersion),
walUpdated);
if (!changed.isEmpty()) {
rowChanges.changes(changed);
}
}
@Override
public long getPartitionDisposal(PartitionName partitionName) throws Exception {
if (partitionName.isSystemPartition()) {
return -1L;
}
byte[] rawPartitionName = partitionName.toBytes();
TimestampedValue indexValue = systemWALStorage.getTimestampedValue(REGION_INDEX, null, rawPartitionName);
byte[] value = (indexValue == null) ? null : indexValue.getValue();
if (value != null && value.length > rawPartitionName.length) {
return UIO.bytesLong(value, rawPartitionName.length);
}
return -1;
}
// this is really dangerous for a variety of reasons (e.g. it removes a disposal marker)
public void destroyPartition(PartitionName partitionName) throws Exception {
Preconditions.checkArgument(!partitionName.isSystemPartition(), "You cannot destroy a system partition");
long timestampAndVersion = orderIdProvider.nextId();
systemWALStorage.update(REGION_PROPERTIES, null, (highwaters, scan) -> {
return scan.row(-1, partitionName.toBytes(), null, timestampAndVersion, true, timestampAndVersion);
}, walUpdated);
systemWALStorage.update(REGION_INDEX, null, (highwaters, scan) -> {
return scan.row(-1, partitionName.toBytes(), null, timestampAndVersion, true, timestampAndVersion);
}, walUpdated);
partitionProperties.remove(partitionName);
partitionIndex.invalidate(partitionName);
}
public Iterable<VersionedPartitionName> getSystemPartitions() {
return SYSTEM_PARTITIONS.keySet();
}
@Override
public void changes(final RowsChanged changes) throws Exception {
if (changes.getVersionedPartitionName().getPartitionName().equals(REGION_PROPERTIES.getPartitionName())) {
try {
for (Map.Entry<WALKey, WALValue> entry : changes.getApply().entrySet()) {
byte[] key = entry.getKey().key;
PartitionName partitionName = amzaInterner.internPartitionName(key, 0, key.length);
removeProperties(partitionName);
PartitionProperties properties = getProperties(partitionName);
partitionIndex.updateStoreProperties(partitionName, properties);
}
partitionPropertiesVersion.incrementAndGet();
} catch (Throwable ex) {
throw new RuntimeException("Error while streaming entry set.", ex);
}
}
}
}