/*
* Copyright 2016 The Simple File Server Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sfs.nodes.all.segment;
import com.google.common.base.Optional;
import io.vertx.core.Vertx;
import io.vertx.core.logging.Logger;
import org.sfs.Server;
import org.sfs.VertxContext;
import org.sfs.filesystem.volume.DigestBlob;
import org.sfs.filesystem.volume.ReadStreamBlob;
import org.sfs.io.PipedEndableWriteStream;
import org.sfs.io.PipedReadStream;
import org.sfs.nodes.Nodes;
import org.sfs.nodes.VolumeReplicaGroup;
import org.sfs.nodes.all.blobreference.DeleteBlobReference;
import org.sfs.rx.Defer;
import org.sfs.vo.TransientBlobReference;
import org.sfs.vo.TransientSegment;
import org.sfs.vo.TransientServiceDef;
import rx.Observable;
import rx.functions.Func1;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.FluentIterable.from;
import static io.vertx.core.logging.LoggerFactory.getLogger;
import static java.lang.Math.abs;
import static org.sfs.rx.RxHelper.combineSinglesDelayError;
import static org.sfs.rx.RxHelper.iterate;
import static org.sfs.util.Limits.NOT_SET;
import static org.sfs.util.MessageDigestFactory.SHA512;
import static rx.Observable.just;
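/**
 * Rebalances the number of copies of a segment's data held on the cluster's
 * volumes. Surplus verified copies are deleted when a segment has more copies
 * than its container requests, and new copies are written when it has fewer.
 * Segments stored as inline (tiny) data are skipped.
 */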
public class RebalanceSegment implements Func1<TransientSegment, Observable<Boolean>> {
private static final Logger LOGGER = getLogger(RebalanceSegment.class);
private VertxContext<Server> vertxContext;
private Nodes nodes;
private List<TransientServiceDef> dataNodes;
private Vertx vertx;
public RebalanceSegment(VertxContext<Server> vertxContext, List<TransientServiceDef> copyOfDataNodes) {
this.vertxContext = vertxContext;
this.nodes = vertxContext.verticle().nodes();
this.dataNodes = copyOfDataNodes;
this.vertx = vertxContext.vertx();
}
@Override
public Observable<Boolean> call(final TransientSegment transientSegment) {
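        // Tiny-data segments are stored inline, not on volumes, so there is
        // nothing to rebalance.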
if (transientSegment.isTinyData()) {
return just(true);
} else {
return Defer.aVoid()
.flatMap(aVoid -> reBalance(transientSegment));
}
}
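    // Compares the number of verified, ackd copies of the segment with the
    // number of copies requested by the container and either removes surplus
    // copies (balance down) or writes additional copies (balance up). Errors
    // in either direction are logged and treated as "no change" for that step.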
protected Observable<Boolean> reBalance(TransientSegment transientSegment) {
List<TransientBlobReference> existingObjectCopies =
from(transientSegment.verifiedAckdBlobs())
.filter(input -> {
Optional<Integer> verifyFailCount = input.getVerifyFailCount();
return !verifyFailCount.isPresent() || verifyFailCount.get() <= 0;
})
.toList();
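        // The container's objectReplicas value counts additional replicas, so the
        // expected number of copies is that value plus one; when the container does
        // not specify it (NOT_SET), fall back to the node-level default.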
int numberOfObjectReplicasRequestedOnContainer = transientSegment.getParent().getParent().getParent().getObjectReplicas();
int numberOfExpectedCopies = NOT_SET == numberOfObjectReplicasRequestedOnContainer ? nodes.getNumberOfObjectCopies() : numberOfObjectReplicasRequestedOnContainer + 1;
        checkState(numberOfExpectedCopies >= 1, "Number of object copies must be >= 1");
int numberOfExistingCopies = existingObjectCopies.size();
int numberOfCopiesNeeded = numberOfExpectedCopies - numberOfExistingCopies;
return Defer.aVoid()
.flatMap(aVoid -> {
if (numberOfCopiesNeeded < 0) {
return balanceDown(existingObjectCopies, abs(numberOfCopiesNeeded))
.onErrorResumeNext(throwable -> {
LOGGER.error("Handling Balance Down Replicas Exception", throwable);
return Defer.just(false);
});
} else {
return Defer.just(false);
}
})
.flatMap(balancedDown -> {
if (numberOfCopiesNeeded > 0) {
Set<String> usedVolumeIds =
                                from(existingObjectCopies)
.transform(input -> input.getVolumeId().get())
.toSet();
return balanceUp(transientSegment, usedVolumeIds, numberOfCopiesNeeded)
.map(balancedUp -> balancedDown || balancedUp)
.onErrorResumeNext(throwable -> {
LOGGER.error("Handling Balance Up Exception", throwable);
return Defer.just(balancedDown);
});
} else {
return Defer.just(balancedDown);
}
});
}
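    // Deletes blob copies one at a time until delta copies have been removed
    // or every blob has been tried. Emits true if at least one copy was deleted.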
protected Observable<Boolean> balanceDown(List<TransientBlobReference> blobs, int delta) {
checkState(delta > 0, "Delta must be greater than 0");
checkState(blobs.size() >= delta, "Number of blobs must be >= %s but was %s", delta, blobs.size());
AtomicInteger counter = new AtomicInteger(0);
return iterate(
vertx,
blobs,
transientBlobReference ->
just(transientBlobReference)
.flatMap(new DeleteBlobReference(vertxContext))
.doOnNext(deleted -> {
if (Boolean.TRUE.equals(deleted)) {
transientBlobReference.setDeleted(deleted);
counter.incrementAndGet();
}
})
.map(deleted -> counter.get() < delta))
.map(aborted -> counter.get() > 0);
}
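    // Streams the segment's data from an existing copy to numberOfCopiesNeeded
    // volumes that do not already hold it, recording each new blob's volume,
    // position, length and SHA-512 digest on the segment. Emits false when no
    // read stream for the segment could be obtained.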
protected Observable<Boolean> balanceUp(TransientSegment transientSegment, Set<String> usedVolumeIds, int numberOfCopiesNeeded) {
return Defer.just(transientSegment)
.flatMap(new GetSegmentReadStream(vertxContext, true))
.filter(Optional::isPresent)
.map(Optional::get)
.flatMap(holder -> {
ReadStreamBlob readStreamBlob = holder.value1();
VolumeReplicaGroup volumeReplicaGroup =
new VolumeReplicaGroup(vertxContext, numberOfCopiesNeeded)
.setAllowSameNode(nodes.isAllowSameNode())
.setExcludeVolumeIds(usedVolumeIds);
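                    // Pipe the existing blob into the replica group: the producer
                    // copies the segment's read stream into the write end of the
                    // pipe while the consumer reads from the other end, writes the
                    // data to the selected volumes and digests it with SHA-512.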
PipedReadStream pipedReadStream = new PipedReadStream();
PipedEndableWriteStream pipedEndableWriteStream = new PipedEndableWriteStream(pipedReadStream);
Observable<Void> producer = readStreamBlob.produce(pipedEndableWriteStream);
Observable<List<DigestBlob>> consumer = volumeReplicaGroup.consume(readStreamBlob.getLength(), SHA512, pipedReadStream);
return combineSinglesDelayError(producer, consumer, (aVoid, digestBlobs) -> {
for (DigestBlob digestBlob : digestBlobs) {
transientSegment.newBlob()
.setVolumeId(digestBlob.getVolume())
.setPosition(digestBlob.getPosition())
.setReadLength(digestBlob.getLength())
.setReadSha512(digestBlob.getDigest(SHA512).get());
}
return null;
});
})
.map(aVoid -> transientSegment)
                // Don't ack the segments here since writing them to the index
                // is done as part of a bulk update. The next run of the bulk
                // update will see these un-ackd records and ack them once they
                // can be verified. If the records were ackd here, a volume could
                // end up with records that are marked as ackd in the volume but
                // not recorded in the index. Leaving them un-ackd allows the
                // volume garbage collector to purge the data from its local
                // store if this index update fails to persist.
.map(transientSegment1 -> true)
.singleOrDefault(false);
}
}