/**
* Copyright 2011 LiveRamp
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.liveramp.hank.partition_server;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.collect.Sets;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.liveramp.hank.config.PartitionServerConfigurator;
import com.liveramp.hank.coordinator.Domain;
import com.liveramp.hank.coordinator.DomainAndVersion;
import com.liveramp.hank.coordinator.DomainVersion;
import com.liveramp.hank.coordinator.Host;
import com.liveramp.hank.coordinator.HostDomain;
import com.liveramp.hank.coordinator.HostDomainPartition;
import com.liveramp.hank.coordinator.Hosts;
import com.liveramp.hank.coordinator.RingGroup;
import com.liveramp.hank.storage.Deleter;
import com.liveramp.hank.storage.StorageEngine;
import com.liveramp.hank.util.DurationAggregator;
import com.liveramp.hank.util.FormatUtils;
import com.liveramp.hank.util.HankTimer;
/**
* Manages the domain update process.
*/
public class UpdateManager implements IUpdateManager {
private static final int UPDATE_EXECUTOR_TERMINATION_CHECK_TIMEOUT_VALUE = 10;
private static final TimeUnit UPDATE_EXECUTOR_TERMINATION_CHECK_TIMEOUT_UNIT = TimeUnit.SECONDS;
private static final Logger LOG = LoggerFactory.getLogger(UpdateManager.class);
/**
 * Thread-safe collector of per-partition update timing statistics.
 * Used to compute a projected ETA for the overall update and to log
 * aggregated per-domain timer statistics once the update completes.
 */
private final class PartitionUpdateTaskStatisticsAggregator {

  // Use the last 1/ratio of a domain's partitions to project remaining time.
  static private final int NUM_PARTITIONS_USED_FOR_PROJECTION_RATIO = 10;
  // Never project from fewer than this many recent partitions.
  static private final int MIN_NUM_PARTITIONS_USED_FOR_PROJECTION = 10;

  // Statistics of finished update tasks, per domain, in completion order.
  private final Map<Domain, List<PartitionUpdateTaskStatistics>> domainToPartitionUpdateTaskStatistics
      = new HashMap<Domain, List<PartitionUpdateTaskStatistics>>();
  // Total number of update tasks registered per domain (finished or not).
  private final Map<Domain, Integer> domainToNumPartitionUpdateTasks = new HashMap<Domain, Integer>();

  /**
   * Records the statistics of one finished partition update task.
   * Assumes register() was previously called for this task's domain (it is,
   * from the PartitionUpdateTask constructor), so the per-domain list exists.
   */
  public synchronized void recordPartitionUpdateTaskStatistics(PartitionUpdateTask partitionUpdateTask,
                                                               PartitionUpdateTaskStatistics partitionUpdateTaskStatistics) {
    List<PartitionUpdateTaskStatistics> partitionUpdateTaskStatisticsList
        = domainToPartitionUpdateTaskStatistics.get(partitionUpdateTask.getDomain());
    partitionUpdateTaskStatisticsList.add(partitionUpdateTaskStatistics);
  }

  /**
   * Registers a task before it runs: initializes the per-domain statistics
   * list and increments the per-domain task counter.
   */
  public synchronized void register(PartitionUpdateTask partitionUpdateTask) {
    Domain domain = partitionUpdateTask.getDomain();
    // Initialize maps
    if (!domainToPartitionUpdateTaskStatistics.containsKey(domain)) {
      domainToPartitionUpdateTaskStatistics.put(domain, new ArrayList<PartitionUpdateTaskStatistics>());
    }
    Integer numPartitionUpdateTasks = domainToNumPartitionUpdateTasks.get(domain);
    if (numPartitionUpdateTasks == null) {
      domainToNumPartitionUpdateTasks.put(domain, 1);
    } else {
      domainToNumPartitionUpdateTasks.put(domain, numPartitionUpdateTasks + 1);
    }
  }

  /**
   * Projects the remaining update time from the throughput observed over a
   * recent window of finished tasks, per domain, and returns the worst case.
   *
   * @return ETA in seconds, a negative number if no ETA could be computed
   */
  public synchronized long computeETA() {
    long maxDomainETA = -1;
    // For each domain, compute the number of updated partitions per second
    for (Map.Entry<Domain, List<PartitionUpdateTaskStatistics>> entry : domainToPartitionUpdateTaskStatistics.entrySet()) {
      Domain domain = entry.getKey();
      List<PartitionUpdateTaskStatistics> partitionUpdateTaskStatisticsList = entry.getValue();
      // Only consider at a fixed number of partitions in the past
      int numPartitionUpdateTasksForDomain = domainToNumPartitionUpdateTasks.get(domain);
      int numPartitionsToConsider = numPartitionUpdateTasksForDomain / NUM_PARTITIONS_USED_FOR_PROJECTION_RATIO;
      if (numPartitionsToConsider < MIN_NUM_PARTITIONS_USED_FOR_PROJECTION) {
        numPartitionsToConsider = MIN_NUM_PARTITIONS_USED_FOR_PROJECTION;
      }
      // Cannot consider more tasks than the domain has.
      numPartitionsToConsider = Math.min(numPartitionsToConsider, numPartitionUpdateTasksForDomain);
      // Consider statistics
      int firstIndex = Math.max(0, partitionUpdateTaskStatisticsList.size() - numPartitionsToConsider);
      long minStartTimeMs = -1;
      long maxEndTimeMs = -1;
      // Compute time window for the chosen subset of partition update statistics
      for (int i = firstIndex; i < partitionUpdateTaskStatisticsList.size(); ++i) {
        long startTimeMs = partitionUpdateTaskStatisticsList.get(i).getStartTimeMs();
        long endTimeMs = partitionUpdateTaskStatisticsList.get(i).getEndTimeMs();
        if (minStartTimeMs < 0 || startTimeMs < minStartTimeMs) {
          minStartTimeMs = startTimeMs;
        }
        if (maxEndTimeMs < 0 || endTimeMs > maxEndTimeMs) {
          maxEndTimeMs = endTimeMs;
        }
      }
      // Compute window statistics
      long windowDurationMS = maxEndTimeMs - minStartTimeMs;
      long numPartitionUpdateTasksFinishedInWindow = Math.min(partitionUpdateTaskStatisticsList.size(), numPartitionsToConsider);
      // No throughput information for this domain: no overall ETA can be computed.
      if (windowDurationMS == 0 || numPartitionUpdateTasksFinishedInWindow == 0) {
        return -1;
      }
      // Compute time taken by partition updates of this domain
      double numSecondsPerPartitionUpdateTask = ((double)windowDurationMS / 1000.0d) / (double)numPartitionUpdateTasksFinishedInWindow;
      // Compute ETA in seconds for this domain
      long numRemainingPartitionUpdateTasksForDomain = numPartitionUpdateTasksForDomain - partitionUpdateTaskStatisticsList.size();
      long domainETA = Math.round(numRemainingPartitionUpdateTasksForDomain * numSecondsPerPartitionUpdateTask);
      // Overall ETA is the slowest domain's ETA.
      if (domainETA > maxDomainETA) {
        maxDomainETA = domainETA;
      }
    }
    return maxDomainETA;
  }

  /**
   * Logs aggregated durations of all recorded timers, keyed by
   * "domain name - timer name", across all finished update tasks.
   */
  public synchronized void logStats() {
    Map<String, DurationAggregator> hankTimerDurationAggregators = new TreeMap<String, DurationAggregator>();
    for (Map.Entry<Domain, List<PartitionUpdateTaskStatistics>> entry1 : domainToPartitionUpdateTaskStatistics.entrySet()) {
      Domain domain = entry1.getKey();
      List<PartitionUpdateTaskStatistics> partitionUpdateTaskStatisticsList = entry1.getValue();
      for (PartitionUpdateTaskStatistics partitionUpdateTaskStatistics : partitionUpdateTaskStatisticsList) {
        for (Map.Entry<String, Long> entry2 : partitionUpdateTaskStatistics.getDurationsMs().entrySet()) {
          String name = domain.getName() + " - " + entry2.getKey();
          Long duration = entry2.getValue();
          DurationAggregator aggregator = hankTimerDurationAggregators.get(name);
          if (aggregator == null) {
            aggregator = new DurationAggregator(name);
            hankTimerDurationAggregators.put(name, aggregator);
          }
          aggregator.add(duration);
        }
      }
    }
    for (DurationAggregator aggregator : hankTimerDurationAggregators.values()) {
      aggregator.logStats();
    }
  }
}
/**
 * Updates (or deletes) a single partition of a host domain, recording timing
 * statistics in the shared aggregator. Tasks are ordered by partition number
 * so that partitions of a domain are updated in order.
 */
private final class PartitionUpdateTask implements Runnable, Comparable<PartitionUpdateTask> {

  private final HostDomain hostDomain;
  private final Domain domain;
  private final HostDomainPartition partition;
  private final PartitionUpdateTaskStatisticsAggregator partitionUpdateTaskStatisticsAggregator;
  // Shared across all tasks: every failure encountered is accumulated here.
  private final List<Throwable> encounteredThrowables;
  private final DiskPartitionAssignment assignment;

  public PartitionUpdateTask(HostDomain hostDomain,
                             HostDomainPartition partition,
                             PartitionUpdateTaskStatisticsAggregator partitionUpdateTaskStatisticsAggregator,
                             List<Throwable> encounteredThrowables,
                             DiskPartitionAssignment assignment) {
    this.hostDomain = hostDomain;
    this.encounteredThrowables = encounteredThrowables;
    this.domain = hostDomain.getDomain();
    this.partition = partition;
    this.assignment = assignment;
    this.partitionUpdateTaskStatisticsAggregator = partitionUpdateTaskStatisticsAggregator;
    // Register itself in the aggregator
    partitionUpdateTaskStatisticsAggregator.register(this);
  }

  public Domain getDomain() {
    return domain;
  }

  /** @return the data directory this task's partition is assigned to */
  public String getDataDirectory() {
    return assignment.getDisk(partition.getPartitionNumber());
  }

  /**
   * Deletes the partition if it is deletable or no longer in the target
   * domain group; otherwise updates it to the target version unless it is
   * already up-to-date. Any throwable is recorded rather than propagated, and
   * start/end times are always reported to the statistics aggregator.
   */
  @Override
  public void run() {
    PartitionUpdateTaskStatistics statistics = new PartitionUpdateTaskStatistics();
    statistics.setStartTimeMs(System.currentTimeMillis());
    try {
      // Determine target version
      DomainAndVersion targetDomainAndVersion =
          ringGroup.getDomainGroup().getDomainVersion(domain);
      // If unable to determine the version, this partition is deletable (the corresponding domain is not in the
      // target domain group version)
      if (partition.isDeletable() || targetDomainAndVersion == null) {
        deletePartition(hostDomain, partition);
      } else {
        // Determine Domain Version
        DomainVersion targetDomainVersion =
            domain.getVersion(targetDomainAndVersion.getVersionNumber());
        // Skip partitions already up-to-date
        if (partition.getCurrentDomainVersion() != null &&
            partition.getCurrentDomainVersion().equals(targetDomainVersion.getVersionNumber())) {
          LOG.info(String.format(
              "Skipping partition update of domain %s partition %d to version %d (it is already up-to-date).",
              domain.getName(), partition.getPartitionNumber(), targetDomainVersion.getVersionNumber()));
          return;
        }
        // Mark the beginning of the update by first unsetting the partition's current version number.
        // That way, if the update fails, we will have to update it again, and won't be able to serve it.
        partition.setCurrentDomainVersion(null);
        // Perform update
        StorageEngine storageEngine = domain.getStorageEngine();
        LOG.info(String.format(
            "Starting partition update of domain %s partition %d to version %d in %s.",
            domain.getName(), partition.getPartitionNumber(), targetDomainVersion.getVersionNumber(), getDataDirectory()));
        storageEngine.getUpdater(assignment, partition.getPartitionNumber()).updateTo(targetDomainVersion, statistics);
        // Record update success
        partition.setCurrentDomainVersion(targetDomainVersion.getVersionNumber());
        LOG.info(String.format(
            "Completed partition update of domain %s partition %d to version %d.",
            domain.getName(), partition.getPartitionNumber(), targetDomainVersion.getVersionNumber()));
      }
    } catch (Throwable t) {
      // Record the failure so the overall update can report it; do not let one
      // partition's failure kill the executor thread.
      LOG.error(String.format("Failed to complete partition update of domain %s partition %d.",
          domain.getName(), partition.getPartitionNumber()), t);
      encounteredThrowables.add(t);
    } finally {
      statistics.setEndTimeMs(System.currentTimeMillis());
      partitionUpdateTaskStatisticsAggregator.recordPartitionUpdateTaskStatistics(this, statistics);
    }
  }

  /**
   * Deletes the partition's data through the storage engine's deleter, then
   * removes the partition from the host domain's assignment.
   */
  private void deletePartition(HostDomain hostDomain,
                               HostDomainPartition partition) throws IOException {
    LOG.info("Deleting Domain " + hostDomain.getDomain().getName() + " partition " + partition.getPartitionNumber());
    Deleter deleter = hostDomain.getDomain().getStorageEngine().getDeleter(assignment, partition.getPartitionNumber());
    deleter.delete();
    hostDomain.removePartition(partition.getPartitionNumber());
  }

  @Override
  public int compareTo(PartitionUpdateTask other) {
    // Integer.compare replaces the verbose three-way if/else and avoids any
    // temptation to use overflow-prone subtraction.
    return Integer.compare(partition.getPartitionNumber(), other.partition.getPartitionNumber());
  }
}
/**
 * Thread factory that names updater threads after the data directory they
 * serve plus an increasing id, to make logs and thread dumps readable.
 */
private static class UpdaterThreadFactory implements ThreadFactory {

  private final String dataDirectory;
  // final: the reference never changes; AtomicInteger itself handles concurrent increments.
  private final AtomicInteger threadID = new AtomicInteger(0);

  public UpdaterThreadFactory(String dataDirectory) {
    this.dataDirectory = dataDirectory;
  }

  @Override
  public Thread newThread(Runnable r) {
    return new Thread(r, "Updater Thread Pool Thread: " + dataDirectory + " #" + threadID.getAndIncrement());
  }
}
/**
 * A fixed-size thread pool whose workers additionally acquire a shared
 * semaphore before running each task, bounding the number of concurrent
 * updates globally across all per-data-directory executors.
 */
private static class UpdateThreadPoolExecutor extends ThreadPoolExecutor {

  private final Semaphore semaphore;

  public UpdateThreadPoolExecutor(int numThreads,
                                  ThreadFactory threadFactory,
                                  Semaphore semaphore) {
    // Essentially a fixed thread pool
    super(
        numThreads,
        numThreads,
        0L,
        TimeUnit.MILLISECONDS,
        new LinkedBlockingQueue<Runnable>(),
        threadFactory);
    this.semaphore = semaphore;
  }

  @Override
  protected void beforeExecute(Thread thread, Runnable runnable) {
    super.beforeExecute(thread, runnable);
    try {
      // Blocks until a global update slot is available.
      semaphore.acquire();
    } catch (InterruptedException e) {
      // Restore the interrupt status before propagating, so the worker
      // thread's interruption is not silently swallowed.
      Thread.currentThread().interrupt();
      LOG.error("Failed to acquire update thread semaphore", e);
      throw new RuntimeException(e);
    }
  }

  @Override
  protected void afterExecute(Runnable runnable, Throwable throwable) {
    // Release the global slot whether the task succeeded or threw.
    semaphore.release();
    super.afterExecute(runnable, throwable);
  }
}
// Provides data directories and update concurrency settings.
private final PartitionServerConfigurator configurator;
// The host whose assigned partitions this manager updates.
private final Host host;
// Used to resolve the target domain group versions to update to.
private final RingGroup ringGroup;

/**
 * @param configurator source of data directories and concurrency limits
 * @param host host whose partitions will be updated
 * @param ringGroup ring group defining the target domain versions
 */
public UpdateManager(PartitionServerConfigurator configurator, Host host, RingGroup ringGroup) throws IOException {
  this.configurator = configurator;
  this.host = host;
  this.ringGroup = ringGroup;
}
/**
 * Runs a full update cycle for this host: deletes files not belonging to any
 * known partition, builds one update task per assigned partition, executes
 * them with one executor per data directory (globally bounded by a
 * semaphore), garbage collects empty host domains, and logs statistics.
 *
 * @throws IOException if any partition update task failed or an update
 *                     interruption was requested
 */
@Override
public void update() throws IOException {
  HankTimer timer = new HankTimer();
  try {
    // Delete unknown files
    deleteUnknownFiles();
    // Perform update
    Semaphore concurrentUpdatesSemaphore = new Semaphore(configurator.getNumConcurrentUpdates());
    // Synchronized: update tasks append failures concurrently from the worker
    // threads of multiple executors.
    List<Throwable> encounteredThrowables = Collections.synchronizedList(new ArrayList<Throwable>());
    PartitionUpdateTaskStatisticsAggregator partitionUpdateTaskStatisticsAggregator = new PartitionUpdateTaskStatisticsAggregator();
    Map<String, Queue<PartitionUpdateTask>> dataDirectoryToUpdateTasks = new HashMap<String, Queue<PartitionUpdateTask>>();
    List<PartitionUpdateTask> allUpdateTasks = buildPartitionUpdateTasks(partitionUpdateTaskStatisticsAggregator, encounteredThrowables);
    // Build and organize update tasks per data directory
    for (PartitionUpdateTask updateTask : allUpdateTasks) {
      String dataDirectory = updateTask.getDataDirectory();
      Queue<PartitionUpdateTask> updateTasks = dataDirectoryToUpdateTasks.get(dataDirectory);
      if (updateTasks == null) {
        updateTasks = new LinkedList<PartitionUpdateTask>();
        dataDirectoryToUpdateTasks.put(dataDirectory, updateTasks);
      }
      updateTasks.add(updateTask);
    }
    // Logging
    LOG.info("Number of update tasks: " + allUpdateTasks.size());
    for (Map.Entry<String, Queue<PartitionUpdateTask>> entry : dataDirectoryToUpdateTasks.entrySet()) {
      LOG.info("Number of update tasks scheduled in " + entry.getKey() + ": " + entry.getValue().size());
    }
    // Build executor services: one per data directory, all sharing the global semaphore
    Map<String, ExecutorService> dataDirectoryToExecutorService = new HashMap<String, ExecutorService>();
    for (String dataDirectory : dataDirectoryToUpdateTasks.keySet()) {
      dataDirectoryToExecutorService.put(dataDirectory,
          new UpdateThreadPoolExecutor(
              configurator.getMaxConcurrentUpdatesPerDataDirectory(),
              new UpdaterThreadFactory(dataDirectory),
              concurrentUpdatesSemaphore));
    }
    LOG.info("Submitting update tasks for " + dataDirectoryToUpdateTasks.size() + " directories.");
    // Execute tasks. We execute one task for each data directory and loop around so that the tasks
    // attempt to acquire the semaphore in a reasonable order.
    boolean remaining = true;
    while (remaining) {
      remaining = false;
      for (Map.Entry<String, Queue<PartitionUpdateTask>> entry : dataDirectoryToUpdateTasks.entrySet()) {
        // Pop next task
        Queue<PartitionUpdateTask> partitionUpdateTasks = entry.getValue();
        if (!partitionUpdateTasks.isEmpty()) {
          PartitionUpdateTask partitionUpdateTask = partitionUpdateTasks.remove();
          // Execute task
          dataDirectoryToExecutorService.get(entry.getKey()).execute(partitionUpdateTask);
        }
        if (!partitionUpdateTasks.isEmpty()) {
          remaining = true;
        }
      }
    }
    LOG.info("All update tasks submitted, shutting down executor services");
    // Shutdown executors
    for (ExecutorService executorService : dataDirectoryToExecutorService.values()) {
      executorService.shutdown();
    }
    LOG.info("Waiting for executors to finish.");
    // Wait for executors to finish
    for (Map.Entry<String, ExecutorService> entry : dataDirectoryToExecutorService.entrySet()) {
      String directory = entry.getKey();
      ExecutorService executorService = entry.getValue();
      boolean keepWaiting = true;
      while (keepWaiting) {
        try {
          LOG.info("Waiting for updates to complete on data directory: " + directory);
          boolean terminated = executorService.awaitTermination(
              UPDATE_EXECUTOR_TERMINATION_CHECK_TIMEOUT_VALUE,
              UPDATE_EXECUTOR_TERMINATION_CHECK_TIMEOUT_UNIT);
          if (terminated) {
            // We finished executing all tasks
            // Otherwise, timeout elapsed and current thread was not interrupted. Keep waiting.
            LOG.info("Finished updates for directory: " + directory);
            keepWaiting = false;
          }
          // Record update ETA
          Hosts.setUpdateETA(host, partitionUpdateTaskStatisticsAggregator.computeETA());
        } catch (InterruptedException e) {
          // Received interruption (stop request).
          // Swallow the interrupted state and ask the executor to shutdown immediately. Also, keep waiting.
          LOG.info("The update manager was interrupted. Stopping the update process (stop executing new partition update tasks" +
              " and wait for those that were running to finish).");
          // Shutdown all executors
          for (ExecutorService otherExecutorService : dataDirectoryToExecutorService.values()) {
            otherExecutorService.shutdownNow();
          }
          // Record failed update exception (we need to keep waiting)
          encounteredThrowables.add(new IOException("Failed to complete update: update interruption was requested."));
        }
      }
    }
    LOG.info("All executors have finished updates");
    // Shutdown all executors
    for (ExecutorService executorService : dataDirectoryToExecutorService.values()) {
      executorService.shutdownNow();
    }
    LOG.info("Finished with " + encounteredThrowables.size() + " errors.");
    // Detect failures
    if (!encounteredThrowables.isEmpty()) {
      LOG.error(String.format("%d exceptions encountered while running partition update tasks:", encounteredThrowables.size()));
      int i = 0;
      for (Throwable t : encounteredThrowables) {
        LOG.error(String.format("Exception %d/%d:", ++i, encounteredThrowables.size()), t);
      }
      throw new IOException(String.format(
          "Failed to complete update: %d exceptions encountered while running partition update tasks.",
          encounteredThrowables.size()));
    }
    // Garbage collect useless host domains
    garbageCollectHostDomains(host);
    // Log statistics
    partitionUpdateTaskStatisticsAggregator.logStats();
  } catch (IOException e) {
    LOG.info("Update failed and took " + FormatUtils.formatSecondsDuration(timer.getDurationMs() / 1000));
    throw e;
  }
  LOG.info("Update succeeded and took " + FormatUtils.formatSecondsDuration(timer.getDurationMs() / 1000));
}
/**
 * Creates one update task per partition assigned to this host. Each task is
 * registered with the given statistics aggregator and will report failures
 * into the given throwable list. The result is sorted by partition number so
 * that domains are updated concurrently but in order of partition number.
 */
private ArrayList<PartitionUpdateTask> buildPartitionUpdateTasks(
    PartitionUpdateTaskStatisticsAggregator partitionUpdateTaskStatisticsAggregator,
    List<Throwable> encounteredThrowables) throws IOException {
  ArrayList<PartitionUpdateTask> tasks = new ArrayList<PartitionUpdateTask>();
  for (HostDomain hostDomain : host.getAssignedDomains()) {
    // Resolve, once per domain, which data directory each partition lives in.
    StorageEngine storageEngine = hostDomain.getDomain().getStorageEngine();
    DiskPartitionAssignment assignment =
        storageEngine.getDataDirectoryPerPartition(configurator, getPartitionNumbers(hostDomain.getPartitions()));
    for (HostDomainPartition hostDomainPartition : hostDomain.getPartitions()) {
      PartitionUpdateTask task = new PartitionUpdateTask(
          hostDomain,
          hostDomainPartition,
          partitionUpdateTaskStatisticsAggregator,
          encounteredThrowables,
          assignment);
      tasks.add(task);
    }
  }
  // PartitionUpdateTask compares by partition number.
  Collections.sort(tasks);
  return tasks;
}
/**
 * Removes from the host any host domain that no longer holds partitions.
 *
 * @throws IOException if reading assignments or removing a domain fails
 */
private void garbageCollectHostDomains(Host host) throws IOException {
  // Delete deletable domains and partitions
  for (HostDomain hostDomain : host.getAssignedDomains()) {
    // Fetch the partitions once per domain; getPartitions() may be backed by
    // remote coordinator state (it declares IOException), so avoid calling it twice.
    Collection<HostDomainPartition> partitions = hostDomain.getPartitions();
    LOG.info("Host Domain " + hostDomain + " is assigned " + partitions.size() + " partitions.");
    // Host domain does not contain partitions anymore. Delete it.
    if (partitions.isEmpty()) {
      LOG.info("Garbage collecting Host Domain " + hostDomain + " as it is not used anymore.");
      host.removeDomain(hostDomain.getDomain());
    }
  }
}
/**
 * Deletes, from every configured data directory, all files that do not belong
 * to any currently assigned partition version. The expected-file set is built
 * from the storage engines' reported file lists (canonical paths), including
 * every parent directory of each expected file.
 */
private void deleteUnknownFiles() throws IOException {
  // Compute expected files
  Set<String> expectedFiles = new HashSet<String>();
  for (HostDomain hostDomain : host.getAssignedDomains()) {
    StorageEngine storageEngine = hostDomain.getDomain().getStorageEngine();
    DiskPartitionAssignment assignments = storageEngine.getDataDirectoryPerPartition(configurator, getPartitionNumbers(hostDomain.getPartitions()));
    for (HostDomainPartition hostDomainPartition : hostDomain.getPartitions()) {
      Integer versionNumber = hostDomainPartition.getCurrentDomainVersion();
      // Partitions with no current version contribute no expected files.
      if (versionNumber != null) {
        for (String filePath : storageEngine.getFiles(assignments, versionNumber, hostDomainPartition.getPartitionNumber())) {
          File file = new File(filePath);
          // Add the file itself (canonical path, so comparisons below are exact)
          expectedFiles.add(file.getCanonicalPath());
          // Add all parent directories
          File parent = file.getParentFile();
          while (parent != null) {
            expectedFiles.add(parent.getCanonicalPath());
            parent = parent.getParentFile();
          }
        }
      }
    }
  }
  // Delete unknown files
  for (String dataDirectoryPath : configurator.getDataDirectories()) {
    LOG.info("Deleting unknown files in " + dataDirectoryPath);
    File dataDirectory = new File(dataDirectoryPath);
    if (dataDirectory.exists()) {
      // Recursively list all regular files under the data directory.
      for (File file : FileUtils.listFiles(dataDirectory, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE)) {
        if (!expectedFiles.contains(file.getCanonicalPath())) {
          LOG.info("Deleting unknown file: " + file.getCanonicalPath());
          // Best-effort deletion: failures are ignored here.
          FileUtils.deleteQuietly(file);
        }
      }
    }
  }
}
/**
 * Collects the partition numbers of the given host domain partitions.
 */
private static Set<Integer> getPartitionNumbers(Collection<HostDomainPartition> partition) {
  Set<Integer> numbers = new HashSet<Integer>();
  for (HostDomainPartition hostDomainPartition : partition) {
    numbers.add(hostDomainPartition.getPartitionNumber());
  }
  return numbers;
}
}