/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.server.tabletserver;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.accumulo.core.client.Instance;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.impl.Tables;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.server.conf.ServerConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The LargestFirstMemoryManager attempts to keep memory between 80% and 90% full. It adapts over time the point at which it should start a compaction based on
 * how full memory gets between successive calls. It will also flush idle tablets based on a per-table configurable idle time. Within a single call it stops
 * choosing tablets once the memory being flushed exceeds 20% of the memory currently held in memtables. And, as the name of the class would suggest, it
 * flushes the tablet with the highest memory footprint. However, it actually chooses the tablet as a function of its size doubled for every 15 minutes of
 * idle time.
 */
public class LargestFirstMemoryManager implements MemoryManager {

  private static final Logger log = LoggerFactory.getLogger(LargestFirstMemoryManager.class);

  // Lower bound applied to every tablet's last commit time when computing idle time (captured at class load).
  private static final long ZERO_TIME = System.currentTimeMillis();
  private static final int TSERV_MINC_MAXCONCURRENT_NUMWAITING_MULTIPLIER = 2;
  // Selection stops once the memory being compacted exceeds this fraction of the current ingest memory.
  private static final double MAX_FLUSH_AT_ONCE_PERCENT = 0.20;

  // A negative value means init(ServerConfiguration) has not been called yet.
  private long maxMemory = -1;
  private int maxConcurrentMincs;
  private int numWaitingMultiplier;
  // Sum of memtable sizes seen on the previous call; used to estimate growth between calls.
  private long prevIngestMemory;
  // The fraction of memory that needs to be used before we begin flushing.
  private double compactionThreshold;
  // Largest (ingest + compacting) memory seen since the threshold was last adjusted.
  private long maxObserved;
  // Per-call cache of table id -> idle threshold in milliseconds; cleared at the start of every call.
  private final HashMap<String,Long> mincIdleThresholds = new HashMap<>();
  private ServerConfiguration config = null;

  /** Immutable snapshot of the per-tablet values used while choosing what to compact. */
  private static class TabletInfo {
    final KeyExtent extent;
    final long memTableSize;
    final long idleTime;
    final long load;

    public TabletInfo(KeyExtent extent, long memTableSize, long idleTime, long load) {
      this.extent = extent;
      this.memTableSize = memTableSize;
      this.idleTime = idleTime;
      this.load = load;
    }
  }

  /**
   * A little map that will hold the "largest" N tablets, where largest is a result of the timeMemoryLoad function. N bounds the number of distinct load values;
   * tablets sharing the same load value are kept together in one list.
   */
  private static class LargestMap {
    final int max;
    final TreeMap<Long,List<TabletInfo>> map = new TreeMap<>();

    LargestMap(int n) {
      max = n;
    }

    /**
     * Offers a tablet to the map.
     *
     * @param key
     *          the load value the tablet is ranked by
     * @param value
     *          the tablet to remember
     * @return true if the tablet was retained, false if it ranked below everything already held or the map has no capacity
     */
    public boolean put(Long key, TabletInfo value) {
      // A map with no capacity can hold nothing; without this guard firstKey() below would throw on the empty map.
      if (max <= 0) {
        return false;
      }
      if (map.size() >= max && key.compareTo(map.firstKey()) < 0) {
        return false;
      }
      add(key, value);
      // Evict the smallest load only when a new key pushed the map over capacity. A value appended to an
      // existing key does not grow the map, so nothing has to be dropped to make room for it.
      if (map.size() > max) {
        map.remove(map.firstKey());
      }
      return true;
    }

    // Appends the value to the list stored under key, creating the list on first use.
    private void add(Long key, TabletInfo value) {
      List<TabletInfo> tabletsForKey = map.get(key);
      if (tabletsForKey == null) {
        tabletsForKey = new ArrayList<>();
        map.put(key, tabletsForKey);
      }
      tabletsForKey.add(value);
    }

    public boolean isEmpty() {
      return map.isEmpty();
    }

    /** Returns the entry with the highest load value, or null when the map is empty. */
    public Entry<Long,List<TabletInfo>> lastEntry() {
      return map.lastEntry();
    }

    public void remove(Long key) {
      map.remove(key);
    }
  }

  /**
   * Creates a manager with explicit limits instead of reading them from configuration. No {@link ServerConfiguration} is set by this constructor.
   */
  LargestFirstMemoryManager(long maxMemory, int maxConcurrentMincs, int numWaitingMultiplier) {
    this();
    this.maxMemory = maxMemory;
    this.maxConcurrentMincs = maxConcurrentMincs;
    this.numWaitingMultiplier = numWaitingMultiplier;
  }

  /**
   * Reads the maximum memory and the maximum number of concurrent minor compactions from the system configuration and remembers the configuration for later
   * per-table lookups.
   */
  @Override
  public void init(ServerConfiguration conf) {
    this.config = conf;
    maxMemory = conf.getSystemConfiguration().getAsBytes(Property.TSERV_MAXMEM);
    maxConcurrentMincs = conf.getSystemConfiguration().getCount(Property.TSERV_MINC_MAXCONCURRENT);
    numWaitingMultiplier = TSERV_MINC_MAXCONCURRENT_NUMWAITING_MULTIPLIER;
  }

  /** Creates an uninitialized manager; {@link #init(ServerConfiguration)} must be called before use. */
  public LargestFirstMemoryManager() {
    prevIngestMemory = 0;
    compactionThreshold = 0.5;
    maxObserved = 0;
  }

  /**
   * Returns the idle time, in milliseconds, after which a tablet of the extent's table becomes a candidate for an idle minor compaction. The value is looked
   * up once per table and cached until the next call to {@link #getMemoryManagementActions(List)}.
   */
  protected long getMinCIdleThreshold(KeyExtent extent) {
    String tableId = extent.getTableId();
    Long threshold = mincIdleThresholds.get(tableId);
    if (threshold == null) {
      threshold = config.getTableConfiguration(tableId).getTimeInMillis(Property.TABLE_MINC_COMPACT_IDLETIME);
      mincIdleThresholds.put(tableId, threshold);
    }
    return threshold;
  }

  /** Returns whether the table still exists; tablets of tables that do not are skipped. */
  protected boolean tableExists(Instance instance, String tableId) {
    return Tables.exists(instance, tableId);
  }

  /**
   * Decides which tablets, if any, should be minor compacted now.
   *
   * @param tablets
   *          the current state of the hosted tablets
   * @return the actions to take; {@code tabletsToMinorCompact} is always a non-null, possibly empty, list
   * @throws IllegalStateException
   *           if the maximum memory has not been set
   */
  @Override
  public MemoryManagementActions getMemoryManagementActions(List<TabletState> tablets) {
    if (maxMemory < 0)
      throw new IllegalStateException("need to initialize " + LargestFirstMemoryManager.class.getName());

    // config is null when the manager was built through the package-private constructor. The instance is
    // only handed to tableExists(), so let that hook decide what to do instead of failing here.
    final Instance instance = (config == null) ? null : config.getInstance();
    final int maxMinCs = maxConcurrentMincs * numWaitingMultiplier;

    mincIdleThresholds.clear();
    final MemoryManagementActions result = new MemoryManagementActions();
    result.tabletsToMinorCompact = new ArrayList<>();

    LargestMap largestMemTablets = new LargestMap(maxMinCs);
    final LargestMap largestIdleMemTablets = new LargestMap(maxConcurrentMincs);
    final long now = currentTimeMillis();

    long ingestMemory = 0;
    long compactionMemory = 0;
    int numWaitingMincs = 0;

    // find the largest and most idle tablets
    for (TabletState ts : tablets) {
      // Make sure that the table still exists
      if (!tableExists(instance, ts.getExtent().getTableId())) {
        log.trace("Ignoring extent for deleted table: {}", ts.getExtent());
        continue;
      }

      final long memTabletSize = ts.getMemTableSize();
      final long minorCompactingSize = ts.getMinorCompactingMemTableSize();
      final long idleTime = now - Math.max(ts.getLastCommitTime(), ZERO_TIME);
      final long timeMemoryLoad = timeMemoryLoad(memTabletSize, idleTime);
      ingestMemory += memTabletSize;

      // Only tablets that hold data and are not already minor compacting are candidates.
      if (minorCompactingSize == 0 && memTabletSize > 0) {
        TabletInfo tabletInfo = new TabletInfo(ts.getExtent(), memTabletSize, idleTime, timeMemoryLoad);
        try {
          // If the table was deleted, getMinCIdleThreshold will throw an exception
          if (idleTime > getMinCIdleThreshold(ts.getExtent())) {
            largestIdleMemTablets.put(timeMemoryLoad, tabletInfo);
          }
        } catch (IllegalArgumentException e) {
          if (e.getCause() instanceof TableNotFoundException) {
            log.trace("Ignoring extent for deleted table: {}", ts.getExtent());
            // The table might have been deleted during the iteration of the tablets
            // We just want to eat this exception, do nothing with this tablet, and continue
            continue;
          }
          throw e;
        }
        // Only place the tablet into largestMemTablets map when the table still exists
        largestMemTablets.put(timeMemoryLoad, tabletInfo);
      }

      compactionMemory += minorCompactingSize;
      if (minorCompactingSize > 0)
        numWaitingMincs++;
    }

    if (ingestMemory + compactionMemory > maxObserved) {
      maxObserved = ingestMemory + compactionMemory;
    }

    final long memoryChange = ingestMemory - prevIngestMemory;
    prevIngestMemory = ingestMemory;

    boolean startMinC = false;

    if (numWaitingMincs < maxMinCs) {
      // based on previous ingest memory increase, if we think that the next increase will
      // take us over the threshold for non-compacting memory, then start a minor compaction
      // or if the idle time of the chosen tablet is greater than the threshold, start a minor compaction
      if (memoryChange >= 0 && ingestMemory + memoryChange > compactionThreshold * maxMemory) {
        startMinC = true;
      } else if (!largestIdleMemTablets.isEmpty()) {
        startMinC = true;
        // switch largestMemTablets to largestIdleMemTablets
        largestMemTablets = largestIdleMemTablets;
        log.debug("IDLE minor compaction chosen");
      }
    }

    if (startMinC) {
      // Memory already being minor compacted counts toward the flush-at-once limit.
      long toBeCompacted = compactionMemory;
      outer: for (int i = numWaitingMincs; i < maxMinCs && !largestMemTablets.isEmpty(); /* empty */) {
        Entry<Long,List<TabletInfo>> lastEntry = largestMemTablets.lastEntry();
        for (TabletInfo largest : lastEntry.getValue()) {
          toBeCompacted += largest.memTableSize;
          result.tabletsToMinorCompact.add(largest.extent);
          // Formatting is comparatively expensive, so skip it entirely unless debug output is wanted.
          if (log.isDebugEnabled()) {
            log.debug(String.format("COMPACTING %s total = %,d ingestMemory = %,d", largest.extent.toString(), (ingestMemory + compactionMemory), ingestMemory));
            log.debug(String.format("chosenMem = %,d chosenIT = %.2f load %,d", largest.memTableSize, largest.idleTime / 1000.0, largest.load));
          }
          if (toBeCompacted > ingestMemory * MAX_FLUSH_AT_ONCE_PERCENT)
            break outer;
          i++;
          // Tablets sharing a load value live in one list; enforce the compaction limit inside it as well.
          if (i >= maxMinCs)
            break outer;
        }
        largestMemTablets.remove(lastEntry.getKey());
      }
    } else if (memoryChange < 0) {
      // before idle mincs, starting a minor compaction meant that memoryChange >= 0.
      // we thought we might want to remove the "else" if that changed,
      // however it seems performing idle compactions shouldn't make the threshold
      // change more often, so it is staying for now.
      // also, now we have the case where memoryChange < 0 due to an idle compaction, yet
      // we are still adjusting the threshold. should this be tracked and prevented?
      adjustCompactionThreshold();
    }

    return result;
  }

  // Nudges compactionThreshold toward the 80%-90% utilization goal and restarts the maxObserved tracking.
  private void adjustCompactionThreshold() {
    // memory change < 0 means a minor compaction occurred
    // we want to see how full the memory got during the compaction
    // (the goal is for it to have between 80% and 90% memory utilization)
    // and adjust the compactionThreshold accordingly
    if (log.isDebugEnabled()) {
      log.debug(String.format("BEFORE compactionThreshold = %.3f maxObserved = %,d", compactionThreshold, maxObserved));
    }
    if (compactionThreshold < 0.82 && maxObserved < 0.8 * maxMemory) {
      // 0.82 * 1.1 is about 0.9, which is our desired max threshold
      compactionThreshold *= 1.1;
    } else if (compactionThreshold > 0.056 && maxObserved > 0.9 * maxMemory) {
      // 0.056 * 0.9 is about 0.05, which is our desired min threshold
      compactionThreshold *= 0.9;
    }
    maxObserved = 0;
    if (log.isDebugEnabled()) {
      log.debug(String.format("AFTER compactionThreshold = %.3f", compactionThreshold));
    }
  }

  /** Returns the current wall-clock time in milliseconds; protected so subclasses can supply their own clock. */
  protected long currentTimeMillis() {
    return System.currentTimeMillis();
  }

  /** Does nothing; this manager keeps no per-tablet state between calls. */
  @Override
  public void tabletClosed(KeyExtent extent) {}

  /**
   * The load function: memory times the idle time, doubling every 15 mins.
   *
   * @param mem
   *          the tablet's memtable size
   * @param time
   *          the tablet's idle time in milliseconds
   * @return {@code mem * 2^(time / 15 minutes)}, narrowed to a long
   */
  static long timeMemoryLoad(long mem, long time) {
    double minutesIdle = time / 60000.0;

    return (long) (mem * Math.pow(2, minutesIdle / 15.0));
  }
}