/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.tserver.tablet;
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import org.apache.accumulo.core.conf.ConfigurationTypeHelper;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.core.metadata.schema.DataFileValue;
import org.apache.accumulo.core.replication.ReplicationConfigurationUtil;
import org.apache.accumulo.core.trace.Span;
import org.apache.accumulo.core.trace.Trace;
import org.apache.accumulo.core.util.MapCounter;
import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.fate.zookeeper.IZooReaderWriter;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.fs.FileRef;
import org.apache.accumulo.server.fs.VolumeManager;
import org.apache.accumulo.server.master.state.TServerInstance;
import org.apache.accumulo.server.replication.StatusUtil;
import org.apache.accumulo.server.util.MasterMetadataUtil;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.accumulo.server.util.ReplicationTableUtil;
import org.apache.accumulo.server.zookeeper.ZooReaderWriter;
import org.apache.accumulo.tserver.TLevel;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
/**
 * Tracks the set of data files that make up a tablet and coordinates changes to that set with
 * concurrent scans, minor compactions, major compactions, and bulk imports.
 *
 * <p>
 * Locking discipline: mutations of {@link #datafileSizes}, the scan reservation state, and the
 * compaction bookkeeping fields are performed while synchronized on {@link #tablet}, which is also
 * the monitor used for wait/notify between scans and compactions. NOTE(review): several fields
 * (e.g. {@code mergingMinorCompactionFile}, {@code majorCompactingFiles}) are read/written without
 * an explicit lock in some methods — presumably the callers already hold the tablet lock; confirm
 * against the call sites in Tablet.
 */
class DatafileManager {
  private final Logger log = Logger.getLogger(DatafileManager.class);

  // access to datafilesizes needs to be synchronized: see CompactionRunner#getNumFiles
  private final Map<FileRef,DataFileValue> datafileSizes = Collections.synchronizedMap(new TreeMap<FileRef,DataFileValue>());

  private final Tablet tablet;

  // ensure we only have one reader/writer of our bulk file notes at a time
  private final Object bulkFileImportLock = new Object();

  DatafileManager(Tablet tablet, SortedMap<FileRef,DataFileValue> datafileSizes) {
    this.datafileSizes.putAll(datafileSizes);
    this.tablet = tablet;
  }

  // file currently reserved as the merge target of a merging minor compaction, or null
  private FileRef mergingMinorCompactionFile = null;
  // files whose scan refs should be removed from the metadata table once no scan uses them
  private final Set<FileRef> filesToDeleteAfterScan = new HashSet<>();
  // reservation id -> files that were handed out to that scan
  private final Map<Long,Set<FileRef>> scanFileReservations = new HashMap<>();
  // per-file count of outstanding scan reservations
  private final MapCounter<FileRef> fileScanReferenceCounts = new MapCounter<>();
  private long nextScanReservationId = 0;
  // true while waitForScansToFinish(blockNewScans=true) is draining scans
  private boolean reservationsBlocked = false;
  private final Set<FileRef> majorCompactingFiles = new HashSet<>();

  /**
   * Renames src to dst, converting the boolean failure return of
   * {@link VolumeManager#rename(Path, Path)} into an IOException.
   *
   * @throws IOException
   *           if the underlying rename reports failure or throws
   */
  static void rename(VolumeManager fs, Path src, Path dst) throws IOException {
    if (!fs.rename(src, dst)) {
      throw new IOException("Rename " + src + " to " + dst + " returned false ");
    }
  }

  /**
   * Reserves the tablet's current set of files for a scan, incrementing each file's reference
   * count so it will not be garbage collected while the scan runs.
   *
   * @return a pair of (reservation id to pass to {@link #returnFilesForScan(Long)}, snapshot of
   *         file -> size info at reservation time)
   */
  Pair<Long,Map<FileRef,DataFileValue>> reserveFilesForScan() {
    synchronized (tablet) {
      // wait while a compaction is draining scans; do not re-interrupt here — wait() would then
      // throw immediately on every iteration and busy-spin the loop
      while (reservationsBlocked) {
        try {
          tablet.wait(50);
        } catch (InterruptedException e) {
          log.warn(e, e);
        }
      }

      Set<FileRef> absFilePaths = new HashSet<>(datafileSizes.keySet());

      long rid = nextScanReservationId++;

      scanFileReservations.put(rid, absFilePaths);

      Map<FileRef,DataFileValue> ret = new HashMap<>();

      for (FileRef path : absFilePaths) {
        fileScanReferenceCounts.increment(path, 1);
        ret.put(path, datafileSizes.get(path));
      }

      return new Pair<>(rid, ret);
    }
  }

  /**
   * Releases a scan reservation made by {@link #reserveFilesForScan()}. Files whose reference
   * count drops to zero and that were queued in {@link #filesToDeleteAfterScan} get their scan
   * references removed from the metadata table.
   *
   * @throws IllegalArgumentException
   *           if the reservation id is unknown
   */
  void returnFilesForScan(Long reservationId) {

    final Set<FileRef> filesToDelete = new HashSet<>();

    synchronized (tablet) {
      Set<FileRef> absFilePaths = scanFileReservations.remove(reservationId);

      if (absFilePaths == null)
        throw new IllegalArgumentException("Unknown scan reservation id " + reservationId);

      boolean notify = false;
      for (FileRef path : absFilePaths) {
        long refCount = fileScanReferenceCounts.decrement(path, 1);
        if (refCount == 0) {
          if (filesToDeleteAfterScan.remove(path))
            filesToDelete.add(path);
          notify = true;
        } else if (refCount < 0)
          throw new IllegalStateException("Scan ref count for " + path + " is " + refCount);
      }

      if (notify)
        tablet.notifyAll();
    }

    // do the metadata write outside the tablet lock
    if (!filesToDelete.isEmpty()) {
      log.debug("Removing scan refs from metadata " + tablet.getExtent() + " " + filesToDelete);
      MetadataTableUtil.removeScanFiles(tablet.getExtent(), filesToDelete, tablet.getTabletServer(), tablet.getTabletServer().getLock());
    }
  }

  /**
   * Removes metadata scan references for files that are no longer part of the tablet. Files still
   * referenced by an active scan are deferred via {@link #filesToDeleteAfterScan} and cleaned up
   * when the last reservation is returned.
   */
  void removeFilesAfterScan(Set<FileRef> scanFiles) {
    if (scanFiles.isEmpty())
      return;

    Set<FileRef> filesToDelete = new HashSet<>();

    synchronized (tablet) {
      for (FileRef path : scanFiles) {
        if (fileScanReferenceCounts.get(path) == 0)
          filesToDelete.add(path);
        else
          filesToDeleteAfterScan.add(path);
      }
    }

    if (!filesToDelete.isEmpty()) {
      log.debug("Removing scan refs from metadata " + tablet.getExtent() + " " + filesToDelete);
      MetadataTableUtil.removeScanFiles(tablet.getExtent(), filesToDelete, tablet.getTabletServer(), tablet.getTabletServer().getLock());
    }
  }

  /**
   * Waits up to maxWaitTime ms for outstanding scan reservations on the given files to drain.
   *
   * @param blockNewScans
   *          if true, new scan reservations are blocked for the duration of the wait
   * @return the subset of pathsToWaitFor that is still in use by scans when the wait ends
   */
  private TreeSet<FileRef> waitForScansToFinish(Set<FileRef> pathsToWaitFor, boolean blockNewScans, long maxWaitTime) {
    long startTime = System.currentTimeMillis();
    TreeSet<FileRef> inUse = new TreeSet<>();

    Span waitForScans = Trace.start("waitForScans");
    try {
      synchronized (tablet) {
        if (blockNewScans) {
          if (reservationsBlocked)
            throw new IllegalStateException();

          reservationsBlocked = true;
        }

        for (FileRef path : pathsToWaitFor) {
          while (fileScanReferenceCounts.get(path) > 0 && System.currentTimeMillis() - startTime < maxWaitTime) {
            try {
              tablet.wait(100);
            } catch (InterruptedException e) {
              log.warn(e, e);
            }
          }
        }

        for (FileRef path : pathsToWaitFor) {
          if (fileScanReferenceCounts.get(path) > 0)
            inUse.add(path);
        }

        if (blockNewScans) {
          reservationsBlocked = false;
          tablet.notifyAll();
        }
      }
    } finally {
      waitForScans.stop();
    }
    return inUse;
  }

  /**
   * Adds bulk-imported files to this tablet.
   *
   * <p>
   * Validates that all files live in a single bulk directory under one of this table's
   * directories, optionally stamps the files with tablet time, persists the import, then updates
   * the in-memory file set.
   *
   * @param tid
   *          bulk import transaction id
   * @param setTime
   *          if true, assign each file a fresh tablet timestamp
   * @throws IOException
   *           if a file is outside the table's directories
   */
  public void importMapFiles(long tid, Map<FileRef,DataFileValue> pathsString, boolean setTime) throws IOException {

    String bulkDir = null;

    // defensive copy so mutation of DataFileValue times below does not surprise the caller's view
    Map<FileRef,DataFileValue> paths = new HashMap<>(pathsString);

    for (FileRef tpath : paths.keySet()) {

      boolean inTheRightDirectory = false;
      Path parent = tpath.path().getParent().getParent();
      for (String tablesDir : ServerConstants.getTablesDirs()) {
        if (parent.equals(new Path(tablesDir, tablet.getExtent().getTableId()))) {
          inTheRightDirectory = true;
          break;
        }
      }
      if (!inTheRightDirectory) {
        throw new IOException("Data file " + tpath + " not in table dirs");
      }

      if (bulkDir == null)
        bulkDir = tpath.path().getParent().toString();
      else if (!bulkDir.equals(tpath.path().getParent().toString()))
        throw new IllegalArgumentException("bulk files in different dirs " + bulkDir + " " + tpath);

    }

    if (tablet.getExtent().isMeta()) {
      throw new IllegalArgumentException("Can not import files to a metadata tablet");
    }

    synchronized (bulkFileImportLock) {

      if (paths.size() > 0) {
        long bulkTime = Long.MIN_VALUE;
        if (setTime) {
          for (DataFileValue dfv : paths.values()) {
            long nextTime = tablet.getAndUpdateTime();
            if (nextTime < bulkTime)
              throw new IllegalStateException("Time went backwards unexpectedly " + nextTime + " " + bulkTime);
            bulkTime = nextTime;
            dfv.setTime(bulkTime);
          }
        }

        tablet.updatePersistedTime(bulkTime, paths, tid);
      }
    }

    synchronized (tablet) {
      for (Entry<FileRef,DataFileValue> tpath : paths.entrySet()) {
        if (datafileSizes.containsKey(tpath.getKey())) {
          log.error("Adding file that is already in set " + tpath.getKey());
        }

        datafileSizes.put(tpath.getKey(), tpath.getValue());

      }

      tablet.getTabletResources().importedMapFiles();

      tablet.computeNumEntries();
    }

    for (Entry<FileRef,DataFileValue> entry : paths.entrySet()) {
      log.log(TLevel.TABLET_HIST, tablet.getExtent() + " import " + entry.getKey() + " " + entry.getValue());
    }
  }

  /**
   * Picks and reserves the smallest eligible file to merge into the next minor compaction, or
   * returns null if no merge should happen (root tablet, under the file limit, a major compaction
   * is about to free up space, or no file is small enough).
   *
   * @throws IllegalStateException
   *           if a merge file is already reserved
   */
  FileRef reserveMergingMinorCompactionFile() {
    if (mergingMinorCompactionFile != null)
      throw new IllegalStateException("Tried to reserve merging minor compaction file when already reserved : " + mergingMinorCompactionFile);

    if (tablet.getExtent().isRootTablet())
      return null;

    int maxFiles = tablet.getTableConfiguration().getMaxFilesPerTablet();

    // when a major compaction is running and we are at max files, write out
    // one extra file... want to avoid the case where major compaction is
    // compacting everything except for the largest file, and therefore the
    // largest file is returned for merging.. the following check mostly
    // avoids this case, except for the case where major compactions fail or
    // are canceled
    if (majorCompactingFiles.size() > 0 && datafileSizes.size() == maxFiles)
      return null;

    if (datafileSizes.size() >= maxFiles) {
      // find the smallest file
      long maxFileSize = Long.MAX_VALUE;
      // a configured max merge file size caps which files are eligible for merging
      long maxMergingMinorCompactionFileSize = ConfigurationTypeHelper.getFixedMemoryAsBytes(tablet.getTableConfiguration().get(
          Property.TABLE_MINC_MAX_MERGE_FILE_SIZE));
      if (maxMergingMinorCompactionFileSize > 0) {
        maxFileSize = maxMergingMinorCompactionFileSize;
      }
      long min = maxFileSize;
      FileRef minName = null;

      for (Entry<FileRef,DataFileValue> entry : datafileSizes.entrySet()) {
        if (entry.getValue().getSize() <= min && !majorCompactingFiles.contains(entry.getKey())) {
          min = entry.getValue().getSize();
          minName = entry.getKey();
        }
      }

      if (minName == null)
        return null;

      mergingMinorCompactionFile = minName;
      return minName;
    }

    return null;
  }

  /**
   * Releases the merge file reservation. The argument must match the currently reserved file
   * (both may be null).
   */
  void unreserveMergingMinorCompactionFile(FileRef file) {
    if ((file == null && mergingMinorCompactionFile != null) || (file != null && mergingMinorCompactionFile == null)
        || (file != null && mergingMinorCompactionFile != null && !file.equals(mergingMinorCompactionFile)))
      throw new IllegalStateException("Disagreement " + file + " " + mergingMinorCompactionFile);

    mergingMinorCompactionFile = null;
  }

  /**
   * Makes a completed minor compaction's output file visible: renames the tmp file, writes the
   * metadata entry, logs the compaction finish to the WAL, then updates the in-memory file set.
   *
   * <p>
   * The ordering of these steps is load-bearing for crash recovery and for garbage-collection
   * safety — see the inline comments before changing it.
   *
   * @param absMergeFile
   *          file that was merged into this minor compaction, or null if none
   */
  void bringMinorCompactionOnline(FileRef tmpDatafile, FileRef newDatafile, FileRef absMergeFile, DataFileValue dfv, CommitSession commitSession, long flushId)
      throws IOException {

    IZooReaderWriter zoo = ZooReaderWriter.getInstance();
    if (tablet.getExtent().isRootTablet()) {
      try {
        if (!zoo.isLockHeld(tablet.getTabletServer().getLock().getLockID())) {
          throw new IllegalStateException();
        }
      } catch (Exception e) {
        throw new IllegalStateException("Can not bring major compaction online, lock not held", e);
      }
    }

    // rename before putting in metadata table, so files in metadata table should
    // always exist
    do {
      try {
        if (dfv.getNumEntries() == 0) {
          // nothing was flushed; discard the empty tmp file
          tablet.getTabletServer().getFileSystem().deleteRecursively(tmpDatafile.path());
        } else {
          if (tablet.getTabletServer().getFileSystem().exists(newDatafile.path())) {
            log.warn("Target map file already exist " + newDatafile);
            tablet.getTabletServer().getFileSystem().deleteRecursively(newDatafile.path());
          }

          rename(tablet.getTabletServer().getFileSystem(), tmpDatafile.path(), newDatafile.path());
        }
        break;
      } catch (IOException ioe) {
        // retry forever; losing the flushed file is not an option
        log.warn("Tablet " + tablet.getExtent() + " failed to rename " + newDatafile + " after MinC, will retry in 60 secs...", ioe);
        sleepUninterruptibly(1, TimeUnit.MINUTES);
      }
    } while (true);

    long t1, t2;

    // the code below always assumes merged files are in use by scans... this must be done
    // because the in memory list of files is not updated until after the metadata table
    // therefore the file is available to scans until memory is updated, but want to ensure
    // the file is not available for garbage collection... if memory were updated
    // before this point (like major compactions do), then the following code could wait
    // for scans to finish like major compactions do.... used to wait for scans to finish
    // here, but that was incorrect because a scan could start after waiting but before
    // memory was updated... assuming the file is always in use by scans leads to
    // one unneeded metadata update when it was not actually in use
    Set<FileRef> filesInUseByScans = Collections.emptySet();
    if (absMergeFile != null)
      filesInUseByScans = Collections.singleton(absMergeFile);

    // very important to write delete entries outside of log lock, because
    // this metadata write does not go up... it goes sideways or to itself
    if (absMergeFile != null)
      MetadataTableUtil.addDeleteEntries(tablet.getExtent(), Collections.singleton(absMergeFile), tablet.getTabletServer());

    Set<String> unusedWalLogs = tablet.beginClearingUnusedLogs();
    boolean replicate = ReplicationConfigurationUtil.isEnabled(tablet.getExtent(), tablet.getTableConfiguration());
    Set<String> logFileOnly = null;
    if (replicate) {
      // unusedWalLogs is of the form host/fileURI, need to strip off the host portion
      logFileOnly = new HashSet<>();
      for (String unusedWalLog : unusedWalLogs) {
        int index = unusedWalLog.indexOf('/');
        if (-1 == index) {
          log.warn("Could not find host component to strip from DFSLogger representation of WAL");
        } else {
          unusedWalLog = unusedWalLog.substring(index + 1);
        }
        logFileOnly.add(unusedWalLog);
      }
    }
    try {
      // the order of writing to metadata and walog is important in the face of machine/process failures
      // need to write to metadata before writing to walog, when things are done in the reverse order
      // data could be lost... the minor compaction start event should be written before the following metadata
      // write is made
      tablet.updateTabletDataFile(commitSession.getMaxCommittedTime(), newDatafile, absMergeFile, dfv, unusedWalLogs, filesInUseByScans, flushId);

      // Mark that we have data we want to replicate
      // This WAL could still be in use by other Tablets *from the same table*, so we can only mark that there is data to replicate,
      // but it is *not* closed. We know it is not closed by the fact that this MinC triggered. A MinC cannot happen unless the
      // tablet is online and thus these WALs are referenced by that tablet. Therefore, the WAL replication status cannot be 'closed'.
      if (replicate) {
        if (log.isDebugEnabled()) {
          log.debug("Recording that data has been ingested into " + tablet.getExtent() + " using " + logFileOnly);
        }
        for (String logFile : logFileOnly) {
          ReplicationTableUtil.updateFiles(tablet.getTabletServer(), tablet.getExtent(), logFile, StatusUtil.openWithUnknownLength());
        }
      }
    } finally {
      tablet.finishClearingUnusedLogs();
    }

    do {
      try {
        // the purpose of making this update use the new commit session, instead of the old one passed in,
        // is because the new one will reference the logs used by current memory...

        tablet.getTabletServer().minorCompactionFinished(tablet.getTabletMemory().getCommitSession(), newDatafile.toString(), commitSession.getWALogSeq() + 2);
        break;
      } catch (IOException e) {
        log.error("Failed to write to write-ahead log " + e.getMessage() + " will retry", e);
        sleepUninterruptibly(1, TimeUnit.SECONDS);
      }
    } while (true);

    synchronized (tablet) {
      t1 = System.currentTimeMillis();

      if (datafileSizes.containsKey(newDatafile)) {
        log.error("Adding file that is already in set " + newDatafile);
      }

      if (dfv.getNumEntries() > 0) {
        datafileSizes.put(newDatafile, dfv);
      }

      if (absMergeFile != null) {
        datafileSizes.remove(absMergeFile);
      }

      unreserveMergingMinorCompactionFile(absMergeFile);

      tablet.flushComplete(flushId);

      t2 = System.currentTimeMillis();
    }

    // must do this after list of files in memory is updated above
    removeFilesAfterScan(filesInUseByScans);

    if (absMergeFile != null)
      log.log(TLevel.TABLET_HIST, tablet.getExtent() + " MinC [" + absMergeFile + ",memory] -> " + newDatafile);
    else
      log.log(TLevel.TABLET_HIST, tablet.getExtent() + " MinC [memory] -> " + newDatafile);
    log.debug(String.format("MinC finish lock %.2f secs %s", (t2 - t1) / 1000.0, tablet.getExtent().toString()));
    long splitSize = tablet.getTableConfiguration().getAsBytes(Property.TABLE_SPLIT_THRESHOLD);
    if (dfv.getSize() > splitSize) {
      log.debug(String.format("Minor Compaction wrote out file larger than split threshold. split threshold = %,d file size = %,d", splitSize, dfv.getSize()));
    }
  }

  /**
   * Marks the given files as participating in a major compaction.
   *
   * @throws IllegalStateException
   *           if files are already reserved, or if a file is reserved by a merging minor
   *           compaction
   */
  public void reserveMajorCompactingFiles(Collection<FileRef> files) {
    if (majorCompactingFiles.size() != 0)
      throw new IllegalStateException("Major compacting files not empty " + majorCompactingFiles);

    if (mergingMinorCompactionFile != null && files.contains(mergingMinorCompactionFile))
      throw new IllegalStateException("Major compaction tried to reserve file in use by minor compaction " + mergingMinorCompactionFile);

    majorCompactingFiles.addAll(files);
  }

  /** Clears the major compaction file reservation. */
  public void clearMajorCompactingFile() {
    majorCompactingFiles.clear();
  }

  /**
   * Makes a completed major compaction's output file visible, replacing the input files in the
   * in-memory set and in the metadata table (or in ZooKeeper-backed root tablet files).
   */
  void bringMajorCompactionOnline(Set<FileRef> oldDatafiles, FileRef tmpDatafile, FileRef newDatafile, Long compactionId, DataFileValue dfv) throws IOException {
    final KeyExtent extent = tablet.getExtent();
    long t1, t2;

    if (!extent.isRootTablet()) {

      if (tablet.getTabletServer().getFileSystem().exists(newDatafile.path())) {
        log.error("Target map file already exist " + newDatafile, new Exception());
        throw new IllegalStateException("Target map file already exist " + newDatafile);
      }

      // rename before putting in metadata table, so files in metadata table should
      // always exist
      rename(tablet.getTabletServer().getFileSystem(), tmpDatafile.path(), newDatafile.path());

      if (dfv.getNumEntries() == 0) {
        // compaction produced no data; remove the empty output file
        tablet.getTabletServer().getFileSystem().deleteRecursively(newDatafile.path());
      }
    }

    TServerInstance lastLocation = null;
    synchronized (tablet) {

      t1 = System.currentTimeMillis();

      IZooReaderWriter zoo = ZooReaderWriter.getInstance();

      tablet.incrementDataSourceDeletions();

      if (extent.isRootTablet()) {

        waitForScansToFinish(oldDatafiles, true, Long.MAX_VALUE);

        try {
          if (!zoo.isLockHeld(tablet.getTabletServer().getLock().getLockID())) {
            throw new IllegalStateException();
          }
        } catch (Exception e) {
          throw new IllegalStateException("Can not bring major compaction online, lock not held", e);
        }

        // mark files as ready for deletion, but
        // do not delete them until we successfully
        // rename the compacted map file, in case
        // the system goes down

        RootFiles.replaceFiles(tablet.getTableConfiguration(), tablet.getTabletServer().getFileSystem(), tablet.getLocation(), oldDatafiles, tmpDatafile,
            newDatafile);
      }

      // atomically remove old files and add new file
      for (FileRef oldDatafile : oldDatafiles) {
        if (!datafileSizes.containsKey(oldDatafile)) {
          log.error("file does not exist in set " + oldDatafile);
        }
        datafileSizes.remove(oldDatafile);
        majorCompactingFiles.remove(oldDatafile);
      }

      if (datafileSizes.containsKey(newDatafile)) {
        log.error("Adding file that is already in set " + newDatafile);
      }

      if (dfv.getNumEntries() > 0) {
        datafileSizes.put(newDatafile, dfv);
      }

      // could be used by a follow on compaction in a multipass compaction
      majorCompactingFiles.add(newDatafile);

      tablet.computeNumEntries();

      lastLocation = tablet.resetLastLocation();

      tablet.setLastCompactionID(compactionId);

      t2 = System.currentTimeMillis();
    }

    if (!extent.isRootTablet()) {
      Set<FileRef> filesInUseByScans = waitForScansToFinish(oldDatafiles, false, 10000);
      if (filesInUseByScans.size() > 0)
        log.debug("Adding scan refs to metadata " + extent + " " + filesInUseByScans);
      MasterMetadataUtil.replaceDatafiles(tablet.getTabletServer(), extent, oldDatafiles, filesInUseByScans, newDatafile, compactionId, dfv, tablet
          .getTabletServer().getClientAddressString(), lastLocation, tablet.getTabletServer().getLock());
      removeFilesAfterScan(filesInUseByScans);
    }

    log.debug(String.format("MajC finish lock %.2f secs", (t2 - t1) / 1000.0));
    log.log(TLevel.TABLET_HIST, extent + " MajC " + oldDatafiles + " --> " + newDatafile);
  }

  /**
   * @return an unmodifiable sorted snapshot of the file -> size map
   */
  public SortedMap<FileRef,DataFileValue> getDatafileSizes() {
    synchronized (tablet) {
      TreeMap<FileRef,DataFileValue> copy = new TreeMap<>(datafileSizes);
      return Collections.unmodifiableSortedMap(copy);
    }
  }

  /**
   * @return an unmodifiable snapshot of the tablet's current file set
   */
  public Set<FileRef> getFiles() {
    synchronized (tablet) {
      HashSet<FileRef> files = new HashSet<>(datafileSizes.keySet());
      return Collections.unmodifiableSet(files);
    }
  }

  /**
   * @return the current number of files; safe without the tablet lock because datafileSizes is a
   *         synchronized map
   */
  public int getNumFiles() {
    return datafileSizes.size();
  }

}