package org.dcache.pool.classic;
import com.google.common.collect.Iterables;
import com.google.common.io.Files;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.security.NoSuchAlgorithmException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.EnumSet;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import diskCacheV111.util.CacheException;
import diskCacheV111.util.FileCorruptedCacheException;
import diskCacheV111.util.FileNotInCacheException;
import diskCacheV111.util.NotInTrashCacheException;
import diskCacheV111.util.PnfsId;
import dmg.cells.nucleus.CellCommandListener;
import dmg.cells.nucleus.CellLifeCycleAware;
import dmg.util.command.Argument;
import dmg.util.command.Command;
import org.dcache.alarms.AlarmMarkerFactory;
import org.dcache.alarms.PredefinedAlarm;
import org.dcache.pool.repository.ReplicaState;
import org.dcache.pool.repository.ReplicaDescriptor;
import org.dcache.pool.repository.Repository;
import org.dcache.pool.repository.Repository.OpenFlags;
import org.dcache.util.Checksum;
import static com.google.common.base.Preconditions.checkNotNull;
public class ChecksumScanner
implements CellCommandListener, CellLifeCycleAware
{
private static final Logger _log =
LoggerFactory.getLogger(ChecksumScanner.class);
// The three scan activities; each is a Singleton wrapper around a worker thread.
private final FullScan _fullScan = new FullScan();
private final Scrubber _scrubber = new Scrubber();
private final SingleScan _singleScan = new SingleScan();
// Injected collaborators (see setters below).
private Repository _repository;
private ChecksumModuleV1 _csm;
private String poolName;
// File in which the scrubber persists its progress between restarts.
private File _scrubberStateFile;
/** Errors found while running 'csm check'.
*/
private final Map<PnfsId,Iterable<Checksum>> _bad =
new ConcurrentHashMap<>();
// Callback registered with the checksum module to react to policy changes.
private final Runnable listener = this::onConfigChange;
/**
 * Reacts to checksum-module policy changes: the scrubber runs only
 * while the SCRUB policy flag is enabled.
 */
private void onConfigChange()
{
    boolean scrubEnabled = _csm.hasPolicy(ChecksumModule.PolicyFlag.SCRUB);
    if (scrubEnabled) {
        startScrubber();
    } else {
        stopScrubber();
    }
}
// Starts the scrubber thread; a no-op if already running or scrubbing is disabled
// (Scrubber.start() checks both conditions).
private void startScrubber()
{
_scrubber.start();
}
// Interrupts the scrubber thread if it is running.
private void stopScrubber()
{
_scrubber.kill();
}
// Injects the replica repository to scan.
public void setRepository(Repository repository)
{
_repository = repository;
}
// Injects the checksum module used to verify replicas and query policy.
public void setChecksumModule(ChecksumModuleV1 csm)
{
_csm = csm;
}
// Sets the file in which scrubber progress is persisted across restarts.
public void setScrubberStateFile(File path)
{
_scrubberStateFile = path;
}
// Sets the pool name used in alarm/log messages.
public void setPoolName(String poolName) {
this.poolName = poolName;
}
private class FullScan extends Singleton
{
private volatile int _totalCount;
private volatile int _badCount;
private volatile int _unableCount;
public FullScan()
{
super("FullScan");
}
@Override
public void runIt() throws Exception
{
stopScrubber();
try {
_totalCount = _badCount = 0;
_bad.clear();
for (PnfsId id: _repository) {
try {
ReplicaDescriptor handle =
_repository.openEntry(id, EnumSet.of(OpenFlags.NOATIME));
try {
_csm.verifyChecksum(handle);
} finally {
handle.close();
}
} catch (FileNotInCacheException | NotInTrashCacheException e) {
/* It was removed before we could get it. No problem.
*/
} catch (FileCorruptedCacheException e) {
if (e.getActualChecksums().isPresent()) {
_bad.put(id, e.getActualChecksums().get());
_badCount++;
} else {
_log.warn("csm scan command unable to verify {}: {}", id, e.getMessage());
_unableCount++;
}
} catch (CacheException e) {
_log.warn("csm scan command unable to verify {}: {}", id, e.getMessage());
_unableCount++;
} catch (IOException e) {
_unableCount++;
throw new IOException("failed to read " + id + ": " + e.getMessage(), e);
}
_totalCount++;
}
} catch (IOException e) {
_log.error("Aborting 'cms check' full-scan: {}", e.getMessage());
setAbortMessage("failure in underlying storage: " + e.getMessage());
} finally {
startScrubber();
}
}
public String toString()
{
return super.toString() + " "
+ _totalCount + " files: "
+ _badCount + " corrupt, "
+ _unableCount + " unable to check";
}
}
/**
 * Worker behind 'csm check &lt;pnfsid&gt;': verifies the checksum of a
 * single replica. The scrubber is paused while the check runs. Results
 * are exposed through {@link #toString()}: BUSY while running, OK with
 * the computed checksums on success, BAD with actual and expected
 * checksums on a mismatch.
 */
private class SingleScan extends Singleton
{
    private volatile PnfsId _pnfsId;
    private volatile Iterable<Checksum> _actualChecksums;
    private volatile Iterable<Checksum> _expectedChecksums;

    public SingleScan()
    {
        super("SingleScan");
    }

    /** Resets the previous result and starts a scan of the given file. */
    public synchronized void go(PnfsId pnfsId)
    {
        _expectedChecksums = null;
        _actualChecksums = null;
        _pnfsId = pnfsId;
        start();
    }

    @Override
    public void runIt()
        throws CacheException, InterruptedException, IOException, NoSuchAlgorithmException
    {
        stopScrubber();
        try {
            ReplicaDescriptor handle =
                _repository.openEntry(_pnfsId, EnumSet.of(OpenFlags.NOATIME));
            try {
                _actualChecksums = _csm.verifyChecksum(handle);
            } catch (FileCorruptedCacheException e) {
                /* The exception does not always carry checksum information;
                 * guard the Optionals instead of calling get() unchecked,
                 * which could throw NoSuchElementException (FullScan already
                 * guards the same way). */
                if (e.getActualChecksums().isPresent()) {
                    _expectedChecksums = e.getExpectedChecksums().orElse(null);
                    _actualChecksums = e.getActualChecksums().get();
                    _bad.put(_pnfsId, _actualChecksums);
                } else {
                    _log.warn("csm check command unable to verify {}: {}",
                              _pnfsId, e.getMessage());
                }
            } finally {
                handle.close();
            }
        } finally {
            startScrubber();
        }
    }

    @Override
    public synchronized String toString()
    {
        StringBuilder sb = new StringBuilder();
        sb.append(super.toString());
        if (_pnfsId != null) {
            sb.append(" ").append(_pnfsId).append(" ");
            if (_actualChecksums == null) {
                sb.append("BUSY");
            } else if (_expectedChecksums == null) {
                sb.append("OK ").append(_actualChecksums);
            } else {
                sb.append("BAD File = ").append(_actualChecksums)
                    .append(" Expected = ").append(_expectedChecksums);
            }
        }
        return sb.toString();
    }
}
private class Scrubber extends Singleton
{
private final long CHECKPOINT_INTERVAL = TimeUnit.MINUTES.toMillis(1);
private final long FAILURE_RATELIMIT_DELAY =
TimeUnit.SECONDS.toMillis(10);
private volatile int _badCount;
private volatile int _numFiles;
private volatile int _totalCount;
private volatile int _unableCount;
private PnfsId _lastFileChecked;
private long _lastCheckpoint;
private long _lastStart;
public Scrubber()
{
super("Scrubber");
}
/**
* Save scrubber state to <code>_scrubberStateFile</code>. The format is
* the start time of the last scrub (<code>_lastStart</code>) separated
* by a whitespace followed by the pnfs id of the file last checked
* (<code>_lastFileChecked</code>). If there's no last checked file,
* write a dash instead.
*/
private void saveState()
{
String line = _lastStart + " " +
((_lastFileChecked == null) ? "-" : _lastFileChecked);
try {
Files.write(line, _scrubberStateFile, Charset.defaultCharset());
} catch (IOException e) {
_log.error("Failed to save scrubber state ({}) to {}: {}", line, _scrubberStateFile, e.getMessage());
}
}
/**
* Read the saved state information from disk written by <code>
* saveState()</code>. The following fields are initialized:<code>
* _lastFileChecked</code> - the pnfs id of the file that was last
* checksummed; <code>_lastStart</code> - time when the last scrub
* started, if there's no saved state it's initialized to the current
* time.
*/
private void initializeFromSavedState()
{
String line;
try {
line = Files.readFirstLine(_scrubberStateFile,
Charset.defaultCharset());
} catch (FileNotFoundException e) {
/**
* ignored - start immediately and check whole pool
*/
_lastStart = System.currentTimeMillis();
return;
} catch (IOException e) {
_log.error("Failed to read scrubber saved state from {}: {}",
_scrubberStateFile, e.getMessage());
return;
}
String[] fields = line.split(" ");
if (fields.length != 2) {
_log.error("scrubber saved state in {} has an invalid format: {}",
_scrubberStateFile, line);
return;
}
try {
_lastStart = Long.parseLong(fields[0]);
} catch (NumberFormatException e) {
_log.error("Failed to read the last scrubber start time from {}: {}",
_scrubberStateFile, e.getMessage());
return;
}
if (PnfsId.isValid(fields[1])) {
_log.debug("Resuming scrubbing from the first file with a pnfs id greater than {}",
fields[1]);
_lastFileChecked = new PnfsId(fields[1]);
} else if (!fields[1].equals("-")) {
_log.error("Last checked pnfs id within {} has an invalid format: {}",
_scrubberStateFile, fields[1]);
}
}
private boolean isFirstStart()
{
return !_scrubberStateFile.exists();
}
private boolean isResuming()
{
return (_lastFileChecked != null);
}
private void waitUntil(long t) throws InterruptedException
{
long now;
while ((now = System.currentTimeMillis()) < t) {
Thread.sleep(t - now);
}
}
@Override
public synchronized void start()
{
if (_csm.hasPolicy(ChecksumModule.PolicyFlag.SCRUB) && !isActive()) {
super.start();
}
}
@Override
public void runIt() throws InterruptedException
{
initializeFromSavedState();
boolean isFinished = !isFirstStart() && !isResuming();
try {
while (true) {
if (isFinished) {
_log.debug("Next scrub start is {}",
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").
format(new Date(_lastStart +
_csm.getScrubPeriod())));
if (System.currentTimeMillis() - _lastStart > _csm.getScrubPeriod()) {
_log.warn("The last scrub took longer time to finish ({} s.) than the configured period ({} s.) - consider increasing the scrubbing period",
System.currentTimeMillis() - _lastStart,
_csm.getScrubPeriod());
}
waitUntil(_lastStart + _csm.getScrubPeriod());
_lastStart = System.currentTimeMillis();
isFinished = false;
}
try {
PnfsId[] toScan = getFilesToVerify();
_numFiles = toScan.length;
_badCount = 0;
_totalCount = 0;
_unableCount = 0;
scanFiles(toScan);
if (_badCount > 0) {
_log.warn("Finished scrubbing. Found {} bad files of {}",
_badCount, _numFiles);
}
isFinished = true;
} catch (IOException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("failure in underlying storage: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
} catch (IllegalStateException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("illegal state: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
} catch (NoSuchAlgorithmException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("checksum algorithm not supported: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
}
}
} finally {
_log.debug("Stopping scrubber");
saveState();
}
}
/**
* Return array of pnfs id's that has not yet been verified. Any files
* added to the pool after this array has been generated will be
* included the next time the array is generated.
* @return array of pnfs id's that needs to be verified. No check is
* done on in which state the files are in.
*/
private PnfsId[] getFilesToVerify()
{
PnfsId[] repcopy = Iterables.toArray(_repository, PnfsId.class);
Arrays.sort(repcopy);
if (!isResuming()) {
return repcopy;
}
int index = Arrays.binarySearch(repcopy, _lastFileChecked);
if (index >= 0) {
/**
* Found. 0 <= index <= repcopy.length - 1
*/
return Arrays.copyOfRange(repcopy, index + 1, repcopy.length);
} else {
/**
* Not found. insertionPoint == -index - 1
* where: 0 <= insertionPoint <= repcopy.length
*/
return Arrays.copyOfRange(repcopy, -index - 1, repcopy.length);
}
}
/**
* Save state information only every <code>CHECKPOINT_INTERVAL</code>
* period.
*/
private void checkpointIfNeeded()
{
if (System.currentTimeMillis() - _lastCheckpoint > CHECKPOINT_INTERVAL) {
saveState();
_lastCheckpoint = System.currentTimeMillis();
}
}
private void scanFiles(PnfsId[] repository)
throws InterruptedException, NoSuchAlgorithmException, IOException
{
for (PnfsId id : repository) {
try {
if (_repository.getState(id) == ReplicaState.CACHED ||
_repository.getState(id) == ReplicaState.PRECIOUS) {
ReplicaDescriptor handle =
_repository.openEntry(id, EnumSet.of(OpenFlags.NOATIME));
try {
_csm.verifyChecksumWithThroughputLimit(handle);
} finally {
handle.close();
}
}
} catch (FileCorruptedCacheException e) {
_badCount++;
_log.error(AlarmMarkerFactory.getMarker(PredefinedAlarm.CHECKSUM,
id.toString(),
poolName),
"Marking {} on {} as BROKEN: {}",
id,
poolName,
e.getMessage());
try {
_repository.setState(id, ReplicaState.BROKEN);
} catch (CacheException f) {
_log.warn("Failed to mark {} as BROKEN: {}", id, f.getMessage());
}
} catch (IOException e) {
_unableCount++;
throw new IOException("Unable to read " + id + ": " + e.getMessage(), e);
} catch (FileNotInCacheException | NotInTrashCacheException e) {
/* It was removed before we could get it. No problem.
*/
} catch (CacheException e) {
_log.warn("Scrubber unable to verify {}: {}", id, e.getMessage());
_unableCount++;
}
_lastFileChecked = id;
_totalCount++;
checkpointIfNeeded();
}
_lastFileChecked = null;
}
@Override
public String toString()
{
return super.toString() + " processed "
+ _totalCount + " of " + _numFiles + " files: "
+ _badCount + " corrupt, "
+ _unableCount + " unable to check";
}
}
/**
 * Base class for a named, single-threaded activity. At most one worker
 * thread runs at a time; start() launches it, kill() interrupts it.
 * Completion state, the last exception and an optional abort message
 * are reported through toString(). All state is guarded by the
 * instance monitor.
 */
private abstract static class Singleton
{
    private final String _name;
    private Exception _lastException;
    private Thread _currentThread;
    private String _abortMessage;

    private Singleton(String name)
    {
        _name = name;
    }

    /** The activity to run; executed on the worker thread. */
    protected abstract void runIt() throws Exception;

    /** Interrupts the worker thread, if one is running. */
    public synchronized void kill()
    {
        Thread worker = _currentThread;
        if (worker != null) {
            worker.interrupt();
        }
    }

    /** True while a worker thread exists. */
    public synchronized boolean isActive()
    {
        return (_currentThread != null);
    }

    /** Records why the activity aborted; shown by toString(). */
    public synchronized void setAbortMessage(String reason)
    {
        _abortMessage = checkNotNull(reason);
    }

    /** Clears the worker reference once the thread finishes. */
    private synchronized void stopped()
    {
        _currentThread = null;
    }

    /** Records the exception that terminated the activity. */
    public synchronized void setException(Exception e)
    {
        _lastException = e;
    }

    /**
     * Launches a fresh worker thread.
     *
     * @throws IllegalStateException if a worker is still running
     */
    public synchronized void start()
    {
        if (_currentThread != null) {
            throw new IllegalStateException("Still active");
        }
        _abortMessage = null;
        _lastException = null;
        Runnable task = () -> {
            try {
                runIt();
            } catch (Exception e) {
                setException(e);
            } finally {
                stopped();
            }
        };
        _currentThread = new Thread(task, _name);
        _currentThread.start();
    }

    @Override
    public synchronized String toString()
    {
        StringBuilder sb = new StringBuilder(_name).append(' ').append(getState());
        if (_lastException != null) {
            sb.append(' ').append(_lastException);
        }
        return sb.append(' ').toString();
    }

    /** One of "Active", "Aborted (reason)" or "Idle". */
    private synchronized String getState()
    {
        if (_currentThread != null) {
            return "Active";
        }
        return (_abortMessage != null) ? "Aborted (" + _abortMessage + ")" : "Idle";
    }
}
@Command(name = "csm check",
         hint = "verify checksum of files on this pool",
         description = "Do a checksum verification on a file or all files in " +
                 "this pool. The result of the check can be view using the " +
                 "'csm status' command.")
public class CsmCheckCommand implements Callable<String>
{
    @Argument(valueSpec = "PNFSID|*",
              usage = "Specify the pnfsID of the file to scan. If all " +
                      "files in this pool are to be scan, simply use '*'.")
    String pnfsID;

    /**
     * Kicks off either a full scan (argument '*') or a single-file
     * scan, then returns immediately; progress is reported through
     * 'csm status'.
     */
    @Override
    public String call() throws
            CacheException, InterruptedException,
            IOException, NoSuchAlgorithmException
    {
        if (pnfsID.equals("*")) {
            _fullScan.start();
            return "Started ...; check 'csm status' for status";
        }
        _singleScan.go(new PnfsId(pnfsID));
        return "Started ...; check 'csm status' for status";
    }
}
@Command(name = "csm status",
         hint = "get checksum verification result",
         description = "Display the detailed result of the checksum check " +
                 "performed by scanning all or a single file in this pool. " +
                 "This status report will contain the full scan, single scan " +
                 "and the scrubber result.")
public class CsmStatusCommand implements Callable<String>
{
    /** Returns one status line each for full scan, single scan and scrubber. */
    @Override
    public String call()
    {
        return String.join("\n",
                           _fullScan.toString(),
                           _singleScan.toString(),
                           _scrubber.toString());
    }
}
@Command(name = "csm show errors",
         hint = "show errors found during checksum verification",
         description = "Display the list of all errors found while running " +
                 "the 'csm check' command. The shown information (based on " +
                 "these errors) will contain the pnfsID and the actual " +
                 "checksum value of the (corrupted) file. Nothing is returned " +
                 "when no error is found.")
public class CsmShowErrorsCommand implements Callable<String>
{
    /** Lists one "pnfsid -> checksums" line per corrupt file recorded in _bad. */
    @Override
    public String call()
    {
        StringBuilder report = new StringBuilder();
        _bad.forEach((id, checksums) ->
                report.append(id).append(" -> ").append(checksums).append('\n'));
        return report.toString();
    }
}
// Cell lifecycle: once the cell is up, listen for checksum-policy changes
// and start the scrubber (Scrubber.start() is a no-op if scrubbing is
// disabled by policy).
@Override
public void afterStart()
{
_csm.addListener(listener);
startScrubber();
}
// Cell lifecycle: before shutdown, deregister the listener and interrupt
// the scrubber so it checkpoints its position (see Scrubber.runIt()).
@Override
public void beforeStop()
{
_csm.removeListener(listener);
stopScrubber();
}
}