/******************************************************************************* * Australian National University Data Commons * Copyright (C) 2013 The Australian National University * * This file is part of Australian National University Data Commons. * * Australian National University Data Commons is free software: you * can redistribute it and/or modify it under the terms of the GNU * General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package au.edu.anu.datacommons.storage.verifier; import static java.text.MessageFormat.format; import gov.loc.repository.bagit.Manifest.Algorithm; import gov.loc.repository.bagit.utilities.FilenameHelper; import gov.loc.repository.bagit.utilities.MessageDigestHelper; import java.io.IOException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import au.edu.anu.datacommons.storage.tagfiles.AbstractKeyValueFile; import au.edu.anu.datacommons.storage.tagfiles.FileMetadataTagFile; import au.edu.anu.datacommons.storage.tagfiles.ManifestMd5TagFile; import au.edu.anu.datacommons.storage.tagfiles.PreservationMapTagFile; import au.edu.anu.datacommons.storage.tagfiles.PronomFormatsTagFile; import au.edu.anu.datacommons.storage.tagfiles.TagFilesService; import au.edu.anu.datacommons.storage.tagfiles.TimestampsTagFile; import au.edu.anu.datacommons.storage.tagfiles.VirusScanTagFile; import au.edu.anu.datacommons.storage.verifier.ResultMessage.Category; import au.edu.anu.datacommons.storage.verifier.ResultMessage.Severity; import au.edu.anu.datacommons.tasks.ThreadPoolService; /** * Task class that verifies the integrity of a Bag as per the BagIt specification. * <p> * <em>This class only </em> * * @author Rahul Khanna * */ public class VerificationTask implements Callable<VerificationResults> { private static final Logger LOGGER = LoggerFactory.getLogger(VerificationTask.class); private Path bagDir; private String pid; private TagFilesService tagFilesSvc; private ThreadPoolService threadPoolSvc; private VerificationResults results; private Map<Path, String> payloadFiles = new HashMap<>(); private Map<Path, String> tagFiles = new HashMap<>(); private Map<Path, String> manifests = new HashMap<>(); /** * Creates an instance of this class that can be submitted to a thread pool for processing in another thread. * * @param pid * Identifier of collection record * @param bagDir * Bag directory * @param tagFilesSvc * Tag Files Service * @param threadPoolSvc * Thread Pool to which subtasks can be submitted */ public VerificationTask(String pid, Path bagDir, TagFilesService tagFilesSvc, ThreadPoolService threadPoolSvc) { this.pid = pid; this.bagDir = bagDir; this.tagFilesSvc = tagFilesSvc; this.threadPoolSvc = threadPoolSvc; this.results = new VerificationResults(bagDir.getFileName().toString()); } @Override public VerificationResults call() throws Exception { tagFiles = enumerateFiles(bagDir, true); payloadFiles = enumerateFiles(bagDir.resolve("data/"), false); iterateManifests(); validateTagFiles(); validatePayloadManifests(); validateChecksums(); checkArtifacts(); return results; } /** * Enumerates all the payload manifests in a bag. */ private void iterateManifests() { for (Entry<Path, String> tagFileEntry : tagFiles.entrySet()) { if (tagFileEntry.getValue().startsWith("manifest-")) { manifests.put(tagFileEntry.getKey(), tagFileEntry.getValue()); } } } /** * Enumerates files in a specified directory. * * @param rootDir * Directory to start walking * @param exclDataDir * true if the data directory (payload directory) should be excluded * @return Map Path as keys and relative path with normalized path separators as values * @throws IOException */ private Map<Path, String> enumerateFiles(Path rootDir, boolean exclDataDir) throws IOException { Map<Path, String> files = new HashMap<Path, String>(); if (Files.isDirectory(rootDir)) { try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(rootDir)) { for (Path dirEntry : dirStream) { if (Files.isDirectory(dirEntry)) { if (!(exclDataDir && dirEntry.getFileName().toString().equals("data"))) { files.putAll(enumerateFiles(dirEntry, false)); } } else if (Files.isRegularFile(dirEntry)) { files.put(dirEntry, FilenameHelper.normalizePathSeparators(bagDir.relativize(dirEntry).toString())); } else { addEntry(Severity.WARN, Category.OTHER, FilenameHelper.normalizePathSeparators(bagDir.relativize(dirEntry).toString()), "Unexpected item found"); } } } } return files; } /** * Verifies entries in each tag file by checking that a payload file exists for each entry and an entry exists * for each payload file. */ private void validateTagFiles() { List<Class<? extends AbstractKeyValueFile>> tagFilesClasses = new ArrayList<Class<? extends AbstractKeyValueFile>>(); tagFilesClasses.add(PronomFormatsTagFile.class); tagFilesClasses.add(VirusScanTagFile.class); tagFilesClasses.add(FileMetadataTagFile.class); tagFilesClasses.add(TimestampsTagFile.class); tagFilesClasses.add(PreservationMapTagFile.class); // Verify each payload file has a corresponding entry in each custom tag file. for (Entry<Path, String> plFile : payloadFiles.entrySet()) { for (Class<? extends AbstractKeyValueFile> clazz : tagFilesClasses) { try { String tagFilename = (String) clazz.getField("FILEPATH").get(clazz); if (!tagFilesSvc.containsKey(pid, clazz, plFile.getValue())) { addEntry(Severity.WARN, Category.TAGFILE_ENTRY_MISSING, plFile.getValue(), format("No entry in {0}", tagFilename)); } } catch (IllegalArgumentException | IllegalAccessException | NoSuchFieldException | SecurityException | IOException e) { addEntry(Severity.ERROR, Category.VALIDATION_EXCEPTION, clazz.getSimpleName(), format("Exception on read: {0}", e.getMessage())); } } } for (Class<? extends AbstractKeyValueFile> clazz : tagFilesClasses) { try { String tagFilename = (String) clazz.getField("FILEPATH").get(clazz); for (String tagFileKey : tagFilesSvc.getAllEntries(pid, clazz).keySet()) { if (!payloadFiles.values().contains(tagFileKey)) { addEntry(Severity.WARN, Category.PAYLOADFILE_NOTFOUND, tagFileKey, format("{0} refers to missing file", tagFilename)); } } } catch (IOException | IllegalArgumentException | IllegalAccessException | NoSuchFieldException | SecurityException e) { addEntry(Severity.ERROR, Category.VALIDATION_EXCEPTION, clazz.getSimpleName(), format("Exception on read: {0}", e.getMessage())); } } } /** * Checks if the bag contains artifacts from previous storage formats. */ private void checkArtifacts() { for (Entry<Path, String> tagFile : tagFiles.entrySet()) { if (tagFile.getValue().startsWith("metadata/")) { addEntry(Severity.WARN, Category.ARTIFACT_FOUND, tagFile.getValue(), null); } } if (Files.isDirectory(bagDir.resolve("data/").resolve("metadata/"))) { addEntry(Severity.WARN, Category.ARTIFACT_FOUND, "metadata/", null); } } /** * Validates the entries in payload manifest. */ private void validatePayloadManifests() { try { Map<String, String> md5Entries = tagFilesSvc.getAllEntries(pid, ManifestMd5TagFile.class); for (String plFilepath : md5Entries.keySet()) { if (!payloadFiles.values().contains(plFilepath)) { addEntry(Severity.ERROR, Category.PAYLOADFILE_NOTFOUND, plFilepath, format("{0} refers to missing file", ManifestMd5TagFile.FILEPATH)); } } for (String plFilepath : payloadFiles.values()) { if (!md5Entries.keySet().contains(plFilepath)) { addEntry(Severity.ERROR, Category.MANIFEST_ENTRY_MISSING, plFilepath, format("{0} refers to missing file", ManifestMd5TagFile.FILEPATH)); } } } catch (IOException e) { addEntry(Severity.ERROR, Category.VALIDATION_EXCEPTION, ManifestMd5TagFile.FILEPATH, format("Exception on read: {0}", e.getMessage())); } } /** * Checks that the entries in the payload manifest actually match the contents of the respective payload file. * * @throws IOException */ private void validateChecksums() throws IOException { Map<String, String> expectedMd5Map = tagFilesSvc.getAllEntries(pid, ManifestMd5TagFile.class); Map<String, Future<String>> calculatedFixityMap = new HashMap<>(); for (Entry<Path, String> plFile : payloadFiles.entrySet()) { if (expectedMd5Map.get(plFile.getValue()) != null) { final Path fPlPath = plFile.getKey(); calculatedFixityMap.put(plFile.getValue(), threadPoolSvc.submit(new Callable<String>() { @Override public String call() throws Exception { return MessageDigestHelper.generateFixity(fPlPath.toFile(), Algorithm.MD5); } })); } } for (Entry<String, Future<String>> entry : calculatedFixityMap.entrySet()) { String relPath = entry.getKey(); String expectedFixity = expectedMd5Map.get(relPath); try { String calculatedFixity = entry.getValue().get(); if (expectedMd5Map.containsKey(relPath) && !expectedMd5Map.get(relPath).equals(calculatedFixity)) { addEntry( Severity.ERROR, Category.CHECKSUM_MISMATCH, relPath, format("Computed MD5 {0} does not match {1}", calculatedFixity, expectedFixity)); } } catch (InterruptedException | ExecutionException e) { addEntry(Severity.ERROR, Category.VALIDATION_EXCEPTION, ManifestMd5TagFile.FILEPATH, format("Exception on read: {0}", e.getMessage())); } } } /** * Adds an issue entry to results. * * @param severity * Severity of the issue * @param category * Category of the issue * @param filepath * Filepath related to the issue * @param msg * Message describing the issue */ private synchronized void addEntry(Severity severity, Category category, String filepath, String msg) { this.results.addMessage(new ResultMessage(severity, category, filepath, msg == null ? "" : msg)); LOGGER.trace("{}-{}: [{}] {}", severity.toString(), category.toString(), filepath, msg == null ? "" : msg); } }