/******************************************************************************* * Australian National University Data Commons * Copyright (C) 2013 The Australian National University * * This file is part of Australian National University Data Commons. * * Australian National University Data Commons is free software: you * can redistribute it and/or modify it under the terms of the GNU * General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package au.edu.anu.datacommons.storage.verifier; import gov.loc.repository.bagit.Manifest.Algorithm; import gov.loc.repository.bagit.utilities.FilenameHelper; import gov.loc.repository.bagit.utilities.MessageDigestHelper; import java.io.IOException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import au.edu.anu.datacommons.storage.DcStorage; import au.edu.anu.datacommons.storage.event.StorageEventListener; import au.edu.anu.datacommons.storage.event.StorageEventListener.EventTime; import au.edu.anu.datacommons.storage.event.tasks.AbstractTagFileTask; import au.edu.anu.datacommons.storage.event.tasks.MetadataTask; import au.edu.anu.datacommons.storage.event.tasks.PreservationTask; import au.edu.anu.datacommons.storage.event.tasks.PronomTask; import au.edu.anu.datacommons.storage.event.tasks.TimestampTask; import au.edu.anu.datacommons.storage.event.tasks.VirusScanTask; import au.edu.anu.datacommons.storage.provider.StorageProvider; import au.edu.anu.datacommons.storage.tagfiles.AbstractKeyValueFile; import au.edu.anu.datacommons.storage.tagfiles.FileMetadataTagFile; import au.edu.anu.datacommons.storage.tagfiles.ManifestMd5TagFile; import au.edu.anu.datacommons.storage.tagfiles.PreservationMapTagFile; import au.edu.anu.datacommons.storage.tagfiles.PronomFormatsTagFile; import au.edu.anu.datacommons.storage.tagfiles.TagFilesService; import au.edu.anu.datacommons.storage.tagfiles.TimestampsTagFile; import au.edu.anu.datacommons.storage.tagfiles.VirusScanTagFile; import au.edu.anu.datacommons.tasks.ThreadPoolService; import au.edu.anu.datacommons.util.StopWatch; /** * Task class that completes a bag as per the BagIt specification. Occassionally after making changes to files in a * collection record, some tag files may not be updated correctly, or at all, for example due to unable to write to * disk. * * @author Rahul Khanna * */ public class CompletionTask implements Callable<Void>{ private static final Logger LOGGER = LoggerFactory.getLogger(CompletionTask.class); private String pid; private Path bagDir; private StorageProvider storageProvider; private TagFilesService tagFilesSvc; private StorageEventListener eventListener; private ThreadPoolService threadPoolSvc; private DcStorage dcStorage; List<Class<? extends AbstractKeyValueFile>> classes; private boolean dryRun = false; /** * Creates an instance of the completion task object that can be submitted to a thread pool for processing in * another thread. * * @param pid * Identifier of the collection record to complete * @param storageProvider * Bag directory of the record * @param tagFilesSvc * Tag files service * @param eventListener * Storage event listener to which changes to tag files will be notified * @param threadPoolSvc * Thread Pool service to which sub-completion tasks will be submitted * @param dcStorage * DcStorage class */ public CompletionTask(String pid, StorageProvider storageProvider, TagFilesService tagFilesSvc, StorageEventListener eventListener, ThreadPoolService threadPoolSvc, DcStorage dcStorage) { this.pid = pid; this.storageProvider = storageProvider; this.tagFilesSvc = tagFilesSvc; this.eventListener = eventListener; this.threadPoolSvc = threadPoolSvc; this.dcStorage = dcStorage; initTagFileClasses(); } /** * Initialises a list of tag file classes that will be checked for completeness. */ private void initTagFileClasses() { classes = new ArrayList<>(5); classes.add(FileMetadataTagFile.class); classes.add(PronomFormatsTagFile.class); classes.add(TimestampsTagFile.class); classes.add(VirusScanTagFile.class); classes.add(PreservationMapTagFile.class); } /** * Sets if the completion task only performs a dry run of the completion process without actually making any * changes. * * @param dryRun */ public void setDryRun(boolean dryRun) { this.dryRun = dryRun; } @Override public Void call() throws Exception { // StopWatch sw = new StopWatch(); // sw.start(); // LOGGER.info("Completing tag files for {}...", pid); // if (!dryRun) { // eventListener.notify(EventTime.PRE, EventType.TAGFILE_UPDATE, pid, bagDir, null, null); // } // Set<Path> plFiles = listFilesInDir(getPayloadDir()); // checkArtifacts(plFiles); // verifyMessageDigests(plFiles); // verifyTagFiles(plFiles); // if (!dryRun) { // eventListener.notify(EventTime.POST, EventType.TAGFILE_UPDATE, pid, bagDir, null, null); // } // sw.stop(); // LOGGER.info("Tag files completed for {}. Time taken {}", pid, sw.getTimeElapsedFormatted()); // return null; // TODO Implement throw new UnsupportedOperationException(); } /** * Checks for presence of following artifacts from old storage formats: <li> * <ul> * Files in metadata/ directory which previously stored file metadata. Now it is serialised to JSON and stored in * file-metadata.txt</li> * * @param plFiles * Payload files within the bag * @throws IOException */ private void checkArtifacts(Set<Path> plFiles) throws IOException { Path metadataDir = bagDir.resolve("metadata/"); if (Files.isDirectory(metadataDir)) { try { FileUtils.deleteDirectory(metadataDir.toFile()); } catch (IOException e) { LOGGER.error("Unable to delete {}: {}", metadataDir.toString(), e.getMessage()); } } } /** * Verifies the message digests stored in a manifest match file's contents. * * @param plFiles * Payload files within the bag * @throws IOException */ private void verifyMessageDigests(Set<Path> plFiles) throws IOException { Map<Path, Future<String>> calcMd = calcMessageDigests(plFiles); compareMessageDigests(calcMd); } /** * Calculates message digest for payload files provided. * * @param plFiles * Payload files * @return Message digests as <code>Map<Path, Future<String>></code> where Keys are the path to the payload files, * and <code>Future<String></code> * */ private Map<Path, Future<String>> calcMessageDigests(Set<Path> plFiles) { Map<Path, Future<String>> calcMd = new HashMap<>(); for (Path plFile : plFiles) { final Path fPlFile = plFile; Future<String> mdFuture = threadPoolSvc.submit(new Callable<String>() { @Override public String call() throws Exception { return MessageDigestHelper.generateFixity(fPlFile.toFile(), Algorithm.MD5); } }); calcMd.put(plFile, mdFuture); } return calcMd; } /** * Compares provided (calculated) message digests with the MD5 values in the manifest tag file. * * @param calcMd * Calculated Message Digests * @throws IOException */ private void compareMessageDigests(Map<Path, Future<String>> calcMd) throws IOException { // Check that each payload file's calculated MD5 exists in the manifest tag file. Add it if it doesn't. for (Entry<Path, Future<String>> calcMdEntry : calcMd.entrySet()) { try { String dataRelPath = getDataRelPath(calcMdEntry.getKey()); String md5 = calcMdEntry.getValue().get(); String md5InTagFile = tagFilesSvc.getEntryValue(pid, ManifestMd5TagFile.class, dataRelPath); if (md5InTagFile == null) { // Entry doesn't exist in manifest tag file. LOGGER.info("{}/{} didn't contain entry for {}. Adding MD5 {}", pid, ManifestMd5TagFile.FILEPATH, dataRelPath, md5); if (!dryRun) { tagFilesSvc.addEntry(pid, ManifestMd5TagFile.class, dataRelPath, md5); } } else if (!md5InTagFile.equals(md5)) { // Entry has incorrect MD5 in manifest tag file. LOGGER.info("{}/{} contains incorrect entry for {}. Calculated MD5 {} doesn't match specified {}", pid, ManifestMd5TagFile.FILEPATH, dataRelPath, md5, md5InTagFile); if (!dryRun) { tagFilesSvc.addEntry(pid, ManifestMd5TagFile.class, dataRelPath, md5); } } } catch (InterruptedException | ExecutionException e) { LOGGER.error(e.getMessage(), e); } } // Check that for each entry in the payload manifest there's a payload file on disk. Delete entry if one doesn't // exist. removeExtraKeys(calcMd.keySet(), ManifestMd5TagFile.class); } /** * Verifies the contents of each of the tag files. Checks that an entry exists in a tag file for every payload file. * * @param plFiles * Payload files against which tag file entries will be verified. * @throws IOException */ private void verifyTagFiles(Set<Path> plFiles) throws IOException { for (Path plFile : plFiles) { String dataRelPath = getDataRelPath(plFile); for (Class<? extends AbstractKeyValueFile> clazz : classes) { String valueInTagFile = tagFilesSvc.getEntryValue(pid, clazz, dataRelPath); if (valueInTagFile == null) { LOGGER.info("{}/{} doesn't contain entry for {}.", pid, clazz.getSimpleName(), dataRelPath); if (!dryRun) { AbstractTagFileTask task = createTask(clazz, dataRelPath.replaceFirst("^data/", "")); Future<Void> future = threadPoolSvc.submit(task); // Running each task one at a time so as to not fill up the task queue. try { future.get(); } catch (InterruptedException | ExecutionException e) { LOGGER.error(e.getMessage(), e); } } } } } for (Class<? extends AbstractKeyValueFile> clazz : classes) { removeExtraKeys(plFiles, clazz); } } /** * Creates a task object for generating a tag file entry for a payload file. * * @param clazz * Tag file class missing an entry for the payload file * @param relPath * Relative path to the payload file * @return Instance of an appropriate task object that adds an entry into the specified tag file */ private AbstractTagFileTask createTask(Class<? extends AbstractKeyValueFile> clazz, String relPath) { AbstractTagFileTask task = null; if (clazz == FileMetadataTagFile.class) { task = new MetadataTask(pid, storageProvider, relPath, tagFilesSvc); } else if (clazz == PronomFormatsTagFile.class) { task = new PronomTask(pid, storageProvider, relPath, tagFilesSvc); } else if (clazz == TimestampsTagFile.class) { task = new TimestampTask(pid, storageProvider, relPath, tagFilesSvc); } else if (clazz == VirusScanTagFile.class) { task = new VirusScanTask(pid, storageProvider, relPath, tagFilesSvc); } else if (clazz == PreservationMapTagFile.class) { task = new PreservationTask(pid, storageProvider, relPath, tagFilesSvc, null); } else { throw new IllegalArgumentException(clazz.getSimpleName()); } return task; } /** * Removes keys for non existent files in the specified tag file. * * @param plFiles * Payload files. Entries in tag files for files other than these will be removed. * @param clazz * Tag file to from extra keys from * @throws IOException */ private void removeExtraKeys(Set<Path> plFiles, Class<? extends AbstractKeyValueFile> clazz) throws IOException { Set<String> keysForRemoval = new HashSet<>(); for (Entry<String, String> tagFileEntry : tagFilesSvc.getAllEntries(pid, clazz).entrySet()) { Path plFile = bagDir.resolve(tagFileEntry.getKey()); if (!Files.isRegularFile(plFile)) { keysForRemoval.add(tagFileEntry.getKey()); } } if (!dryRun) { for (String key : keysForRemoval) { tagFilesSvc.removeEntry(pid, clazz, key); } } } /** * Extracts a the portion of a path relative to the bag directory. * * @param file * Path to a file * @return Relative path as String */ private String getDataRelPath(Path file) { return FilenameHelper.normalizePathSeparators(bagDir.relativize(file).toString()); } /** * Enumerates files in a directory and its subdirectories. * * @param dir * Directory to walk * @return Set of files in that directory and its subdirectories. * @throws IOException */ private Set<Path> listFilesInDir(Path dir) throws IOException { Set<Path> files = new HashSet<Path>(); if (Files.isDirectory(dir)) { try (DirectoryStream<Path> dirItems = Files.newDirectoryStream(dir)) { for (Path dirItem : dirItems) { if (Files.isDirectory(dirItem)) { files.addAll(listFilesInDir(dirItem)); } else if (Files.isRegularFile(dirItem)){ files.add(dirItem.toAbsolutePath()); } } } } return files; } private Path getPayloadDir() { return bagDir.resolve("data/"); } }