package gov.loc.repository.bagit.conformance;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Normalizer;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;
import gov.loc.repository.bagit.exceptions.InvalidBagitFileFormatException;
import gov.loc.repository.bagit.util.PathUtils;
/**
 * Part of the BagIt conformance suite.
 * This checker checks for various problems related to the manifests in a bag.
 * <p>
 * Every finding is logged and added to the supplied set of warnings, unless the corresponding
 * {@link BagitWarning} is contained in the collection of warnings to ignore.
 */
public final class ManifestChecker {
  private static final Logger logger = LoggerFactory.getLogger(ManifestChecker.class);
  private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");

  /** Marker shared by payload manifests ("manifest-...") and tag manifests ("tagmanifest-..."). */
  private static final String MANIFEST_MARKER = "manifest-";

  //case-insensitive file names of well known operating system generated files
  private static final String THUMBS_DB_FILE = "[Tt][Hh][Uu][Mm][Bb][Ss]\\.[Dd][Bb]";
  private static final String DS_STORE_FILE = "\\.[Dd][Ss]_[Ss][Tt][Oo][Rr][Ee]";
  private static final String SPOTLIGHT_FILE = "\\.[Ss][Pp][Oo][Tt][Ll][Ii][Gg][Hh][Tt]-[Vv]100";
  private static final String TRASHES_FILE = "\\.(_.)?[Tt][Rr][Aa][Ss][Hh][Ee][Ss]";
  private static final String FS_EVENTS_FILE = "\\.[Ff][Ss][Ee][Vv][Ee][Nn][Tt][Ss][Dd]";
  private static final String OS_FILES_REGEX = ".*data/(" + THUMBS_DB_FILE + "|" + DS_STORE_FILE + "|" + SPOTLIGHT_FILE + "|" + TRASHES_FILE + "|" + FS_EVENTS_FILE + ")";

  //compiled once because they are evaluated for every manifest line / manifest file
  private static final Pattern OS_FILES_PATTERN = Pattern.compile(OS_FILES_REGEX);
  private static final Pattern WEAK_SHA_PATTERN = Pattern.compile("SHA(1|224|256|384)?");

  private ManifestChecker(){
    //intentionally left empty
  }

  /**
   * Check for all the manifest specific potential problems.
   * Every file directly inside {@code bagitDir} whose name contains "manifest-" is inspected line by line and
   * the checksum algorithm encoded in its name is rated.
   *
   * @param bagitDir the directory that contains the manifest files
   * @param encoding the encoding used to read the manifests
   * @param warnings the set that found warnings are added to
   * @param warningsToIgnore the warnings that should not be reported
   *
   * @throws IOException if a directory or manifest can not be read
   * @throws InvalidBagitFileFormatException if a manifest line does not consist of a checksum and a path
   */
  public static void checkManifests(final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings,
      final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{

    boolean missingTagManifest = true;

    try(final DirectoryStream<Path> files = Files.newDirectoryStream(bagitDir)){
      for(final Path file : files){
        final String filename = PathUtils.getFilename(file);
        if(filename.contains(MANIFEST_MARKER)){
          //anything that contains the marker but does not start with it is treated as a tag manifest
          final boolean isPayloadManifest = filename.startsWith(MANIFEST_MARKER);
          checkData(file, encoding, warnings, warningsToIgnore, isPayloadManifest);
          if(!isPayloadManifest){
            missingTagManifest = false;
          }

          checkAlgorthm(parseAlgorithm(filename), warnings, warningsToIgnore);
        }
      }
    }

    if(!warningsToIgnore.contains(BagitWarning.MISSING_TAG_MANIFEST) && missingTagManifest){
      logger.warn(messages.getString("bag_missing_tag_manifest_warning"), bagitDir);
      warnings.add(BagitWarning.MISSING_TAG_MANIFEST);
    }
  }

  /*
   * Extract the algorithm part of a manifest file name, i.e. the text between the "manifest-" marker and the
   * next dot (or the end of the name when there is no dot). Never throws for a name that contains the marker,
   * unlike indexing into the result of a split.
   */
  private static String parseAlgorithm(final String filename){
    final int start = filename.indexOf(MANIFEST_MARKER) + MANIFEST_MARKER.length();
    final int dot = filename.indexOf('.', start);
    final int end = dot >= 0 ? dot : filename.length();
    return filename.substring(start, end);
  }

  /*
   * Run all line based checks on a single manifest: paths that only differ in case, changed unicode normalization,
   * a "bag within a bag", relative paths and operating system specific files.
   */
  private static void checkData(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest) throws IOException, InvalidBagitFileFormatException{
    //both values are the same for every line, so compute them once
    final boolean isUnicodeEncoding = encoding.name().startsWith("UTF");
    final Path rootDir = manifestFile.getParent();

    try(final BufferedReader reader = Files.newBufferedReader(manifestFile, encoding)){
      final Set<String> paths = new HashSet<>();

      for(String line = reader.readLine(); line != null; line = reader.readLine()){
        String path = parsePath(line);

        path = checkForManifestCreatedWithMD5SumTools(path, warnings, warningsToIgnore);

        //Locale.ROOT keeps the comparison independent of the JVM default locale.
        //Note that an exact duplicate of an earlier path is reported here as well.
        final String lowerCasePath = path.toLowerCase(Locale.ROOT);
        final boolean alreadySeen = !paths.add(lowerCasePath);
        if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE) && alreadySeen){
          logger.warn(messages.getString("different_case_warning"), manifestFile, path);
          warnings.add(BagitWarning.DIFFERENT_CASE);
        }

        if(isUnicodeEncoding){
          checkNormalization(path, rootDir, warnings, warningsToIgnore);
        }

        checkForBagWithinBag(line, warnings, warningsToIgnore, isPayloadManifest);

        checkForRelativePaths(line, warnings, warningsToIgnore, manifestFile);

        checkForOSSpecificFiles(line, warnings, warningsToIgnore, manifestFile);
      }
    }
  }

  /**
   * Split a manifest line at the first run of whitespace and return everything after it (the path).
   *
   * @param line a single line of a manifest
   * @return the part of the line after the first run of whitespace, which may be empty
   *
   * @throws InvalidBagitFileFormatException if the line contains no whitespace and thus has no path part
   */
  static String parsePath(final String line) throws InvalidBagitFileFormatException{
    final String[] parts = line.split("\\s+", 2);
    if(parts.length < 2){
      final String formattedMessage = messages.getString("manifest_line_violated_spec_error");
      throw new InvalidBagitFileFormatException(MessageFormatter.format(formattedMessage, line).getMessage());
    }

    return parts[1];
  }

  /*
   * A path that starts with '*' indicates the manifest was generated by an md5sum like tool.
   * Returns the path without that leading star.
   */
  private static String checkForManifestCreatedWithMD5SumTools(final String path, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
    //startsWith is safe for the empty path that parsePath returns for a line like "checksum "
    if(!path.startsWith("*")){
      return path;
    }

    if(!warningsToIgnore.contains(BagitWarning.MD5SUM_TOOL_GENERATED_MANIFEST)){
      logger.warn(messages.getString("md5sum_generated_line_warning"), path);
      warnings.add(BagitWarning.MD5SUM_TOOL_GENERATED_MANIFEST);
    }

    return path.substring(1);
  }

  /*
   * Check that the file specified has not changed its normalization (i.e. have the bytes changed but it still looks the same?)
   * This is done by looking for a sibling whose path differs from the listed one but is equal after NFD normalization.
   */
  private static void checkNormalization(final String path, final Path rootDir, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore) throws IOException{
    if(warningsToIgnore.contains(BagitWarning.DIFFERENT_NORMALIZATION)){
      return;
    }

    final Path fileToCheck = rootDir.resolve(path).normalize();
    final Path dirToCheck = fileToCheck.getParent();
    if(dirToCheck == null){
      final String formattedMessage = messages.getString("cannot_access_parent_path_error");
      throw new IOException(MessageFormatter.format(formattedMessage, fileToCheck).getMessage()); //to satisfy findbugs
    }
    final String normalizedFileToCheck = normalizePathToNFD(fileToCheck);

    try(final DirectoryStream<Path> files = Files.newDirectoryStream(dirToCheck)){
      for(final Path file : files){
        final String normalizedFile = normalizePathToNFD(file);

        if(!file.equals(fileToCheck) && normalizedFileToCheck.equals(normalizedFile)){
          logger.warn(messages.getString("different_normalization_in_manifest_warning"), fileToCheck);
          warnings.add(BagitWarning.DIFFERENT_NORMALIZATION);
        }
      }
    }
  }

  /**
   * Normalize to Canonical decomposition.
   *
   * @param path the path to normalize
   * @return the string form of the path in unicode normalization form NFD
   */
  static String normalizePathToNFD(final Path path){
    return Normalizer.normalize(path.toString(), Normalizer.Form.NFD);
  }

  /*
   * check for a bag within a bag, i.e. a payload manifest line that itself mentions a manifest file
   */
  private static void checkForBagWithinBag(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest){
    if(!warningsToIgnore.contains(BagitWarning.BAG_WITHIN_A_BAG) && isPayloadManifest && line.contains(MANIFEST_MARKER)){
      logger.warn(messages.getString("bag_within_bag_warning"));
      warnings.add(BagitWarning.BAG_WITHIN_A_BAG);
    }
  }

  /*
   * Check for relative paths (i.e. ./) in the manifest
   */
  private static void checkForRelativePaths(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final Path manifestFile){
    if(!warningsToIgnore.contains(BagitWarning.LEADING_DOT_SLASH) && line.contains("./")){
      logger.warn(messages.getString("leading_dot_slash_warning"), manifestFile, line);
      warnings.add(BagitWarning.LEADING_DOT_SLASH);
    }
  }

  /*
   * like .DS_Store or Thumbs.db
   */
  private static void checkForOSSpecificFiles(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final Path manifestFile){
    if(!warningsToIgnore.contains(BagitWarning.OS_SPECIFIC_FILES) && OS_FILES_PATTERN.matcher(line).matches()){
      logger.warn(messages.getString("os_specific_files_warning"), manifestFile, line);
      warnings.add(BagitWarning.OS_SPECIFIC_FILES);
    }
  }

  /**
   * Check for anything weaker than SHA-512.
   * The name is compared case-insensitively and without hyphens, so both the spelling used in manifest
   * file names ("sha512") and the hyphenated spelling ("SHA-512") are understood.
   *
   * @param algorithm the name of the checksum algorithm
   * @param warnings the set that found warnings are added to
   * @param warningsToIgnore the warnings that should not be reported
   */
  static void checkAlgorthm(final String algorithm, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
    final String upperCaseAlg = algorithm.toUpperCase(Locale.ROOT);
    //a name taken from a manifest file name never contains a hyphen, so compare without it
    final String normalizedAlg = upperCaseAlg.replace("-", "");
    final boolean isWeak = normalizedAlg.startsWith("MD") || WEAK_SHA_PATTERN.matcher(normalizedAlg).matches();

    if(!warningsToIgnore.contains(BagitWarning.WEAK_CHECKSUM_ALGORITHM) && isWeak){
      logger.warn(messages.getString("weak_algorithm_warning"), algorithm);
      warnings.add(BagitWarning.WEAK_CHECKSUM_ALGORITHM);
    }

    else if(!warningsToIgnore.contains(BagitWarning.NON_STANDARD_ALGORITHM) && !"SHA512".equals(normalizedAlg)){
      logger.warn(messages.getString("non_standard_algorithm_warning"), algorithm);
      warnings.add(BagitWarning.NON_STANDARD_ALGORITHM);
    }
  }

  //for unit test only
  static String getOsFilesRegex() {
    return OS_FILES_REGEX;
  }
}