package dk.statsbiblioteket.medieplatform.autonomous.iterator.filesystem.transforming;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.AbstractFileFilter;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.common.DelegatingTreeIterator;
import dk.statsbiblioteket.util.Pair;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* This is the transforming iterator for filesystems. It allows one to iterate over a tree structure on the file system
* but having it transformed inline to a format that is suitable to ingest into doms.
*
* The transformations are
*
* 1. All data files (i.e. the ones matched by the dataFilePattern) will be made into special folders. The contents
* of the data file will reside in a virtual file called "contents" in that folder.
* 2. Prefix grouping. If a folder contains a number of files with a common prefix, these will be grouped into a
* virtual folder, named after the prefix. This only happens if there is more than one common prefix.
* 2b. If only one of the groups contains no data files, this group will be cancelled, and the files will reside in
* the real folder.
*
* There will be no virtual folders inside virtual folders.
*/
public class TransformingIteratorForFileSystems extends CommonTransformingIterator {
    /** Default checksum file postfix (a plain string suffix, not a pattern). */
    public static final String CHECKSUM_POSTFIX_DEFAULT_VALUE = ".md5";
    /** Default value for the ignored-files setting: a comma-separated list of file names. */
    public static final String IGNORED_FILES_DEFAULT_VALUE = "transfer_complete,transfer_acknowledged,delete_ok";
    /** Regular expression matching file names that end in ".jp2". */
    public static final String DATA_FILE_PATTERN_JP2_VALUE = ".*\\.jp2$";
    /** Default grouping regular expression: a literal dot. */
    public static final String GROUPING_PATTERN_DEFAULT_VALUE = "\\.";

    /** Names of files that are never exposed as attributes. Never null. */
    private final List<String> ignoredFiles;

    /**
     * Virtual (prefix-group) children of this folder. Filled by {@link #initilizeAttributeIterator()} and
     * appended after the real sub-folders by {@link #initializeChildrenIterator()}.
     */
    protected List<DelegatingTreeIterator> virtualChildren;

    /**
     * Create the transforming iterator for file systems, using the parent of the root folder as prefix folder.
     *
     * @param id              The root folder
     * @param groupingPattern The grouping regular expression, i.e. the char used as separator between prefix and
     *                        postfix. Should be "\\."
     * @param dataFilePattern a regular expression that should match the names of all data files
     * @param checksumPostfix this is the postfix for the checksum files. Note, THIS IS NOT A PATTERN
     * @param ignoredFiles    Names of files to ignore during transformation. {@code null} means no files are
     *                        ignored.
     */
    public TransformingIteratorForFileSystems(File id, String groupingPattern, String dataFilePattern,
                                              String checksumPostfix, List<String> ignoredFiles) {
        this(id, id.getParentFile(), groupingPattern, dataFilePattern, checksumPostfix, ignoredFiles);
    }

    /**
     * Create the transforming iterator for file systems with an explicit prefix folder.
     *
     * @param id              The root folder
     * @param prefix          The prefix folder
     * @param groupingPattern The grouping regular expression, i.e. the char used as separator between prefix and
     *                        postfix. Should be "\\."
     * @param dataFilePattern a regular expression that should match the names of all data files
     * @param checksumPostfix this is the postfix for the checksum files. Note, THIS IS NOT A PATTERN
     * @param ignoredFiles    Names of files to ignore during transformation. {@code null} means no files are
     *                        ignored.
     */
    protected TransformingIteratorForFileSystems(File id, File prefix, String groupingPattern,
                                                 String dataFilePattern, String checksumPostfix,
                                                 List<String> ignoredFiles) {
        super(id, prefix, dataFilePattern, checksumPostfix, groupingPattern);
        virtualChildren = new ArrayList<>();
        // A null list would otherwise only surface as a NullPointerException deep inside the file filter.
        this.ignoredFiles = ignoredFiles != null ? ignoredFiles : new ArrayList<String>();
    }

    /**
     * Builds the iterator over the children of this folder: one transforming iterator per real sub-folder,
     * followed by whatever is currently held in {@link #virtualChildren}.
     *
     * NOTE(review): the virtual children are only populated by {@link #initilizeAttributeIterator()}; the order
     * in which the superclass invokes the two methods is not visible from this class.
     *
     * @return an iterator over the real sub-folders followed by the virtual children
     * @throws IllegalStateException if the sub-folders of this folder could not be listed
     */
    @Override
    protected Iterator<DelegatingTreeIterator> initializeChildrenIterator() {
        File[] subFolders = id.listFiles((FileFilter) DirectoryFileFilter.DIRECTORY);
        if (subFolders == null) {
            // File.listFiles returns null (rather than throwing) when the path is not a directory or an I/O
            // error occurs. Fail with a message naming the directory instead of an anonymous NPE below.
            throw new IllegalStateException("Failed to list directory '" + id.getAbsolutePath() + "'");
        }
        List<DelegatingTreeIterator> result = new ArrayList<>(subFolders.length + virtualChildren.size());
        for (File subFolder : subFolders) {
            result.add(
                    new TransformingIteratorForFileSystems(
                            subFolder, getBatchFolder(), getGroupingChar(), getDataFilePattern(),
                            getChecksumPostfix(), ignoredFiles));
        }
        result.addAll(virtualChildren);
        return result.iterator();
    }

    /**
     * Builds the iterator over the attribute files of this folder.
     *
     * The candidate files are the regular files directly in this folder (sub-folders are not searched) that
     * are neither checksum files nor ignored files. If any of them is a data file, the files are grouped by
     * prefix: every group except the "no data files" group (if there is one) becomes a virtual child folder,
     * and only the files of the "no data files" group remain attributes of this folder.
     *
     * As a side effect the created virtual folders are appended to {@link #virtualChildren}.
     *
     * @return an iterator over the files that remain direct attributes of this folder
     * @throws IOException if this folder is not a readable directory
     */
    @Override
    protected Iterator<File> initilizeAttributeIterator() throws IOException {
        if (!(id.isDirectory() && id.canRead())) {
            throw new IOException("Failed to read directory '" + id.getAbsolutePath() + "'");
        }
        // A null directory filter makes FileUtils.listFiles non-recursive.
        Collection<File> attributes = FileUtils.listFiles(
                id, new AbstractFileFilter() {
                    @Override
                    public boolean accept(File file) {
                        boolean isFile = file.isFile();
                        boolean isNotChecksum = !file.getName().endsWith(getChecksumPostfix());
                        boolean isNotIgnored = !ignoredFiles.contains(file.getName());
                        return isFile && isNotChecksum && isNotIgnored;
                    }
                }, null);
        //If there is any datafiles, we group by prefix. If there are no datafiles, we expect the structure to be flat
        if (containsDatafiles(attributes)) {
            Map<String, List<File>> groupedByPrefix = groupByPrefix(attributes);
            Pair<String, List<File>> noDataGroup = getShortestNoDataFilesGroup(groupedByPrefix);
            if (noDataGroup != null) {
                // The files of this group stay in the real folder instead of getting a virtual folder.
                attributes = noDataGroup.getRight();
            }
            for (Map.Entry<String, List<File>> entry : groupedByPrefix.entrySet()) {
                String prefix = entry.getKey();
                if (noDataGroup != null && prefix.equals(noDataGroup.getLeft())) {
                    continue;
                }
                List<File> group = entry.getValue();
                virtualChildren.add(
                        new VirtualIteratorForFileSystems(
                                id,
                                prefix,
                                getBatchFolder(),
                                getDataFilePattern(),
                                group,
                                getGroupingChar(),
                                getChecksumPostfix()));
                // Files that moved into a virtual folder are no longer direct attributes of this folder.
                attributes.removeAll(group);
            }
        }
        return attributes.iterator();
    }

    /**
     * Group a collection of files according to their prefix.
     *
     * @param files the files to group
     *
     * @return a map of prefixes to lists of files
     * @see #getPrefix(java.io.File)
     */
    private Map<String, List<File>> groupByPrefix(Collection<File> files) {
        Map<String, List<File>> prefixToFiles = new HashMap<>();
        for (File file : files) {
            String prefix = getPrefix(file);
            List<File> group = prefixToFiles.get(prefix);
            if (group == null) {
                // First file seen with this prefix: register a new group for it.
                group = new ArrayList<>();
                prefixToFiles.put(prefix, group);
            }
            group.add(file);
        }
        return prefixToFiles;
    }
}