/*
* Copyright 2010-2015 Global Biodiversity Information Facility.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.gbif.dwca.io;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.TermFactory;
import org.gbif.dwca.io.ArchiveField.DataType;
import org.gbif.util.DownloadUtil;
import org.gbif.utils.file.CompressionUtil;
import org.gbif.utils.file.csv.CSVReader;
import org.gbif.utils.file.csv.CSVReaderFactory;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Factory used to build {@link Archive} objects from a DarwinCore Archive. The source can be a
 * compressed archive file, an already expanded archive folder, or a single delimited data file
 * with a header row.
 *
 * @author mdoering
 *
 */
public class ArchiveFactory {

  private static final TermFactory TERM_FACTORY = TermFactory.instance();
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveFactory.class);

  /**
   * File suffixes (matched case-insensitively) considered when looking for a data file in a folder
   * that has no meta descriptor.
   */
  private static final List<String> DATA_FILE_SUFFICES = ImmutableList.of(".csv", ".txt", ".tsv", ".tab", ".text", ".data", ".dwca");

  /**
   * Predefined mapping between {@link Term} and its rowType.
   * Ordering is important since the first found will be used.
   */
  private static final Map<Term, Term> TERM_TO_ROW_TYPE;
  static {
    Map<Term, Term> idToRowType = new LinkedHashMap<>();
    idToRowType.put(DwcTerm.occurrenceID, DwcTerm.Occurrence);
    idToRowType.put(DwcTerm.taxonID, DwcTerm.Taxon);
    idToRowType.put(DwcTerm.eventID, DwcTerm.Event);
    TERM_TO_ROW_TYPE = Collections.unmodifiableMap(idToRowType);
  }

  /**
   * Terms that can represent an identifier within a file, in order of preference.
   */
  private static final List<Term> ID_TERMS = Collections.unmodifiableList(
    Arrays.asList(DwcTerm.occurrenceID, DwcTerm.taxonID, DwcTerm.eventID, DcTerm.identifier));

  private static final SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();
  static {
    SAX_FACTORY.setNamespaceAware(true);
    SAX_FACTORY.setValidating(false);
    // The meta descriptor can originate from a downloaded archive, so do not let the parser
    // resolve external entities or fetch external DTDs while reading it.
    disableSaxFeature("http://xml.org/sax/features/external-general-entities");
    disableSaxFeature("http://xml.org/sax/features/external-parameter-entities");
    disableSaxFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd");
  }

  /**
   * Switches a feature of the shared SAX factory off. Parsers that do not know or support the
   * feature are tolerated: the failure is logged and parsing stays possible.
   *
   * @param feature the SAX feature URI to disable
   */
  private static void disableSaxFeature(String feature) {
    try {
      SAX_FACTORY.setFeature(feature, false);
    } catch (javax.xml.parsers.ParserConfigurationException | org.xml.sax.SAXException e) {
      LOG.warn("XML parser does not support disabling feature {}", feature, e);
    }
  }

  /**
   * Opens an archive from a URL, downloading and decompressing it.
   * The download is stored as {@code dwca-download} and expanded into a {@code dwca} folder,
   * both located inside the working directory.
   *
   * @param archiveUrl the location of a compressed archive or single data file
   * @param workingDir writable directory to download to and decompress archive
   *
   * @return the opened archive
   */
  public static Archive openArchive(URL archiveUrl, File workingDir) throws IOException, UnsupportedArchiveException {
    File downloadTo = new File(workingDir, "dwca-download");
    File dwca = new File(workingDir, "dwca");
    DownloadUtil.download(archiveUrl, downloadTo);
    return openArchive(downloadTo, dwca);
  }

  /**
   * Opens an archive from a local file and decompresses or copies it into the given archive directory.
   * Make sure the archive directory does not contain files already, any existing files will be removed!
   *
   * If the source archive is an uncompressed, single data file and a valid archive, it will be copied as is
   * to the archiveDir. In that case the returned archive was opened from, and its location still refers to,
   * the original file rather than the copy.
   *
   * @param archiveFile the location of a compressed archive or single data file
   * @param archiveDir empty, writable directory used to keep decompress archive in
   *
   * @return the opened archive
   */
  public static Archive openArchive(File archiveFile, File archiveDir) throws IOException, UnsupportedArchiveException {
    if (archiveDir.exists()) {
      // clean up any existing folder
      LOG.debug("Deleting existing archive folder [{}]", archiveDir.getAbsolutePath());
      org.gbif.utils.file.FileUtils.deleteDirectoryRecursively(archiveDir);
    }
    FileUtils.forceMkdir(archiveDir);

    // try to decompress archive
    try {
      CompressionUtil.decompressFile(archiveDir, archiveFile, true);
      // we keep subfolder, but often the entire archive is within one subfolder. Remove that root folder if present
      flattenSingleRootFolder(archiveDir);
      // continue to read archive from the tmp dir
      return openArchive(archiveDir);

    } catch (CompressionUtil.UnsupportedCompressionType e) {
      LOG.debug("Could not uncompress archive [{}], try to read as single text file", archiveFile, e);
      // If its a text file only we will get this exception - but also for corrupt compressions
      // try to open as text file only and if successful copy file to archive dir
      Archive arch = openArchiveDataFile(archiveFile);
      Files.copy(archiveFile, new File(archiveDir, archiveFile.getName()));
      return arch;
    }
  }

  /**
   * If the decompressed archive consists of exactly one visible entry and that entry is a directory,
   * moves the files found in it up into the archive directory. Does nothing otherwise, including when
   * the directory listing is unavailable.
   *
   * @param archiveDir the directory the archive was decompressed into
   */
  private static void flattenSingleRootFolder(File archiveDir) {
    File[] rootFiles = archiveDir.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
    // listFiles yields null on I/O problems; treat that like "nothing to flatten"
    if (rootFiles == null || rootFiles.length != 1 || !rootFiles[0].isDirectory()) {
      return;
    }
    File root = rootFiles[0];
    // single root dir, flatten structure
    LOG.debug("Removing single root folder {} found in decompressed archive", root.getAbsoluteFile());
    // a null directory filter means sub directories of the root are not searched
    for (File f : FileUtils.listFiles(root, TrueFileFilter.TRUE, null)) {
      File target = new File(archiveDir, f.getName());
      if (!f.renameTo(target)) {
        // best effort only: keep going, but do not hide the failure
        LOG.warn("Failed to move {} to {}", f.getAbsolutePath(), target.getAbsolutePath());
      }
    }
  }

  /**
   * Opens a dwca archive which is just a single decompressed data file with headers, e.g. a csv or tab delimited file
   *
   * @param dataFile the data file whose header row describes the columns
   *
   * @return the validated archive having the data file as its core
   */
  public static Archive openArchiveDataFile(File dataFile) throws IOException, UnsupportedArchiveException {
    Archive archive = new Archive();
    archive.setLocation(dataFile);

    ArchiveFile coreFile = readFileHeaders(dataFile);
    archive.setCore(coreFile);

    // check if we also have a metadata file next to this data file
    discoverMetadataFile(archive, dataFile.getParentFile());

    // final validation
    return validateArchive(archive);
  }

  /**
   * Opens an expanded archive. If a meta descriptor is present it defines the archive, otherwise the
   * folder must contain exactly one visible data file which then becomes the core.
   *
   * @param dwcaFolder the location of an expanded dwc archive directory or just a single dwc text file
   *
   * @return the validated archive
   *
   * @throws FileNotFoundException if the given location does not exist
   * @throws UnsupportedArchiveException if no meta descriptor exists and the folder does not hold exactly
   *         one data file, or if the archive fails validation
   */
  public static Archive openArchive(File dwcaFolder) throws IOException, UnsupportedArchiveException {
    if (!dwcaFolder.exists()) {
      throw new FileNotFoundException("Archive folder not existing: " + dwcaFolder.getAbsolutePath());
    }
    // delegate to open data file method if its a single file, not a folder
    if (dwcaFolder.isFile()) {
      return openArchiveDataFile(dwcaFolder);
    }

    Archive archive = new Archive();
    archive.setLocation(dwcaFolder);

    renameLegacyBackslashFiles(dwcaFolder);

    // read metadata
    File mf = new File(dwcaFolder, Archive.META_FN);
    if (mf.exists()) {
      // read metafile, making sure the stream is released again
      try (InputStream metaStream = new FileInputStream(mf)) {
        readMetaDescriptor(archive, metaStream);
      }
    } else {
      // meta.xml lacking.
      // Try to detect data files ourselves as best as we can.
      archive.setCore(detectSingleDataFile(dwcaFolder));
    }

    // check if we also have a metadata file next to this data file
    discoverMetadataFile(archive, mf.getParentFile());

    // final validation
    return validateArchive(archive);
  }

  /**
   * Accommodates archives coming from legacy IPTs which put a "\" before each filename by stripping
   * that leading backslash from xml and txt files located directly in the folder.
   * http://dev.gbif.org/issues/browse/POR-2396
   * https://code.google.com/p/gbif-providertoolkit/issues/detail?id=1015
   *
   * @param dwcaFolder the expanded archive folder
   */
  private static void renameLegacyBackslashFiles(File dwcaFolder) {
    Iterator<File> iter = FileUtils.iterateFiles(dwcaFolder, new String[] {"xml", "txt"}, false);
    while (iter.hasNext()) {
      File f = iter.next();
      String orig = f.getName();
      if (orig.startsWith("\\")) {
        String replacement = orig.substring(1);
        LOG.info("Renaming file from {} to {}", orig, replacement);
        if (!f.renameTo(new File(dwcaFolder, replacement))) {
          LOG.warn("Failed to rename file {} to {}", orig, replacement);
        }
      }
    }
  }

  /**
   * Looks for a single, visible text data file in the folder and reads its headers.
   *
   * @param dwcaFolder the expanded archive folder lacking a meta descriptor
   *
   * @return the detected core file, located by its file name
   *
   * @throws UnsupportedArchiveException if the folder does not contain exactly one matching data file
   */
  private static ArchiveFile detectSingleDataFile(File dwcaFolder) throws IOException, UnsupportedArchiveException {
    List<File> dataFiles = new ArrayList<>();
    for (String suffix : DATA_FILE_SUFFICES) {
      FileFilter ff = FileFilterUtils.and(
        FileFilterUtils.suffixFileFilter(suffix, IOCase.INSENSITIVE), HiddenFileFilter.VISIBLE
      );
      File[] matches = dwcaFolder.listFiles(ff);
      // null signals the folder could not be listed; it then simply contributes no candidates
      if (matches != null) {
        dataFiles.addAll(Arrays.asList(matches));
      }
    }

    if (dataFiles.size() != 1) {
      throw new UnsupportedArchiveException(
        "The archive given is a folder with more or less than 1 data files having a csv, txt or tab suffix");
    }

    File dataFile = new File(dwcaFolder, dataFiles.get(0).getName());
    ArchiveFile coreFile = readFileHeaders(dataFile);
    // replace the placeholder location set while reading the headers with the actual file name
    coreFile.getLocations().clear();
    coreFile.addLocation(dataFile.getName());
    return coreFile;
  }

  /**
   * Registers the first popular metadata file name ({@code eml.xml}, {@code metadata.xml}) that exists
   * in the folder, unless the archive already has a metadata location.
   *
   * @param archive the archive to update
   * @param folder the folder to search in
   */
  private static void discoverMetadataFile(Archive archive, File folder) {
    if (archive.getMetadataLocation() != null) {
      return;
    }
    // search for popular metadata filenames
    for (String metadataFN : Arrays.asList("eml.xml", "metadata.xml")) {
      File emlFile = new File(folder, metadataFN);
      if (emlFile.exists()) {
        archive.setMetadataLocation(metadataFN);
        break;
      }
    }
  }

  /**
   * Builds an {@link ArchiveFile} from the header row of a delimited data file. Encoding, delimiter and
   * quote character are taken from the reader, every header recognised as a term becomes a string field
   * at its column index, and the id column and row type are derived from the terms found.
   *
   * @param dataFile the delimited file with a header row
   *
   * @return the described file; its location is a {@code null} placeholder
   */
  private static ArchiveFile readFileHeaders(File dataFile) throws UnsupportedArchiveException, IOException {
    ArchiveFile dwcFile = new ArchiveFile();
    dwcFile.addLocation(null);
    dwcFile.setIgnoreHeaderLines(1);

    String[] headers;
    try (CSVReader reader = CSVReaderFactory.build(dataFile)) {
      // copy found delimiters & encoding
      dwcFile.setEncoding(reader.encoding);
      dwcFile.setFieldsTerminatedBy(reader.delimiter);
      dwcFile.setFieldsEnclosedBy(reader.quoteChar);
      headers = reader.getHeader();
    }

    // detect dwc terms as good as we can based on header row
    // NOTE(review): guarding against a missing header row, presumably possible for an empty file - confirm
    if (headers != null) {
      int index = 0;
      for (String head : headers) {
        // empty and single character headers are skipped, their column index is still counted
        if (head != null && head.length() > 1) {
          try {
            Term dt = TERM_FACTORY.findTerm(head);
            ArchiveField field = new ArchiveField(index, dt, null, DataType.string);
            dwcFile.addField(field);
          } catch (IllegalArgumentException e) {
            LOG.warn("Illegal term name >>{}<< found in header, ignore column {}", head, index);
          }
        }
        index++;
      }
    }

    List<Term> headerAsTerm = dwcFile.getFields().keySet()
      .stream()
      .collect(Collectors.toList());

    determineRecordIdentifier(headerAsTerm).ifPresent(
      t -> dwcFile.setId(dwcFile.getField(t))
    );

    determineRowType(headerAsTerm).ifPresent(
      t -> dwcFile.setRowType(t)
    );
    return dwcFile;
  }

  /**
   * Parses a meta descriptor into the given archive. A leading byte order mark is skipped.
   * The stream is not closed here, that is left to the caller.
   *
   * @param archive the archive to populate
   * @param metaDescriptor stream of the meta descriptor XML
   *
   * @throws UnsupportedArchiveException wrapping any failure raised while parsing
   */
  @VisibleForTesting
  protected static void readMetaDescriptor(Archive archive, InputStream metaDescriptor) throws UnsupportedArchiveException {
    try {
      SAXParser p = SAX_FACTORY.newSAXParser();
      MetaXMLSaxHandler mh = new MetaXMLSaxHandler(archive);
      LOG.debug("Reading archive metadata file");
      p.parse(new BOMInputStream(metaDescriptor), mh);
    } catch (Exception e1) {
      LOG.warn("Exception caught", e1);
      throw new UnsupportedArchiveException(e1);
    }
  }

  /**
   * Validates the core and all extension files of an archive and logs basic statistics.
   *
   * @param archive the archive to check
   *
   * @return the same archive instance
   *
   * @throws UnsupportedArchiveException if the core or any extension is invalid
   */
  private static Archive validateArchive(Archive archive) throws UnsupportedArchiveException {
    validateCoreFile(archive.getCore(), !archive.getExtensions().isEmpty());
    for (ArchiveFile af : archive.getExtensions()) {
      validateExtensionFile(af);
    }
    // report basic stats
    LOG.debug("Archive contains {} described extension files", archive.getExtensions().size());
    LOG.debug("Archive contains {} core properties", archive.getCore().getFields().size());
    return archive;
  }

  /**
   * Validates the core file. A core without an id column while extensions exist is only warned about.
   *
   * @param f the core file, rejected if {@code null}
   * @param hasExtensions whether the archive declares any extension
   */
  private static void validateCoreFile(ArchiveFile f, boolean hasExtensions) throws UnsupportedArchiveException {
    // must come first, otherwise a missing core fails with a NullPointerException below
    requireDataFile(f);
    if (hasExtensions && f.getId() == null) {
      LOG.warn("DwC-A core data file {} is lacking an id column. No extensions allowed in this case", f.getTitle());
    }
    validateFile(f);
  }

  /**
   * Validates an extension file, which in addition to the common checks must declare an id.
   *
   * @param f the extension file, rejected if {@code null}
   */
  private static void validateExtensionFile(ArchiveFile f) throws UnsupportedArchiveException {
    requireDataFile(f);
    if (f.getId() == null) {
      throw new UnsupportedArchiveException(
        "DwC-A data file " + f.getTitle() + " requires an id or foreign key to the core id");
    }
    validateFile(f);
  }

  /**
   * Checks shared by core and extension files: the file must exist, have a location and an encoding.
   *
   * @param f the file to check
   */
  private static void validateFile(ArchiveFile f) throws UnsupportedArchiveException {
    requireDataFile(f);
    if (f.getLocationFile() == null) {
      throw new UnsupportedArchiveException("DwC-A data file " + f.getTitle() + " requires a location");
    }
    if (f.getEncoding() == null) {
      throw new UnsupportedArchiveException("DwC-A data file " + f.getTitle() + " requires a character encoding");
    }
  }

  /**
   * Rejects a missing data file with the archive specific exception.
   *
   * @param f the file that must not be {@code null}
   */
  private static void requireDataFile(ArchiveFile f) throws UnsupportedArchiveException {
    if (f == null) {
      throw new UnsupportedArchiveException("DwC-A data file is NULL");
    }
  }

  /**
   * Tries to determine the rowType based on a list of {@link Term}.
   * The first key of the predefined term to rowType mapping contained in the list decides.
   *
   * @param terms the list can contain null values
   *
   * @return {@link Term} as {@code Optional} or {@code Optional.empty()} if can not be determined
   */
  static Optional<Term> determineRowType(List<Term> terms) {
    return TERM_TO_ROW_TYPE.entrySet().stream()
      .filter(ke -> terms.contains(ke.getKey()))
      .map(Map.Entry::getValue).findFirst();
  }

  /**
   * Tries to determine the record identifier based on a list of {@link Term}.
   *
   * @param terms the list can contain null values
   *
   * @return {@link Term} as {@code Optional} or {@code Optional.empty()} if can not be determined
   */
  static Optional<Term> determineRecordIdentifier(List<Term> terms) {
    //try to find the first matching term respecting the order defined by ID_TERMS
    return ID_TERMS.stream()
      .filter(terms::contains)
      .findFirst();
  }
}