/*
* This is eMonocot, a global online biodiversity information resource.
*
* Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
*
* eMonocot is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* The complete text of the GNU Affero General Public License is in the source repository as the file
* ‘COPYING’. It is also available from <http://www.gnu.org/licenses/>.
*/
package org.emonocot.job.dwc.read;
import org.emonocot.api.job.TermFactory;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveField;
import org.gbif.dwc.text.ArchiveField.DataType;
import org.gbif.dwc.text.ArchiveFile;
import org.gbif.dwc.text.UnsupportedArchiveException;
import org.gbif.file.CSVReader;
import org.gbif.file.CSVReaderFactory;
import org.gbif.file.DownloadUtil;
import org.gbif.metadata.handler.BasicMetadataSaxHandler;
import org.gbif.utils.file.BomSafeInputStreamWrapper;
import org.gbif.utils.file.CompressionUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
public class ArchiveFactory {
/**
* SAX handler to parse a meta.xml descriptor for dwc archives. It populates a given archive instance and ignores
* namespaces. The parser needs to be namespace aware!
*/
static class MetaHandler extends BasicMetadataSaxHandler {
private static final String NS_DWCA = "http://rs.tdwg.org/dwc/text/";
private Archive archive;
private ArchiveFile af;
protected MetaHandler(Archive archive) {
this.archive = archive;
}
private static Character getFirstChar(String x) throws UnsupportedArchiveException {
if (x == null || x.length() == 0) {
return null;
}
if (x.length() == 1) {
return x.charAt(0);
}
if (x.equalsIgnoreCase("\\t")) {
return '\t';
}
if (x.equalsIgnoreCase("\\n")) {
return '\n';
}
if (x.equalsIgnoreCase("\\r")) {
return '\r';
}
if (x.length() > 1) {
throw new UnsupportedArchiveException(
"Only darwin core archives with a single quotation character are supported, but found >>>" + x + "<<<");
}
return ' ';
}
private static void makeLocationPathsAbsolute(ArchiveFile af, File root) {
// I know this is verbose and stupid, but its easy coded now without the hassle of deep copying lists, etc...
List<String> newLocs = new ArrayList<String>();
for (String loc : af.getLocations()) {
newLocs.add(new File(root, af.getLocation()).getAbsolutePath());
}
af.getLocations().clear();
for (String loc : newLocs) {
af.getLocations().add(loc);
}
}
private static String unescapeBackslash(String x) {
if (x == null || x.length() == 0) {
return null;
}
return x.replaceAll("\\\\t", String.valueOf('\t')).replaceAll("\\\\n", String.valueOf('\n'))
.replaceAll("\\\\r", String.valueOf('\r')).replaceAll("\\\\f", String.valueOf('\f'));
}
private ArchiveFile buildArchiveFile(Attributes attr) throws UnsupportedArchiveException {
ArchiveFile dwcFile = new ArchiveFile();
// extract the File attributes
if (getAttr(attr, "encoding") != null) {
dwcFile.setEncoding(getAttr(attr, "encoding"));
}
if (getAttr(attr, "fieldsTerminatedBy") != null) {
dwcFile.setFieldsTerminatedBy(unescapeBackslash(getAttr(attr, "fieldsTerminatedBy")));
}
if (getAttr(attr, "fieldsEnclosedBy") != null) {
dwcFile.setFieldsEnclosedBy(getFirstChar(getAttr(attr, "fieldsEnclosedBy")));
}
if (getAttr(attr, "linesTerminatedBy") != null) {
dwcFile.setLinesTerminatedBy(unescapeBackslash(getAttr(attr, "linesTerminatedBy")));
}
if (getAttr(attr, "rowType") != null) {
dwcFile.setRowType(getAttr(attr, "rowType"));
}
String ignoreHeaderLines = getAttr(attr, "ignoreHeaderLines");
try {
dwcFile.setIgnoreHeaderLines(Integer.parseInt(ignoreHeaderLines));
} catch (NumberFormatException ignored) { // swallow null or bad value
}
return dwcFile;
}
/**
* Build an ArchiveField object based on xml attributes.
*/
private ArchiveField buildField(Attributes attributes) {
// build field
Term term = TermFactory.findTerm(getAttr(attributes, "term"));
String defaultValue = getAttr(attributes, "default");
DataType type = DataType.findByXmlSchemaType(getAttr(attributes, "type"));
if (type == null) {
type = DataType.string;
}
String indexAsString = getAttr(attributes, "index");
Integer index = null;
if (indexAsString != null) {
// let bad errors be thrown up
try {
index = Integer.parseInt(indexAsString);
} catch (NumberFormatException e) {
throw new UnsupportedArchiveException(e);
}
}
return new ArchiveField(index, term, defaultValue, type);
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
// calling the super method to stringify the character buffer
super.endElement(uri, localName, qName);
if (localName.equalsIgnoreCase("archive")) {
// archive
} else if (localName.equalsIgnoreCase("core")) {
// update location to absolute path incl archive path
// makeLocationPathsAbsolute(af, archive.getLocation());
archive.setCore(af);
} else if (localName.equalsIgnoreCase("extension")) {
// update location to absolute path incl archive path
// makeLocationPathsAbsolute(af, archive.getLocation());
archive.addExtension(af);
} else if (localName.equalsIgnoreCase("location")) {
// a file location
af.addLocation(content);
}
}
private String getAttr(Attributes attributes, String key) {
String val = null;
if (attributes != null) {
// try without NS
val = attributes.getValue("", key);
if (val == null) {
// try with dwca NS if nothing found
val = attributes.getValue(NS_DWCA, key);
}
}
return val;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
super.startElement(uri, localName, qName, attributes);
if (localName.equalsIgnoreCase("archive") || localName.equalsIgnoreCase("stararchive")) {
// metadata location
archive.setMetadataLocation(getAttr(attributes, "metadata"));
} else if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) {
// archive/extension
af = new ArchiveFile();
if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) {
// archive/core or archive/extension
af = buildArchiveFile(attributes);
}
} else if (localName.equalsIgnoreCase("coreid") || localName.equalsIgnoreCase("id")) {
ArchiveField field = buildField(attributes);
if (af != null) {
af.setId(field);
} else {
log.warn(localName + " field found outside of an archive file");
}
} else if (localName.equalsIgnoreCase("field")) {
ArchiveField field = buildField(attributes);
if (af != null) {
af.addField(field);
} else {
log.warn("field found outside of an archive file");
}
}
}
}
private static final TermFactory TERM_FACTORY = new TermFactory();
private static final Logger LOG = LoggerFactory.getLogger(ArchiveFactory.class);
private static final SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();
static {
SAX_FACTORY.setNamespaceAware(true);
SAX_FACTORY.setValidating(false);
}
/**
* Opens an archive from a URL, downloading and decompressing it.
*
* @param archiveUrl the location of a compressed archive or single data file
* @param workingDir writable directory to download to and decompress archive
*/
public static Archive openArchive(URL archiveUrl, File workingDir) throws IOException, UnsupportedArchiveException {
File downloadTo = new File(workingDir, "dwca-download");
File dwca = new File(workingDir, "dwca");
DownloadUtil.download(archiveUrl, downloadTo);
return openArchive(downloadTo, dwca);
}
/**
* Opens an archive from a local file and decompresses or copies it into the given archive directory.
* Make sure the archive directory does not contain files already!
*
* @param archiveFile the location of a compressed archive or single data file
* @param archiveDir empty, writable directory used to keep decompress archive in
*/
public static Archive openArchive(File archiveFile, File archiveDir) throws IOException, UnsupportedArchiveException {
// try to decompress archive
try {
List<File> files = CompressionUtil.decompressFile(archiveDir, archiveFile);
// continue to read archive from the tmp dir
return openArchive(archiveDir);
} catch (CompressionUtil.UnsupportedCompressionType e) {
// If its a text file only we will get this exception - but also for corrupt compressions
// try to open as text file only
return openArchive(archiveFile);
}
}
/**
* @param unzippedFolderLocation the location of an expanded archive directory or just a single dwc text file
*/
public static Archive openArchive(File unzippedFolderLocation) throws IOException, UnsupportedArchiveException {
Archive archive = new Archive();
archive.setLocation(unzippedFolderLocation);
File mf = null;
// see if we can find a meta.xml descriptor file
if (unzippedFolderLocation.isFile()) {
String suffix = unzippedFolderLocation.getName().substring(unzippedFolderLocation.getName().lastIndexOf("."));
if (suffix.equalsIgnoreCase(".xml")) {
// could be a metafile on its own pointing to remote data files...
mf = unzippedFolderLocation;
}
} else {
mf = new File(unzippedFolderLocation, "meta.xml");
}
// read metadata
if (mf != null && mf.exists()) {
// read metafile
readMetaDescriptor(archive, new FileInputStream(mf), true);
if (archive.getMetadataLocation() == null) {
// search for known metadata filenames
File emlFile = new File(mf.getParentFile(), "eml.xml");
if (emlFile.exists()) {
archive.setMetadataLocation("eml.xml");
}
}
} else {
// try to detect data files ourselves as best as we can...
// currently support a single data file or a folder which contains a single data file
if (unzippedFolderLocation.isFile()) {
ArchiveFile coreFile = readFileHeaders(unzippedFolderLocation);
archive.setCore(coreFile);
} else {
// folder. see if we got only 1 file in there...
List<File> dataFiles = new ArrayList<File>();
FilenameFilter ff = new SuffixFileFilter(".csv", IOCase.INSENSITIVE);
dataFiles.addAll(Arrays.asList(unzippedFolderLocation.listFiles(ff)));
ff = new SuffixFileFilter(".txt", IOCase.INSENSITIVE);
dataFiles.addAll(Arrays.asList(unzippedFolderLocation.listFiles(ff)));
if (dataFiles.size() == 1) {
// set pointer to data file
File dataFile = new File(unzippedFolderLocation, dataFiles.get(0).getName());
archive.setLocation(unzippedFolderLocation);
if (archive.getMetadataLocation() == null && unzippedFolderLocation.isDirectory()) {
// search for known metadata filenames
File emlFile = new File(unzippedFolderLocation, "eml.xml");
if (emlFile.exists()) {
archive.setMetadataLocation("eml.xml");
}
}
ArchiveFile coreFile = readFileHeaders(dataFile);
coreFile.getLocations().clear();
coreFile.addLocation(dataFile.getName());
archive.setCore(coreFile);
} else {
throw new UnsupportedArchiveException(
"The archive given is a folder with more or less than 1 data files having a txt or csv suffix");
}
}
}
// final validation
validateArchive(archive);
// report basic stats
LOG.debug("Archive contains " + archive.getExtensions().size() + " described extension files");
LOG.debug("Archive contains " + archive.getCore().getFields().size() + " core properties");
return archive;
}
/**
* Use internal term factory to find/build a new ConceptTerm based on its qualified name.
*
* @param termName the qualified term name
*
* @return the ConceptTerm either as one of the existing enums or an UnknownTerm singleton
*/
public static Term findTerm(String termName) {
return TERM_FACTORY.findTerm(termName);
}
private static ArchiveFile readFileHeaders(File dataFile) throws UnsupportedArchiveException, IOException {
ArchiveFile dwcFile = new ArchiveFile();
dwcFile.addLocation(null);
dwcFile.setIgnoreHeaderLines(1);
CSVReader reader = CSVReaderFactory.build(dataFile);
// copy found delimiters & encoding
dwcFile.setEncoding(reader.encoding);
dwcFile.setFieldsTerminatedBy(reader.delimiter);
dwcFile.setFieldsEnclosedBy(reader.quoteChar);
// detect dwc terms as good as we can based on header row
String[] headers = reader.header;
int index = 0;
for (String head : headers) {
// there are never any quotes in term names - remove them just in case the csvreader didnt recognize them
if (head != null && head.length() > 1) {
Term dt = TERM_FACTORY.findTerm(head);
if (dt != null) {
ArchiveField field = new ArchiveField(index, dt, null, DataType.string);
if (dwcFile.getId() == null &&
(dt.equals(DwcTerm.occurrenceID) || dt.equals(DwcTerm.taxonID) || dt.equals(DcTerm.identifier))) {
dwcFile.setId(field);
} else {
dwcFile.addField(field);
}
}
}
index++;
}
return dwcFile;
}
private static void readMetaDescriptor(Archive archive, InputStream metaDescriptor, boolean normaliseTerms)
throws UnsupportedArchiveException {
try {
SAXParser p = SAX_FACTORY.newSAXParser();
MetaHandler mh = new MetaHandler(archive);
LOG.debug("Reading archive metadata file");
// p.parse(metaDescriptor, mh);
p.parse(new BomSafeInputStreamWrapper(metaDescriptor), mh);
} catch (Exception e1) {
LOG.warn("Exception caught", e1);
throw new UnsupportedArchiveException(e1);
}
}
private static void validateArchive(Archive archive) throws UnsupportedArchiveException {
validateCoreFile(archive.getCore(), !archive.getExtensions().isEmpty());
for (ArchiveFile af : archive.getExtensions()) {
validateExtensionFile(af);
}
}
private static void validateCoreFile(ArchiveFile f, boolean hasExtensions) throws UnsupportedArchiveException {
if (hasExtensions) {
if (f.getId() == null) {
LOG.warn(
"DwC-A core data file " + f.getTitle() + " is lacking an id column. No extensions allowed in this case");
}
}
validateFile(f);
}
private static void validateExtensionFile(ArchiveFile f) throws UnsupportedArchiveException {
if (f.getId() == null) {
throw new UnsupportedArchiveException(
"DwC-A data file " + f.getTitle() + " requires an id or foreign key to the core id");
}
validateFile(f);
}
private static void validateFile(ArchiveFile f) throws UnsupportedArchiveException {
if (f == null) {
throw new UnsupportedArchiveException("DwC-A data file is NULL");
}
if (f.getLocationFile() == null) {
throw new UnsupportedArchiveException("DwC-A data file " + f.getTitle() + " requires a location");
}
if (f.getEncoding() == null) {
throw new UnsupportedArchiveException("DwC-A data file " + f.getTitle() + " requires a character encoding");
}
}
}