package org.gbif.dwca.io;
import org.gbif.api.model.registry.Dataset;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwca.record.DarwinCoreRecord;
import org.gbif.dwca.record.Record;
import org.gbif.dwca.record.RecordImpl;
import org.gbif.dwca.record.RecordIterator;
import org.gbif.dwca.record.StarRecord;
import org.gbif.dwca.record.StarRecordImpl;
import org.gbif.registry.metadata.parse.DatasetParser;
import org.gbif.utils.file.ClosableIterator;
import org.gbif.utils.file.FileUtils;
import org.gbif.utils.file.csv.CSVReader;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import com.google.common.base.Strings;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.collect.PeekingIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A Darwin Core star archive allowing easy reading and iteration over a core record with all its extensions.
*
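* <p>A minimal usage sketch (the {@code archive} instance is assumed to have been opened elsewhere, e.g. by an
* archive factory; the term access is illustrative):
* <pre>{@code
* for (StarRecord star : archive) {
*   String sciName = star.core().value(DwcTerm.scientificName);
* }
* }</pre>
* Note that a for-each loop cannot close the underlying iterator; use {@link #iterator()} directly when explicit
* closing is needed.
*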
* @see <a href="http://tdwg.github.io/dwc/terms/guides/text/">Darwin Core Text Guide</a>
*/
public class Archive implements Iterable<StarRecord> {
public static final String CONSTITUENT_DIR = "dataset";
public static final String META_FN = "meta.xml";
/**
* An iterator of DarwinCoreRecords over the core file only, exposing the fixed set of Darwin Core and Dublin Core
* terms mapped in the core file. It doesn't need any sorted data files as it doesn't deal with extensions.
*/
static class ArchiveDwcIterator implements ClosableIterator<DarwinCoreRecord> {
private CSVReader coreReader;
private ArchiveFile core;
private int lineCount = 0;
private final RecordImpl record;
private boolean hasNext = true;
private final Set<Term> mappedTerms = new HashSet<Term>();
ArchiveDwcIterator(Archive archive) {
record = new RecordImpl(archive.getCore(), true, true);
core = archive.getCore();
// remember used DwC and DC terms
for (DwcTerm term : DwcTerm.values()) {
if (core.hasTerm(term)) {
mappedTerms.add(term);
}
}
for (DcTerm term : DcTerm.values()) {
if (core.hasTerm(term)) {
mappedTerms.add(term);
}
}
try {
coreReader = archive.getCore().getCSVReader();
// read first core row
record.setRow(coreReader.next());
if (!record.hasRow()) {
hasNext = false;
}
} catch (Exception e) {
hasNext = false;
LOG.warn("Exception caught", e);
}
}
public void close() {
// the reader can be null if the constructor failed to open the core file
if (coreReader != null) {
coreReader.close();
}
}
public boolean hasNext() {
return hasNext;
}
public DarwinCoreRecord next() {
if (!hasNext) {
throw new NoSuchElementException();
}
DarwinCoreRecord dwc = new DarwinCoreRecord();
lineCount++;
try {
for (Term term : mappedTerms) {
dwc.setProperty(term, record.value(term));
}
dwc.setId(record.id());
// read next line to see if it exists at all
record.setRow(coreReader.next());
if (!record.hasRow()) {
hasNext = false;
}
} catch (Exception e) {
LOG.warn("Bad row somewhere around core line: {}", lineCount, e);
}
return dwc;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove a row from archive files");
}
}
/**
* An iterator over core records of the archive that returns StarRecords, i.e. a single core record with all its
* related extension records attached. This is a convenient way to iterate over an entire archive accessing all
* information including all extensions.
*
* Requires the underlying data files to be sorted by the coreid column if iteration spans multiple data files,
* i.e. if extensions exist.
* Extension rows with a non-existing coreid are skipped. This requires the Java code to use the exact same sort
* order as was used for sorting the data files!
*/
class ArchiveIterator implements ClosableIterator<StarRecord> {
private final StarRecordImpl rec;
private RecordIterator coreIter;
private Set<RecordIterator> closables = new HashSet<RecordIterator>();
private Map<Term, PeekingIterator<Record>> extensionIters = new HashMap<Term, PeekingIterator<Record>>();
private Map<Term, Integer> extensionRecordsSkipped = new HashMap<Term, Integer>();
/**
* @param replaceNulls    if true replaces common literal null values in all records
* @param replaceEntities if true replaces html entities in all records
*/
ArchiveIterator(Archive archive, boolean replaceNulls, boolean replaceEntities) {
List<Term> rowTypes = new ArrayList<Term>();
try {
if (extensions.isEmpty()) {
// no need to sort
coreIter = RecordIterator.build(archive.getCore(), replaceNulls, replaceEntities);
} else {
// sort data files to align extension records into a single star record
if (!archive.sorted) {
archive.sortFiles();
}
coreIter = buildSortedIterator(archive.getCore(), replaceNulls, replaceEntities);
}
} catch (IOException e) {
LOG.warn("IOException opening core file", e);
}
for (ArchiveFile af : archive.getExtensions()) {
rowTypes.add(af.getRowType());
RecordIterator iter = extensions.isEmpty()
? RecordIterator.build(af, replaceNulls, replaceEntities)
: buildSortedIterator(af, replaceNulls, replaceEntities);
closables.add(iter);
extensionIters.put(af.getRowType(), Iterators.peekingIterator(iter));
extensionRecordsSkipped.put(af.getRowType(), 0);
}
rec = new StarRecordImpl(rowTypes);
}
private RecordIterator buildSortedIterator(ArchiveFile af, boolean replaceNulls, boolean replaceEntities) {
// we need to sort the data files
String original = af.getLocation();
// temporarily modify archive file to create iterator over sorted file
af.getLocations().clear();
af.addLocation(ArchiveFile.getLocationSorted(original));
RecordIterator iter = RecordIterator.build(af, replaceNulls, replaceEntities);
// revert to original
af.getLocations().clear();
af.addLocation(original);
return iter;
}
@Override
public void close() {
coreIter.close();
for (ClosableIterator<Record> it : closables) {
try {
it.close();
} catch (Exception e) {
LOG.debug("Can't close ClosableIterator", e);
}
}
for (Map.Entry<Term, Integer> entry : extensionRecordsSkipped.entrySet()) {
Integer skipped = entry.getValue();
if (skipped > 0) {
LOG.debug("{} {} extension records without matching core", skipped, entry.getKey());
}
}
}
public boolean hasNext() {
return coreIter.hasNext();
}
public StarRecord next() {
Record core = coreIter.next();
rec.newCoreRecord(core);
// add extension records if core id exists
if (core.id() != null) {
String id = core.id();
for (Map.Entry<Term, PeekingIterator<Record>> ext : extensionIters.entrySet()) {
PeekingIterator<Record> it = ext.getValue();
Term rowType = ext.getKey();
while (it.hasNext()) {
String extId = it.peek().id();
// make sure we have an extid
if (Strings.isNullOrEmpty(extId)) {
it.next();
continue;
}
if (id.equals(extId)) {
// extension row belongs to this core record
rec.addRecord(rowType, it.next());
} else if (id.compareTo(extId) > 0) {
// TODO: we need to use the exact same sorting order, ie comparator, as we use for sorting the data files!!!
// this extension id is smaller than the core id and should have been picked up by a core record already
// seems to have no matching core record, so lets skip it
it.next();
extensionRecordsSkipped.put(rowType, extensionRecordsSkipped.get(rowType) + 1);
} else {
// higher id, we need to wait for this one
break;
}
}
}
}
return rec;
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove a row from archive files");
}
}
private static final Logger LOG = LoggerFactory.getLogger(Archive.class);
private String metadataLocation;
private Dataset metadata;
private File location;
private ArchiveFile core;
private Set<ArchiveFile> extensions = new HashSet<ArchiveFile>();
private boolean sorted = false;
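/**
* Adds an extension data file to this archive, setting the extension's back-reference to this archive.
*/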
public void addExtension(ArchiveFile extension) {
extension.setArchive(this);
extensions.add(extension);
}
public ArchiveFile getCore() {
return core;
}
/**
* Gets an extension by its rowType.
*
* @param rowType the term identifying the extension's row type
* @return the matching ArchiveFile or {@code null} if not found
*/
public ArchiveFile getExtension(Term rowType) {
for (ArchiveFile af : extensions) {
if (af.getRowType() != null && af.getRowType().equals(rowType)) {
return af;
}
}
return null;
}
public Set<ArchiveFile> getExtensions() {
return extensions;
}
public File getLocation() {
return location;
}
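/**
* Returns the dataset metadata, lazily parsing it from the metadata location on first access. The location is
* resolved first as a file inside the archive and otherwise interpreted as a URL.
*
* @throws MetadataException if the metadata cannot be read or parsed
*/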
public Dataset getMetadata() throws MetadataException {
if (metadata == null) {
File mf = getMetadataLocationFile();
InputStream stream = null;
try {
if (mf != null && mf.exists()) {
stream = FileUtils.getInputStream(mf);
} else {
// try as url
URL url = new URL(metadataLocation);
stream = url.openStream();
}
metadata = DatasetParser.build(stream);
} catch (IOException e) {
throw new MetadataException(e);
} catch (RuntimeException e) {
throw new MetadataException(e);
} finally {
// make sure the metadata stream is closed
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
LOG.debug("Could not close metadata stream", e);
}
}
}
}
return metadata;
}
public String getMetadataLocation() {
return metadataLocation;
}
public File getMetadataLocationFile() {
if (metadataLocation != null) {
return new File(location, metadataLocation);
}
return null;
}
/**
* Scans the archive for semi-standard support of dataset constituent metadata.
* A dataset constituent is a sub-dataset which is referenced via dwc:datasetID in the data.
* The convention, first introduced by the Catalogue of Life for their GSDs, is a folder named "dataset" that keeps
* a metadata file for each constituent, named after the datasetID and suffixed with .xml.
*
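* <p>A hypothetical layout (file and id names are illustrative):
* <pre>
* archive/
*   meta.xml
*   taxon.txt
*   dataset/
*     gsd-123.xml   metadata for the constituent with datasetID "gsd-123"
* </pre>
*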
* @return map of constituent datasetID to metadata file inside the archive
*/
public Map<String, File> getConstituentMetadata() {
Map<String, File> constituents = Maps.newHashMap();
File constDir = new File(location, CONSTITUENT_DIR);
if (constDir.exists()) {
File[] files = constDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String filename) {
return filename.endsWith(".xml");
}
});
if (files != null) {
for (File cf : files) {
// strip only the .xml suffix; splitting on the first dot would truncate datasetIDs containing dots
String name = cf.getName().substring(0, cf.getName().lastIndexOf('.'));
constituents.put(name, cf);
}
}
}
return constituents;
}
/**
* @return a complete iterator over star records, i.e. core records with all their extension records attached,
* replacing literal null values and html entities. The same StarRecord instance is returned by every call to
* {@code next()}, so copy any values you need to keep before advancing the iterator.
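*
* <p>A minimal consumption sketch ({@code archive} is this instance; the extension row type is illustrative):
* <pre>{@code
* ClosableIterator<StarRecord> iter = archive.iterator();
* try {
*   while (iter.hasNext()) {
*     StarRecord star = iter.next();
*     String sciName = star.core().value(DwcTerm.scientificName);
*     List<Record> measurements = star.extension(DwcTerm.MeasurementOrFact);
*   }
* } finally {
*   iter.close();
* }
* }</pre>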
*/
public ClosableIterator<StarRecord> iterator() {
return new ArchiveIterator(this, true, true);
}
/**
* @return a complete iterator over star records that does not replace literal null values or html entities.
* As with {@link #iterator()}, the same StarRecord instance is reused between calls.
*/
public ClosableIterator<StarRecord> iteratorRaw() {
return new ArchiveIterator(this, false, false);
}
/**
* @return an iterator over simple Darwin Core records built from the core data file(s) only. A new
* DarwinCoreRecord instance is created for each row.
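*
* <p>A minimal sketch ({@code archive} is this instance; the getter stands in for the bean-style accessors on
* DarwinCoreRecord):
* <pre>{@code
* ClosableIterator<DarwinCoreRecord> iter = archive.iteratorDwc();
* try {
*   while (iter.hasNext()) {
*     DarwinCoreRecord dwc = iter.next();
*     String name = dwc.getScientificName();
*   }
* } finally {
*   iter.close();
* }
* }</pre>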
*/
public ClosableIterator<DarwinCoreRecord> iteratorDwc() {
return new ArchiveDwcIterator(this);
}
public void setCore(ArchiveFile core) {
core.setArchive(this);
this.core = core;
}
public void setExtensions(Set<ArchiveFile> extensions) {
this.extensions = extensions;
}
public void setLocation(File location) {
this.location = location;
}
public void setMetadataLocation(String metadataLocation) {
this.metadataLocation = metadataLocation;
}
/**
* Sorts all data files by their id column so that core and extension rows can be merged in a single pass over
* all files at once.
*/
private void sortFiles() throws IOException {
FileUtils futil = new FileUtils();
// core
try {
futil.sort(core.getLocationFile(), ArchiveFile.getLocationFileSorted(core.getLocationFile()), core.getEncoding(),
core.getId().getIndex(), core.getFieldsTerminatedBy(), core.getFieldsEnclosedBy(), core.getLinesTerminatedBy(),
core.getIgnoreHeaderLines());
} catch (IOException e) {
LOG.error("Error sorting core file {}: {}", core.getLocationFile(), e.getMessage());
throw e;
} catch (RuntimeException e) {
LOG.error("Error sorting core file {}: {}", core.getLocationFile(), e.getMessage());
throw e;
}
// extensions
for (ArchiveFile ext : extensions) {
try {
futil.sort(ext.getLocationFile(), ArchiveFile.getLocationFileSorted(ext.getLocationFile()), ext.getEncoding(),
ext.getId().getIndex(), ext.getFieldsTerminatedBy(), ext.getFieldsEnclosedBy(), ext.getLinesTerminatedBy(),
ext.getIgnoreHeaderLines());
} catch (IOException e) {
LOG.error("Error sorting extension file {}: {}", ext.getLocationFile(), e.getMessage());
throw e;
} catch (RuntimeException e) {
LOG.error("Error sorting extension file {}: {}", ext.getLocationFile(), e.getMessage());
throw e;
}
}
sorted = true;
}
@Override
public String toString() {
return location == null ? "no archive file" : location.getAbsoluteFile().toString();
}
}