package org.gbif.dwca.io;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.gbif.api.model.registry.Dataset;
import org.gbif.dwc.terms.Term;
import org.gbif.io.TabWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.Map;
/**
* An archive writer that writes entire data files at once and does not check the integrity of core ids.
* For large archives using extensions this performs much better than writing star record by star record.
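* <p>
* A minimal usage sketch, assuming taxon data; the terms, directory and variable names below are illustrative:
* <pre>{@code
* Map<Term, Integer> mapping = new HashMap<>();
* mapping.put(DwcTerm.taxonID, 0);
* mapping.put(DwcTerm.scientificName, 1);
*
* // rows: an Iterable<String[]> holding the data, defined elsewhere
* try (DwcaStreamWriter writer = new DwcaStreamWriter(new File("/tmp/dwca"), DwcTerm.Taxon, DwcTerm.taxonID, true)) {
*   writer.write(DwcTerm.Taxon, 0, mapping, rows);
* }
* }</pre>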
*/
public class DwcaStreamWriter implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(DwcaStreamWriter.class);
private final File dir;
private final Term core;
private final Term coreIdTerm;
private final boolean useHeaders;
private final Archive archive = new Archive();
private Dataset metadata;
private final Map<String, Dataset> constituents = Maps.newHashMap();
/**
* @param dir the directory to use as the archive
* @param coreRowType the archive's core row type
* @param coreIdTerm if given, used to map the id column of the core file
* @param useHeaders if true, write a single header row for each data file
*/
public DwcaStreamWriter(File dir, Term coreRowType, @Nullable Term coreIdTerm, boolean useHeaders) {
this.dir = dir;
this.core = coreRowType;
this.coreIdTerm = coreIdTerm;
this.useHeaders = useHeaders;
archive.setLocation(dir);
}
private File dataFile(Term rowType) {
return new File(dir, rowType.simpleName()+".tsv");
}
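/**
* Creates the archive field used as the id column, mapped only by its column index.
*/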
private static ArchiveField idField(int column) {
ArchiveField field = new ArchiveField();
field.setIndex(column);
return field;
}
/**
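* Writes a complete data file for the given row type at once.
* <p>
* A sketch for writing an extension file; the terms and variable names are illustrative,
* and column 0 is expected to carry the core id:
* <pre>{@code
* Map<Term, Integer> extMapping = new HashMap<>();
* extMapping.put(DwcTerm.vernacularName, 1);
* writer.write(GbifTerm.VernacularName, 0, extMapping, vernacularRows);
* }</pre>
*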
* @param rowType the row type of the data file to write
* @param coreIdColumn zero-based index of the column holding the core id
* @param mapping zero-based column index for each term in the rows
* @param rows the complete data to write, one string array per row
*/
public void write(Term rowType, int coreIdColumn, Map<Term, Integer> mapping, Iterable<String[]> rows) {
Preconditions.checkNotNull(rows);
final int maxMapping = maxMappingColumn(mapping);
try (TabWriter writer = addArchiveFile(rowType, coreIdColumn, mapping, maxMapping) ) {
// write data
for (String[] row : rows) {
write(writer, row, maxMapping);
}
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
private static void write(TabWriter writer, String[] row, int maxMapping) throws IOException {
// a row needs at least maxMapping + 1 columns to cover the highest mapped index
if (row != null && row.length <= maxMapping) {
throw new IllegalArgumentException("Input rows are smaller than the defined mapping of " + (maxMapping + 1) + " columns.");
}
writer.write(row);
}
private int maxMappingColumn(Map<Term, Integer> mapping) {
return mapping.values().stream().max(Integer::compareTo)
.orElseThrow(() -> new IllegalArgumentException("Mapping must not be empty"));
}
private TabWriter addArchiveFile(Term rowType, int coreIdColumn, Map<Term, Integer> mapping, int maxMapping) throws IOException {
Preconditions.checkNotNull(rowType);
Preconditions.checkNotNull(mapping);
Preconditions.checkArgument(!mapping.isEmpty());
Preconditions.checkArgument(coreIdColumn >= 0);
final File dataFile = dataFile(rowType);
ArchiveFile af = ArchiveFile.buildTabFile();
af.setEncoding("UTF-8");
af.setRowType(rowType);
af.addLocation(dataFile.getName());
af.setIgnoreHeaderLines(useHeaders ? 1 : 0);
af.setId(idField(coreIdColumn));
for (Map.Entry<Term, Integer> entry : mapping.entrySet()) {
ArchiveField field = new ArchiveField();
field.setTerm(entry.getKey());
field.setIndex(entry.getValue());
af.addField(field);
}
if (core.equals(rowType)) {
if (coreIdTerm != null) {
af.getId().setTerm(coreIdTerm);
}
archive.setCore(af);
} else {
archive.addExtension(af);
}
// write headers
TabWriter writer = TabWriter.fromFile(dataFile);
if (useHeaders) {
String[] header = new String[maxMapping + 1];
mapping.forEach((term, idx) -> header[idx] = term.simpleName());
// label the id column, but only in the core file where the coreIdTerm applies
if (coreIdTerm != null && core.equals(rowType)) {
header[coreIdColumn] = coreIdTerm.simpleName();
}
writer.write(header);
}
return writer;
}
/**
* A handler accepting single rows to be written into one data file of the archive.
*/
public interface RowWriteHandler extends AutoCloseable {
void write(String[] row);
@Override
void close() throws IOException;
}
private class RowWriteHandlerImpl implements RowWriteHandler {
private final TabWriter writer;
private final int minColumns;
RowWriteHandlerImpl(TabWriter writer, int minColumns) {
this.writer = writer;
this.minColumns = minColumns;
}
@Override
public void write(String[] row) {
if (row != null && row.length < minColumns) {
throw new IllegalArgumentException("Input rows are smaller than the defined mapping of " + minColumns + " columns.");
}
try {
writer.write(row);
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
@Override
public void close() throws IOException {
writer.close();
}
}
/**
* Useful for gluing MyBatis result handlers directly into the dwca writer.
* Make sure to close the returned handler properly!
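* <p>
* A minimal usage sketch; the terms and values are illustrative:
* <pre>{@code
* try (RowWriteHandler handler = writer.writeHandler(DwcTerm.Occurrence, 0, mapping)) {
*   handler.write(new String[]{"occ-1", "Puma concolor"});
* }
* }</pre>
*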
* @param rowType the row type of the data file to write
* @param coreIdColumn zero-based index of the column holding the core id
* @param mapping zero-based column index for each term in the rows
* @return a handler accepting single rows to write into the data file
*/
public RowWriteHandler writeHandler(Term rowType, int coreIdColumn, Map<Term, Integer> mapping) {
try {
final int maxMapping = maxMappingColumn(mapping);
TabWriter writer = addArchiveFile(rowType, coreIdColumn, mapping, maxMapping);
// rows must at least cover the highest mapped column index, hence maxMapping + 1 columns
return new RowWriteHandlerImpl(writer, maxMapping + 1);
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
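/**
* Sets the dataset metadata to be written as eml.xml when the archive is closed.
*/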
public void setMetadata(Dataset d) {
metadata = d;
}
/**
* Adds a constituent dataset using the dataset key as the datasetID
*/
public void addConstituent(Dataset eml) {
addConstituent(eml.getKey().toString(), eml);
}
/**
* Adds a constituent dataset.
* The EML file will be named after the datasetID, which therefore has to be unique.
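* <p>
* <pre>{@code
* // illustrative: the file is written under the constituent directory when the archive is closed
* writer.addConstituent("my-dataset-id", constituentDataset);
* }</pre>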
*/
public void addConstituent(String datasetID, Dataset eml) {
this.constituents.put(datasetID, eml);
}
/**
* Writes meta.xml and eml.xml to the archive.
*/
@Override
public void close() throws IOException {
checkCoreRowType();
addEml();
addConstituents();
MetaDescriptorWriter.writeMetaFile(archive);
LOG.info("Wrote archive to {}", archive.getLocation().getAbsolutePath());
}
/**
* Checks that the core data file has been written.
*/
private void checkCoreRowType() {
if (archive.getCore() == null) {
throw new IllegalStateException("The core data file has not yet been written for " + core.qualifiedName());
}
}
private void addEml() throws IOException {
if (metadata != null) {
DwcaWriter.writeEml(metadata, new File(dir, "eml.xml"));
archive.setMetadataLocation("eml.xml");
}
}
private void addConstituents() throws IOException {
if (!constituents.isEmpty()) {
File ddir = new File(dir, Archive.CONSTITUENT_DIR);
if (!ddir.exists() && !ddir.mkdirs()) {
throw new IOException("Failed to create constituent directory " + ddir.getAbsolutePath());
}
for (Map.Entry<String, Dataset> de : constituents.entrySet()) {
DwcaWriter.writeEml(de.getValue(), new File(ddir, de.getKey()+".xml"));
}
}
}
}