package org.gbif.dwca.io;
/*
* Copyright 2011 Global Biodiversity Information Facility (GBIF)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.gbif.api.model.registry.Dataset;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwca.record.Record;
import org.gbif.io.TabWriter;
import org.gbif.registry.metadata.EMLWriter;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import javax.annotation.Nullable;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Simple writer class to create valid dwc archives using tab data files.
 * The meta.xml descriptor is generated automatically and an optional EML metadata document can be added.
 * The archive is NOT compressed but the final product is a directory with all the necessary files.
 * For usage examples of this class see {@code DwcaWriterTest}.
 */
public class DwcaWriter {
  // SLF4J convention: one static final logger per class
  private static final Logger LOG = LoggerFactory.getLogger(DwcaWriter.class);

  // directory the archive (data files, meta.xml, eml.xml) is written into
  private final File dir;
  // if true, every data file starts with a header row
  private final boolean useHeaders;
  // number of core records started via newRecord()
  private long recordNum;
  // id of the current, not yet flushed core record
  private String coreId;
  // column values of the current core record; null until newRecord() was called
  private Map<Term, String> coreRow;
  private final Term coreRowType;
  // optional term for the core id column; may be null
  private final Term coreIdTerm;
  // key=rowType, value=tab writer for that rowType's data file
  private final Map<Term, TabWriter> writers = Maps.newHashMap();
  // rowTypes for which a header row has already been written
  private final Set<Term> headersOut = Sets.newHashSet();
  // key=rowType, value=data file name within the archive
  private final Map<Term, String> dataFileNames = Maps.newHashMap();
  // key=rowType, value=columns
  private final Map<Term, List<Term>> terms = Maps.newHashMap();
  // key=rowType, value=default values per column
  private final Map<Term, Map<Term, String>> defaultValues = Maps.newHashMap();
  private Dataset eml;
  // key=datasetID, value=constituent dataset metadata
  private Map<String, Dataset> constituents = Maps.newHashMap();

  /**
   * Creates a new writer without header rows.
   *
   * @param coreRowType the core row type.
   * @param dir the directory to create the archive in.
   */
  public DwcaWriter(Term coreRowType, File dir) throws IOException {
    this(coreRowType, dir, false);
  }

  /**
   * If headers are used the first record must include all terms ever used for that file.
   * If in subsequent rows additional terms are introduced an IllegalArgumentException is thrown.
   *
   * @param coreRowType the core row type
   * @param dir the directory to create the archive in
   * @param useHeaders if true the first row in every data file will include headers
   */
  public DwcaWriter(Term coreRowType, File dir, boolean useHeaders) throws IOException {
    this(coreRowType, null, dir, useHeaders);
  }

  /**
   * If headers are used the first record must include all terms ever used for that file.
   * If in subsequent rows additional terms are introduced an IllegalArgumentException is thrown.
   *
   * @param coreRowType the core row type
   * @param coreIdTerm the term of the id column, may be null
   * @param dir the directory to create the archive in
   * @param useHeaders if true the first row in every data file will include headers
   */
  public DwcaWriter(Term coreRowType, Term coreIdTerm, File dir, boolean useHeaders) throws IOException {
    this.dir = dir;
    this.coreRowType = coreRowType;
    this.coreIdTerm = coreIdTerm;
    this.useHeaders = useHeaders;
    addRowType(coreRowType);
  }

  /**
   * Extracts all values of a record into a map keyed by the terms declared in the given archive file.
   *
   * @param rec record to read values from
   * @param af archive file declaring the terms of interest
   * @return new map of term to (possibly null) record value
   */
  public static Map<Term, String> recordToMap(Record rec, ArchiveFile af) {
    Map<Term, String> map = new HashMap<Term, String>();
    for (Term t : af.getTerms()) {
      map.put(t, rec.value(t));
    }
    return map;
  }

  /**
   * @return the standard data file name for the given rowType, e.g. taxon.txt for DwcTerm.Taxon
   */
  public static String dataFileName(Term rowType) {
    // Locale.ROOT keeps file names stable regardless of the JVM default locale
    // (e.g. Turkish locale lowercases I to a dotless i)
    return rowType.simpleName().toLowerCase(Locale.ROOT) + ".txt";
  }

  /**
   * Registers a new rowType, creating its data file and tab writer.
   */
  private void addRowType(Term rowType) throws IOException {
    terms.put(rowType, new ArrayList<Term>());

    String dfn = dataFileName(rowType);
    dataFileNames.put(rowType, dfn);
    File df = new File(dir, dfn);
    FileUtils.forceMkdir(df.getParentFile());
    OutputStream out = new FileOutputStream(df);
    TabWriter wr = new TabWriter(out);
    writers.put(rowType, wr);
  }

  /**
   * A new core record is started and the last core and all extension records are written.
   *
   * @param id the new records id
   * @throws IOException
   */
  public void newRecord(String id) throws IOException {
    // flush last record
    flushLastCoreRecord();
    // start new
    recordNum++;
    coreId = id;
    coreRow = new HashMap<Term, String>();
  }

  /**
   * Writes the pending core record, if any, and clears it so a second call is a no-op.
   */
  private void flushLastCoreRecord() throws IOException {
    if (coreRow != null) {
      writeRow(coreRow, coreRowType);
      // prevent the last row being written twice if close() is invoked more than once
      coreRow = null;
    }
  }

  /**
   * @return number of core records started so far via newRecord()
   */
  public long getRecordsWritten() {
    return recordNum;
  }

  /**
   * Writes a single row for the given rowType, emitting the header row first if needed.
   * Extension rows without a current coreId are skipped with a warning.
   */
  private void writeRow(Map<Term, String> rowMap, Term rowType) throws IOException {
    TabWriter writer = writers.get(rowType);
    List<Term> columns = terms.get(rowType);
    if (useHeaders && !headersOut.contains(rowType)) {
      // write header row once per data file
      writeHeader(writer, rowType, columns);
    }

    // make sure coreId is not null for extensions
    // use equals, not reference identity: Term implementations are not guaranteed to be canonical
    if (!coreRowType.equals(rowType) && coreId == null) {
      LOG.warn("Adding an {} extension record to a core without an Id! Skip this record", rowType);
    } else {
      // index 0 is reserved for the coreId; data columns start at 1
      String[] row = new String[columns.size() + 1];
      row[0] = coreId;
      for (Map.Entry<Term, String> conceptTermStringEntry : rowMap.entrySet()) {
        int column = 1 + columns.indexOf(conceptTermStringEntry.getKey());
        row[column] = conceptTermStringEntry.getValue();
      }
      writer.write(row);
    }
  }

  /**
   * Writes the header row for a data file. Column 0 is named after the id term matching the
   * core row type (falling back to dc:identifier); for extension files it still refers to the coreId.
   */
  private void writeHeader(TabWriter writer, Term rowType, List<Term> columns) throws IOException {
    int idx = 0;
    String[] row = new String[columns.size() + 1];
    Term idTerm;
    if (DwcTerm.Taxon == coreRowType) {
      idTerm = DwcTerm.taxonID;
    } else if (DwcTerm.Occurrence == coreRowType) {
      idTerm = DwcTerm.occurrenceID;
    } else if (DwcTerm.Identification == coreRowType) {
      idTerm = DwcTerm.identificationID;
    } else if (DwcTerm.Event == coreRowType) {
      idTerm = DwcTerm.eventID;
    } else {
      // default to generic dc identifier for id column
      idTerm = DcTerm.identifier;
    }
    row[idx] = idTerm.simpleName();

    for (Term term : columns) {
      idx++;
      row[idx] = term.simpleName();
    }
    writer.write(row);

    headersOut.add(rowType);
  }

  /**
   * Add a single value for the current core record.
   * Calling this method requires that #newRecord() has been called at least once,
   * otherwise an IllegalStateException is thrown.
   *
   * @param term term the value belongs to
   * @param value value, may be null
   * @throws IllegalStateException if no record was started, the term is the coreId term,
   *         or a new term appears after the first row while headers are enabled
   */
  public void addCoreColumn(Term term, String value) {
    // ensure we do not overwrite the coreIdTerm if one is defined
    if (coreIdTerm != null && coreIdTerm.equals(term)) {
      throw new IllegalStateException("You cannot add a term that was specified as coreId term");
    }
    // explicit state check instead of catching NullPointerException as control flow;
    // this also avoids registering the term before failing
    if (coreRow == null) {
      throw new IllegalStateException("No core record has been created yet. Call newRecord() at least once");
    }

    List<Term> coreTerms = terms.get(coreRowType);
    if (!coreTerms.contains(term)) {
      if (useHeaders && recordNum > 1) {
        throw new IllegalStateException("You cannot add new terms after the first row when headers are enabled");
      }
      coreTerms.add(term);
    }
    coreRow.put(term, value);
  }

  /**
   * Convenience method to add an empty core column.
   */
  public void addCoreColumn(Term term) {
    addCoreColumn(term, (String) null);
  }

  /**
   * Null safe convenience method to write integers.
   * See addCoreColumn(Term term, String value) for docs
   */
  public void addCoreColumn(Term term, @Nullable Integer value) {
    addCoreColumn(term, value == null ? null : value.toString());
  }

  /**
   * Null safe convenience method to write booleans.
   * See addCoreColumn(Term term, String value) for docs
   */
  public void addCoreColumn(Term term, @Nullable Boolean value) {
    addCoreColumn(term, value == null ? null : value.toString());
  }

  /**
   * Null safe convenience method to write enumeration values.
   * Writes the lowercased enum name with underscores replaced by spaces.
   * See addCoreColumn(Term term, String value) for docs
   */
  public void addCoreColumn(Term term, @Nullable Enum<?> value) {
    addCoreColumn(term, value == null ? null : value.name().toLowerCase().replaceAll("_", " "));
  }

  /**
   * Null safe convenience method to write object values using the toString method.
   * See addCoreColumn(Term term, String value) for docs
   */
  public void addCoreColumn(Term term, @Nullable Object value) {
    addCoreColumn(term, value == null ? null : value.toString());
  }

  /**
   * Add a default value to a term of the core.
   *
   * @param term term the default applies to
   * @param defaultValue the default value
   */
  public void addCoreDefaultValue(Term term, String defaultValue) {
    addDefaultValue(coreRowType, term, defaultValue);
  }

  /**
   * Add a default value to a term of the provided rowType.
   *
   * @param rowType rowType the default applies to
   * @param term term the default applies to
   * @param defaultValue the default value
   * @throws IllegalStateException if a default was already defined for this term
   */
  public void addDefaultValue(Term rowType, Term term, String defaultValue) {
    if (!defaultValues.containsKey(rowType)) {
      defaultValues.put(rowType, new HashMap<Term, String>());
    }
    Map<Term, String> currentDefaultValues = defaultValues.get(rowType);
    if (currentDefaultValues.containsKey(term)) {
      throw new IllegalStateException("The default value of term " + term + " is already defined");
    }
    currentDefaultValues.put(term, defaultValue);
  }

  /**
   * @return new map of all current data file names by their rowTypes.
   */
  public Map<Term, String> getDataFiles() {
    return Maps.newHashMap(dataFileNames);
  }

  /**
   * Add an extension record associated with the current core record.
   *
   * @param rowType rowType of the extension
   * @param row map of term to value for this record
   * @throws IOException if the record cannot be written
   * @throws IllegalStateException if a new term appears after the first row while headers are enabled
   */
  public void addExtensionRecord(Term rowType, Map<Term, String> row) throws IOException {
    // make sure we know the extension rowtype
    if (!terms.containsKey(rowType)) {
      addRowType(rowType);
    }

    // make sure we know all terms
    List<Term> knownTerms = terms.get(rowType);
    final boolean isFirst = knownTerms.isEmpty();
    for (Term term : row.keySet()) {
      if (!knownTerms.contains(term)) {
        if (useHeaders && !isFirst) {
          throw new IllegalStateException("You cannot add new terms after the first row when headers are enabled");
        }
        knownTerms.add(term);
      }
    }

    // write extension record
    writeRow(row, rowType);
  }

  /**
   * Sets the archive level EML metadata, written as eml.xml on close().
   */
  public void setEml(Dataset eml) {
    this.eml = eml;
  }

  /**
   * Adds a constituent dataset using the dataset key as the datasetID
   */
  public void addConstituent(Dataset eml) {
    addConstituent(eml.getKey().toString(), eml);
  }

  /**
   * Adds a constituent dataset.
   * The eml file will be called as the datasetID which has to be unique.
   */
  public void addConstituent(String datasetID, Dataset eml) {
    this.constituents.put(datasetID, eml);
  }

  /**
   * @return the set of available rowTypes in this archive; a live view of the internal map
   */
  public Set<Term> getRowTypes() {
    return terms.keySet();
  }

  /**
   * @return the list of term columns as used for the given row type
   */
  public List<Term> getTerms(Term rowType) {
    if (terms.containsKey(rowType)) {
      return terms.get(rowType);
    }
    return Lists.newArrayList();
  }

  /**
   * Writes meta.xml and eml.xml to the archive and closes tab writers.
   * The tab writers are closed even if writing the metadata fails.
   */
  public void close() throws IOException {
    try {
      addEml();
      addConstituents();
      addMeta();
      // flush last record
      flushLastCoreRecord();
      // TODO: add missing columns in second iteration of data files
    } finally {
      // close writers, releasing the underlying file streams even on failure above
      for (TabWriter w : writers.values()) {
        w.close();
      }
    }
  }

  /**
   * Writes the dataset metadata as an EML file; does nothing if the dataset is null.
   * Explicitly uses UTF-8 to match the encoding declared in meta.xml.
   */
  protected static void writeEml(Dataset d, File f) throws IOException {
    if (d != null) {
      try (Writer writer = new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8)) {
        EMLWriter.newInstance().writeTo(d, writer);
      }
    }
  }

  private void addEml() throws IOException {
    writeEml(eml, new File(dir, "eml.xml"));
  }

  /**
   * Writes one EML file per constituent dataset into the constituent directory.
   */
  private void addConstituents() throws IOException {
    if (!constituents.isEmpty()) {
      File ddir = new File(dir, Archive.CONSTITUENT_DIR);
      // forceMkdir fails loudly instead of silently ignoring a failed mkdirs()
      FileUtils.forceMkdir(ddir);
      for (Map.Entry<String, Dataset> de : constituents.entrySet()) {
        writeEml(de.getValue(), new File(ddir, de.getKey() + ".xml"));
      }
    }
  }

  /**
   * Builds the archive descriptor for all data files and writes it as meta.xml.
   */
  private void addMeta() throws IOException {
    File metaFile = new File(dir, Archive.META_FN);

    Archive arch = new Archive();
    if (eml != null) {
      arch.setMetadataLocation("eml.xml");
    }
    arch.setCore(buildArchiveFile(arch, coreRowType, coreIdTerm));
    for (Term rowType : this.terms.keySet()) {
      if (!coreRowType.equals(rowType)) {
        arch.addExtension(buildArchiveFile(arch, rowType, null));
      }
    }
    MetaDescriptorWriter.writeMetaFile(metaFile, arch);
  }

  /**
   * Build an ArchiveFile for core or extension(s).
   *
   * @param archive archive the file belongs to
   * @param rowType rowType of the data file
   * @param idTerm the term of the id column, may be null
   * @return the archive file descriptor
   */
  private ArchiveFile buildArchiveFile(Archive archive, Term rowType, Term idTerm) {
    ArchiveFile af = ArchiveFile.buildTabFile();
    af.setArchive(archive);
    af.addLocation(dataFileNames.get(rowType));

    af.setEncoding("utf-8");
    af.setIgnoreHeaderLines(useHeaders ? 1 : 0);
    af.setRowType(rowType);

    ArchiveField id = new ArchiveField();
    id.setIndex(0);
    af.setId(id);

    // always use the index 0 for idTerm
    if (idTerm != null) {
      af.addField(buildArchiveField(0, idTerm));
    }

    Map<Term, String> termDefaultValueMap = defaultValues.get(rowType);
    List<Term> rowTypeTerms = terms.get(rowType);
    int idx = 0;
    String defaultValue;
    for (Term c : rowTypeTerms) {
      idx++;
      defaultValue = (termDefaultValueMap != null ? termDefaultValueMap.get(c) : null);
      af.addField(buildArchiveField(idx, c, defaultValue));
    }

    // check if default values are provided for this rowType
    if (termDefaultValueMap != null) {
      // defaults for terms that are not real columns become index-less fields
      for (Term t : termDefaultValueMap.keySet()) {
        if (!rowTypeTerms.contains(t)) {
          af.addField(buildArchiveFieldDefaultValue(t, termDefaultValueMap.get(t)));
        }
      }
    }
    return af;
  }

  /**
   * Build an ArchiveField with a defaultValue and no index.
   *
   * @param term term of the field, never null
   * @param defaultValue default value, never null
   * @return the archive field
   */
  private ArchiveField buildArchiveFieldDefaultValue(Term term, String defaultValue) {
    Preconditions.checkNotNull(term, "Can't use a null term");
    Preconditions.checkNotNull(defaultValue, "Can't use a null defaultValue");
    return new ArchiveField(term, defaultValue);
  }

  /**
   * Build an ArchiveField with no defaultValue.
   *
   * @param idx column index, never null
   * @param term term of the field, never null
   * @return the archive field
   */
  private ArchiveField buildArchiveField(Integer idx, Term term) {
    return buildArchiveField(idx, term, null);
  }

  /**
   * Build an ArchiveField from optional parameters.
   *
   * @param idx column index, never null
   * @param term term of the field, never null
   * @param defaultValue default value or null
   * @return the archive field
   */
  private ArchiveField buildArchiveField(Integer idx, Term term, String defaultValue) {
    Preconditions.checkNotNull(idx, "Can't use a null index");
    Preconditions.checkNotNull(term, "Can't use a null term");
    ArchiveField field = new ArchiveField(idx, term);
    if (StringUtils.isNotBlank(defaultValue)) {
      field.setDefaultValue(defaultValue);
    }
    return field;
  }
}