/*
* This is eMonocot, a global online biodiversity information resource.
*
* Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
*
* eMonocot is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* The complete text of the GNU Affero General Public License is in the source repository as the file
* ‘COPYING’. It is also available from <http://www.gnu.org/licenses/>.
*/
package org.emonocot.job.dwc.read;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.validation.ConstraintViolation;
import javax.validation.Validator;

import org.emonocot.api.OrganisationService;
import org.emonocot.api.job.SkosTerm;
import org.emonocot.model.registry.Organisation;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveFactory;
import org.gbif.dwc.text.ArchiveField;
import org.gbif.dwc.text.ArchiveFile;
import org.gbif.dwc.text.UnsupportedArchiveException;
import org.gbif.metadata.BasicMetadata;
import org.gbif.metadata.eml.Eml;
import org.gbif.metadata.eml.EmlFactory;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.core.ExitStatus;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.StepExecutionListener;
import org.springframework.batch.item.ExecutionContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.xml.sax.SAXException;
/**
 * Reads the metadata of an unpacked Darwin Core Archive (the meta.xml
 * describing the data files and, where present, the EML document describing
 * the dataset) before the data files themselves are processed. The layout of
 * the core file and of each recognised extension (location, delimiters,
 * encoding, header lines, column names, default values, record count) is
 * published into the job execution context under {@code dwca.&lt;prefix&gt;.*}
 * keys, and the stored metadata of the source organisation is updated from
 * the EML document.
 *
 * @author ben
 */
public class ArchiveMetadataReader implements StepExecutionListener {

    private static final Logger logger = LoggerFactory
            .getLogger(ArchiveMetadataReader.class);

    /** The currently executing step; captured in beforeStep(). */
    private StepExecution stepExecution;

    /** The identifier of the organisation whose archive is being read. */
    private String sourceName;

    /** Service used to look up and update the source organisation. */
    private OrganisationService organisationService;

    /** Bean validator applied before persisting metadata changes. */
    private Validator validator;

    /**
     * @param sourceService the sourceService to set
     */
    @Autowired
    public final void setSourceService(final OrganisationService sourceService) {
        this.organisationService = sourceService;
    }

    /**
     * @param validator the validator to set
     */
    @Autowired
    public final void setValidator(Validator validator) {
        this.validator = validator;
    }

    /**
     * Opens the archive unpacked in the given directory, updates the source
     * organisation from the archive's EML document (if any) and publishes the
     * layout of the core file and of each recognised extension into the job
     * execution context.
     *
     * @param archiveDirectoryName
     *            The directory where the DwC Archive has been unpacked to
     * @param sourceName Set the name of the source
     * @param metaErrorsFail whether a metadata/data mismatch should fail the
     *            step; any value other than "false" (case-insensitive,
     *            including null) means fail
     * @return An exit status indicating whether the step has been completed or
     *         failed
     */
    public final ExitStatus readMetadata(final String archiveDirectoryName,
            final String sourceName, String metaErrorsFail) {
        this.sourceName = sourceName;
        // Fail by default; only an explicit "false" disables failing on error
        boolean failOnError = !"false".equalsIgnoreCase(metaErrorsFail);
        try {
            File archiveDirectory = new File(archiveDirectoryName);
            File metaDir = getMetaDirectory(archiveDirectory);
            if (metaDir == null) {
                logger.error("Could not find metadata directory in " + archiveDirectoryName);
                return ExitStatus.FAILED;
            }
            Archive archive = ArchiveFactory.openArchive(metaDir);
            ArchiveFile core = archive.getCore();
            if (archive.getMetadataLocation() != null) {
                String metadataFileName = archiveDirectoryName + File.separator
                        + archive.getMetadataLocation();
                FileInputStream emlStream = new FileInputStream(metadataFileName);
                try {
                    Eml eml = EmlFactory.build(emlStream);
                    updateSourceMetadata(eml);
                } catch (SAXException e) {
                    // A malformed EML document should not prevent the data
                    // files themselves from being processed
                    logger.error(e.getMessage(), e);
                } finally {
                    emlStream.close();
                }
            }
            getMetadata(core, "core", DwcTerm.taxonID, failOnError);
            getExtensionMetadata(archive, failOnError);
        } catch (UnsupportedArchiveException uae) {
            logger.error("Unsupported Archive Exception reading "
                    + archiveDirectoryName + " " + uae.getLocalizedMessage(), uae);
            return ExitStatus.FAILED;
        } catch (IOException ioe) {
            logger.error("Input Output Exception reading " + archiveDirectoryName
                    + " " + ioe.getLocalizedMessage(), ioe);
            return ExitStatus.FAILED;
        }
        return ExitStatus.COMPLETED;
    }

    /**
     * Publishes the layout of each recognised extension present in the
     * archive into the job execution context; extensions which are absent
     * are skipped silently.
     *
     * @param archive the opened archive
     * @param failOnError Whether to fail if there is an error
     * @throws IOException if an extension data file cannot be read
     */
    private void getExtensionMetadata(final Archive archive,
            final boolean failOnError) throws IOException {
        // Row types and their execution-context prefixes, kept in the same
        // order the extensions were previously processed in
        Term[] rowTypes = new Term[] {
                GbifTerm.Description, GbifTerm.Distribution, GbifTerm.Image,
                GbifTerm.Reference, GbifTerm.Identifier,
                DwcTerm.MeasurementOrFact, GbifTerm.VernacularName,
                GbifTerm.TypesAndSpecimen, SkosTerm.Concept,
                GbifTerm.Multimedia };
        String[] prefixes = new String[] {
                "description", "distribution", "image", "reference",
                "identifier", "measurementOrFact", "vernacularName",
                "typeAndSpecimen", "term", "multimedia" };
        for (int i = 0; i < rowTypes.length; i++) {
            ArchiveFile extension = archive.getExtension(rowTypes[i]);
            if (extension != null) {
                getMetadata(extension, prefixes[i], DwcTerm.taxonID, failOnError);
            }
        }
    }

    /**
     * Recurses over a directory tree and returns the first enclosing
     * directory which directly contains a file with the suffix .xml
     * (expected to be the meta.xml describing the archive).
     *
     * @param archiveDirectory the directory to search
     * @return a directory or null if no *.xml file can be found
     */
    private File getMetaDirectory(File archiveDirectory) {
        File[] children = archiveDirectory.listFiles();
        if (children == null) {
            // Not a directory, or an I/O error occurred while listing it
            return null;
        }
        for (File f : children) {
            if (f.getName().endsWith(".xml")) {
                return archiveDirectory;
            } else if (f.isDirectory()) {
                File dir = getMetaDirectory(f);
                if (dir != null) {
                    return dir;
                }
            }
        }
        return null;
    }

    /**
     * Copies the archive's metadata onto the source organisation, persisting
     * it only if at least one property actually changed and the result passes
     * bean validation.
     *
     * @param basicMetadata
     *            Set the metadata
     */
    private void updateSourceMetadata(final BasicMetadata basicMetadata) {
        boolean update = false;
        Organisation source = organisationService.find(sourceName);
        if (!nullSafeEquals(source.getBibliographicCitation(),
                basicMetadata.getCitationString())) {
            source.setBibliographicCitation(basicMetadata.getCitationString());
            update = true;
        }
        if (!nullSafeEquals(source.getCreatorEmail(), basicMetadata.getCreatorEmail())) {
            source.setCreatorEmail(basicMetadata.getCreatorEmail());
            update = true;
        }
        if (!nullSafeEquals(source.getCreator(),
                basicMetadata.getCreatorName())) {
            source.setCreator(basicMetadata.getCreatorName());
            update = true;
        }
        if (!nullSafeEquals(source.getDescription(),
                basicMetadata.getDescription())) {
            source.setDescription(basicMetadata.getDescription());
            update = true;
        }
        if (!nullSafeEquals(source.getUri(),
                basicMetadata.getHomepageUrl())) {
            source.setUri(basicMetadata.getHomepageUrl());
            update = true;
        }
        if (!nullSafeEquals(source.getLogoUrl(),
                basicMetadata.getLogoUrl())) {
            source.setLogoUrl(basicMetadata.getLogoUrl());
            update = true;
        }
        if (!nullSafeEquals(source.getPublisherEmail(),
                basicMetadata.getPublisherEmail())) {
            source.setPublisherEmail(basicMetadata.getPublisherEmail());
            update = true;
        }
        if (!nullSafeEquals(source.getPublisherName(),
                basicMetadata.getPublisherName())) {
            source.setPublisherName(basicMetadata.getPublisherName());
            update = true;
        }
        if (!nullSafeEquals(source.getSubject(),
                basicMetadata.getSubject())) {
            source.setSubject(basicMetadata.getSubject());
            update = true;
        }
        if (!nullSafeEquals(source.getTitle(),
                basicMetadata.getTitle())) {
            source.setTitle(basicMetadata.getTitle());
            update = true;
        }
        if (!nullSafeEquals(source.getRights(),
                basicMetadata.getRights())) {
            source.setRights(basicMetadata.getRights());
            update = true;
        }
        if (basicMetadata.getPublished() != null) {
            DateTime published = new DateTime(basicMetadata.getPublished());
            if (source.getCreated() == null) {
                // First publication we have seen for this source
                source.setCreated(published);
                update = true;
            } else if (published.isAfter(source.getCreated())) {
                // Re-published since creation: record as a modification
                source.setModified(published);
                update = true;
            }
        }
        if (update) {
            // Only persist if the updated organisation is still valid
            Set<ConstraintViolation<Organisation>> violations = validator.validate(source);
            if (violations.isEmpty()) {
                logger.info("Updating metadata for source " + sourceName);
                organisationService.saveOrUpdate(source);
            } else {
                for (ConstraintViolation<Organisation> violation : violations) {
                    logger.error(violation.getMessage());
                }
            }
        }
    }

    /**
     * Null-tolerant string comparison.
     *
     * @param string1 Set the first string
     * @param string2 Set the second string
     * @return true if the strings are equal (or both null), false otherwise
     */
    private boolean nullSafeEquals(final String string1, final String string2) {
        if (string1 == null) {
            return string1 == string2;
        } else {
            return string1.equals(string2);
        }
    }

    /**
     * Publishes the layout of a single archive data file into the job
     * execution context under {@code dwca.&lt;prefix&gt;.*} keys, checking
     * that the file has at least as many columns as the metadata declares.
     *
     * @param archiveFile
     *            The archive file to examine
     * @param prefix
     *            the prefix to append to the properties
     * @param identifierTerm
     *            the name of the identifier property
     * @param failOnError Whether to fail if there is an error
     * @throws IOException if the data file cannot be read
     */
    private void getMetadata(final ArchiveFile archiveFile,
            final String prefix, final Term identifierTerm, boolean failOnError)
            throws IOException {
        logger.info("Processing " + archiveFile.getRowType());
        ExecutionContext executionContext = this.stepExecution
                .getJobExecution().getExecutionContext();
        executionContext.put("dwca." + prefix + ".file", archiveFile
                .getLocationFile().getAbsolutePath());
        executionContext.put("dwca." + prefix + ".fieldsTerminatedBy",
                archiveFile.getFieldsTerminatedBy());
        executionContext.put("dwca." + prefix + ".linesTerminatedBy",
                archiveFile.getLinesTerminatedBy());
        if (archiveFile.getFieldsEnclosedBy() != null) {
            executionContext.put("dwca." + prefix + ".fieldsEnclosedBy",
                    archiveFile.getFieldsEnclosedBy());
        } else {
            // NUL character signals "no quoting" to downstream readers
            executionContext.put("dwca." + prefix + ".fieldsEnclosedBy", '\u0000');
        }
        executionContext.put("dwca." + prefix + ".encoding",
                archiveFile.getEncoding());
        Integer headerLinesToSkip = 0;
        if (archiveFile.getIgnoreHeaderLines() != null) {
            headerLinesToSkip = archiveFile.getIgnoreHeaderLines();
        }
        executionContext.put("dwca." + prefix + ".ignoreHeaderLines", headerLinesToSkip);
        ArchiveField idField = archiveFile.getId();
        idField.setTerm(identifierTerm);
        List<ArchiveField> fields = archiveFile.getFieldsSorted();
        /*
         * It is not clear whether the id field should also be listed amongst
         * the ordinary fields, but if it is present twice, ignore it.
         */
        boolean idListed = false;
        for (ArchiveField field : fields) {
            if (field.getIndex() != null
                    && field.getIndex().equals(idField.getIndex())) {
                idListed = true;
                break;
            }
        }
        if (!idListed) {
            fields.add(idField.getIndex(), idField);
        }
        // Scan the data file once: capture the first data line (to count its
        // columns) and the total number of lines (to report the record count)
        File file = archiveFile.getLocationFile();
        String firstDataLine = null;
        int totalLines;
        LineNumberReader lineNumberReader = new LineNumberReader(new InputStreamReader(
                new FileInputStream(file), Charset.forName(archiveFile.getEncoding())));
        try {
            String line;
            while ((line = lineNumberReader.readLine()) != null) {
                if (lineNumberReader.getLineNumber() == (headerLinesToSkip + 1)) {
                    firstDataLine = line;
                }
            }
            totalLines = lineNumberReader.getLineNumber();
        } finally {
            lineNumberReader.close();
        }
        Integer maxIndex = 0;
        for (ArchiveField field : fields) {
            if (field.getIndex() != null && field.getIndex() > maxIndex) {
                maxIndex = field.getIndex();
            }
        }
        Integer totalColumns;
        if (firstDataLine != null) {
            // Quote the delimiter: it may contain regex metacharacters (e.g.
            // '|'); the -1 limit keeps trailing empty values, which
            // java.lang.String.split would otherwise drop
            totalColumns = firstDataLine.split(
                    Pattern.quote(archiveFile.getFieldsTerminatedBy()), -1).length;
        } else {
            totalColumns = maxIndex + 1;
        }
        if ((maxIndex + 1) > totalColumns) {
            String message = "Metadata for " + archiveFile.getRowType()
                    + " indicates that there should be at least " + (maxIndex + 1)
                    + " columns but the first data line in the file has only "
                    + totalColumns + " values";
            if (failOnError) {
                throw new RuntimeException(message);
            } else {
                logger.error("Error reading metadata", new RuntimeException(message));
                executionContext.put(prefix + ".processing.mode", "SKIP_WITH_ERROR");
                return;
            }
        }
        executionContext.put("dwca." + prefix + ".totalColumns", totalColumns);
        executionContext.put("dwca." + prefix + ".totalRecords", totalLines - headerLinesToSkip);
        executionContext.put("dwca." + prefix + ".fieldNames", toFieldNames(fields, totalColumns));
        executionContext.put("dwca." + prefix + ".defaultValues", getDefaultValues(fields));
    }

    /**
     * Method which extracts the default fields from the archive and exposes
     * them. Fields with an index are supplied by the data file, not a
     * default, so only index-less fields are included.
     *
     * @param fields
     *            the list of fields
     * @return a map of property names -> default values
     */
    private Map<String, String> getDefaultValues(final List<ArchiveField> fields) {
        Map<String, String> defaultValues = new HashMap<String, String>();
        for (ArchiveField field : fields) {
            if (field.getDefaultValue() != null && field.getIndex() == null) {
                defaultValues.put(field.getTerm().qualifiedName(),
                        field.getDefaultValue());
            }
        }
        return defaultValues;
    }

    /**
     * Method which converts from archive fields to string field names,
     * positioned by column index. To be used by fieldSet mapper. Columns with
     * no declared field are left as empty strings; a field with a default
     * value has that value appended after a space.
     *
     * @param fields
     *            the DwC/A fields
     * @param totalColumns the number of columns in the data file
     * @return an array of string names, one per column
     */
    private String[] toFieldNames(final List<ArchiveField> fields, Integer totalColumns) {
        List<String> names = new ArrayList<String>(totalColumns);
        for (int i = 0; i < totalColumns; i++) {
            names.add("");
        }
        for (ArchiveField field : fields) {
            logger.info("Archive contains field " + field.getTerm().qualifiedName());
            if (field.getIndex() != null) {
                if (field.getDefaultValue() != null) {
                    names.set(field.getIndex(), field.getTerm().qualifiedName()
                            + " " + field.getDefaultValue());
                } else {
                    names.set(field.getIndex(), field.getTerm().qualifiedName());
                }
            }
        }
        logger.info("Archive contains field names " + names);
        return names.toArray(new String[names.size()]);
    }

    /**
     * @param newStepExecution Set the step execution
     * @return the exit status (null: this listener never overrides the
     *         step's own exit status)
     */
    public final ExitStatus afterStep(final StepExecution newStepExecution) {
        return null;
    }

    /**
     * @param newStepExecution Set the step execution
     */
    public final void beforeStep(final StepExecution newStepExecution) {
        this.stepExecution = newStepExecution;
    }
}