ArchiveMetadataReader.java example

Explorer
eMonocot-master
/*
 * This is eMonocot, a global online biodiversity information resource.
 *
 * Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
 *
 * eMonocot is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * The complete text of the GNU Affero General Public License is in the source repository as the file
 * ‘COPYING’.  It is also available from <http://www.gnu.org/licenses/>.
 */
package org.emonocot.job.dwc.read;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.validation.ConstraintViolation;
import javax.validation.Validator;

import org.emonocot.api.OrganisationService;
import org.emonocot.api.job.SkosTerm;
import org.emonocot.model.registry.Organisation;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveField;
import org.gbif.dwc.text.ArchiveFile;
import org.gbif.dwc.text.UnsupportedArchiveException;
import org.gbif.metadata.BasicMetadata;
import org.gbif.metadata.eml.Eml;
import org.gbif.metadata.eml.EmlFactory;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.core.ExitStatus;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.StepExecutionListener;
import org.springframework.batch.item.ExecutionContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.xml.sax.SAXException;

/**
 * Spring Batch step listener which inspects an unpacked Darwin Core Archive
 * and publishes what it finds to the job execution context.
 * <p>
 * For the core file and each supported extension the reader stores the file
 * location, delimiters, encoding, number of header lines, column count,
 * record count, field names and default values under keys of the form
 * <code>dwca.&lt;prefix&gt;.&lt;property&gt;</code>. If the archive declares
 * a metadata (EML) document, the matching {@link Organisation} is updated
 * from it.
 *
 * @author ben
 *
 */
public class ArchiveMetadataReader implements StepExecutionListener {
	/**
	 * Logger shared by all instances.
	 */
	private static final Logger logger = LoggerFactory
			.getLogger(ArchiveMetadataReader.class);

	/**
	 * The current step execution, captured in {@link #beforeStep(StepExecution)}.
	 */
	private StepExecution stepExecution;

	/**
	 * The identifier of the organisation whose archive is being read.
	 */
	private String sourceName;

	/**
	 * Service used to look up and persist the organisation.
	 */
	private OrganisationService organisationService;

	/**
	 * Validator applied to the organisation before it is saved.
	 */
	private Validator validator;

	/**
	 * @param sourceService the sourceService to set
	 */
	@Autowired
	public final void setSourceService(final OrganisationService sourceService) {
		this.organisationService = sourceService;
	}

	/**
	 * @param validator the validator to set
	 */
	@Autowired
	public final void setValidator(Validator validator) {
		this.validator = validator;
	}

	/**
	 * Reads the archive descriptor, updates the source organisation from the
	 * archive's metadata document (if one is declared) and publishes the
	 * layout of the core file and of every supported extension.
	 *
	 * @param archiveDirectoryName
	 *            The directory where the DwC Archive has been unpacked to
	 * @param sourceName Set the name of the source
	 * @param metaErrorsFail
	 *            "false" (case-insensitive) to log and skip a file whose
	 *            metadata is inconsistent; any other value, including null,
	 *            makes such an inconsistency raise an exception
	 * @return An exit status indicating whether the step has been completed or
	 *         failed
	 */
	public final ExitStatus readMetadata(final String archiveDirectoryName,
			final String sourceName, String metaErrorsFail) {
		this.sourceName = sourceName;
		boolean failOnError = !"false".equalsIgnoreCase(metaErrorsFail);
		try {
			File archiveDirectory = new File(archiveDirectoryName);
			File metaDir = getMetaDirectory(archiveDirectory);
			if (metaDir == null) {
				logger.error("Could not find metadata directory in " +  archiveDirectoryName);
				return ExitStatus.FAILED;
			}
			Archive archive = ArchiveFactory.openArchive(metaDir);

			ArchiveFile core = archive.getCore();

			if (archive.getMetadataLocation() != null) {
				// NOTE(review): the location is resolved against the directory passed in,
				// not against metaDir - confirm this is right for nested archives
				readSourceMetadata(archiveDirectoryName + File.separator + archive.getMetadataLocation());
			}
			getMetadata(core, "core", DwcTerm.taxonID, failOnError);

			getExtensionMetadata(archive, GbifTerm.Description, "description", failOnError);
			getExtensionMetadata(archive, GbifTerm.Distribution, "distribution", failOnError);
			getExtensionMetadata(archive, GbifTerm.Image, "image", failOnError);
			getExtensionMetadata(archive, GbifTerm.Reference, "reference", failOnError);
			getExtensionMetadata(archive, GbifTerm.Identifier, "identifier", failOnError);
			getExtensionMetadata(archive, DwcTerm.MeasurementOrFact, "measurementOrFact", failOnError);
			getExtensionMetadata(archive, GbifTerm.VernacularName, "vernacularName", failOnError);
			getExtensionMetadata(archive, GbifTerm.TypesAndSpecimen, "typeAndSpecimen", failOnError);
			getExtensionMetadata(archive, SkosTerm.Concept, "term", failOnError);
			getExtensionMetadata(archive, GbifTerm.Multimedia, "multimedia", failOnError);
		} catch (UnsupportedArchiveException uae) {
			logger.error("Unsupported Archive Exception reading "
					+ archiveDirectoryName + " " + uae.getLocalizedMessage(), uae);
			return ExitStatus.FAILED;
		} catch (IOException ioe) {
			logger.error("Input Output Exception reading " + archiveDirectoryName
					+ " " + ioe.getLocalizedMessage(), ioe);
			return ExitStatus.FAILED;
		}

		return ExitStatus.COMPLETED;
	}

	/**
	 * Parses the archive's metadata document and applies it to the source
	 * organisation. A document which cannot be parsed is logged and ignored.
	 *
	 * @param metadataFileName the path of the metadata document
	 * @throws IOException if the document cannot be opened, read or closed
	 */
	private void readSourceMetadata(final String metadataFileName) throws IOException {
		FileInputStream metadataStream = new FileInputStream(metadataFileName);
		try {
			Eml eml = EmlFactory.build(metadataStream);
			updateSourceMetadata(eml);
		} catch (SAXException e) {
			logger.error(e.getMessage(), e);
		} finally {
			// Release the file handle whether or not parsing succeeded
			metadataStream.close();
		}
	}

	/**
	 * Publishes the layout of one extension, if the archive contains it.
	 *
	 * @param archive the archive being read
	 * @param rowType the row type which identifies the extension
	 * @param prefix the prefix used for the execution context keys
	 * @param failOnError Whether to fail if there is an error
	 * @throws IOException if the extension's data file cannot be read
	 */
	private void getExtensionMetadata(final Archive archive, final Term rowType,
			final String prefix, final boolean failOnError) throws IOException {
		ArchiveFile extension = archive.getExtension(rowType);
		if (extension != null) {
			getMetadata(extension, prefix, DwcTerm.taxonID, failOnError);
		}
	}

	/**
	 * Recurses over a directory system and returns the first enclosing
	 * directory which contains an entry whose name ends with .xml
	 *
	 * @param archiveDirectory the directory to start searching from
	 * @return the directory, or null if no such entry can be found or the
	 *         given path cannot be listed
	 */
	private File getMetaDirectory(File archiveDirectory) {
		File[] children = archiveDirectory.listFiles();
		if (children == null) {
			// Not a directory, or it could not be read
			return null;
		}
		for (File child : children) {
			if (child.getName().endsWith(".xml")) {
				return archiveDirectory;
			}
			if (child.isDirectory()) {
				File nested = getMetaDirectory(child);
				if (nested != null) {
					return nested;
				}
			}
		}
		return null;
	}

	/**
	 * Copies every property which differs from the supplied metadata onto the
	 * source organisation, then validates and saves the organisation if
	 * anything changed. Constraint violations are logged and the organisation
	 * is not saved.
	 *
	 * @param basicMetadata
	 *            Set the metadata
	 */
	private void updateSourceMetadata(final BasicMetadata basicMetadata) {
		boolean update = false;
		Organisation source = organisationService.find(sourceName);
		if (!nullSafeEquals(source.getBibliographicCitation(),
				basicMetadata.getCitationString())) {
			source.setBibliographicCitation(basicMetadata.getCitationString());
			update = true;
		}
		if (!nullSafeEquals(source.getCreatorEmail(), basicMetadata.getCreatorEmail())) {
			source.setCreatorEmail(basicMetadata.getCreatorEmail());
			update = true;
		}
		if (!nullSafeEquals(source.getCreator(), basicMetadata.getCreatorName())) {
			source.setCreator(basicMetadata.getCreatorName());
			update = true;
		}
		if (!nullSafeEquals(source.getDescription(), basicMetadata.getDescription())) {
			source.setDescription(basicMetadata.getDescription());
			update = true;
		}
		if (!nullSafeEquals(source.getUri(), basicMetadata.getHomepageUrl())) {
			source.setUri(basicMetadata.getHomepageUrl());
			update = true;
		}
		if (!nullSafeEquals(source.getLogoUrl(), basicMetadata.getLogoUrl())) {
			source.setLogoUrl(basicMetadata.getLogoUrl());
			update = true;
		}
		if (!nullSafeEquals(source.getPublisherEmail(), basicMetadata.getPublisherEmail())) {
			source.setPublisherEmail(basicMetadata.getPublisherEmail());
			update = true;
		}
		if (!nullSafeEquals(source.getPublisherName(), basicMetadata.getPublisherName())) {
			source.setPublisherName(basicMetadata.getPublisherName());
			update = true;
		}
		if (!nullSafeEquals(source.getSubject(), basicMetadata.getSubject())) {
			source.setSubject(basicMetadata.getSubject());
			update = true;
		}
		if (!nullSafeEquals(source.getTitle(), basicMetadata.getTitle())) {
			source.setTitle(basicMetadata.getTitle());
			update = true;
		}
		if (!nullSafeEquals(source.getRights(), basicMetadata.getRights())) {
			source.setRights(basicMetadata.getRights());
			update = true;
		}
		if (basicMetadata.getPublished() != null) {
			DateTime published = new DateTime(basicMetadata.getPublished());
			if (source.getCreated() == null) {
				// First time a publication date is seen: treat it as the creation date
				source.setCreated(published);
				update = true;
			} else if (published.isAfter(source.getCreated())) {
				source.setModified(published);
				update = true;
			}
		}

		if (update) {
			Set<ConstraintViolation<Organisation>> violations = validator.validate(source);
			if (violations.isEmpty()) {
				logger.info("Updating metadata for source " + sourceName);
				organisationService.saveOrUpdate(source);
			} else {
				for (ConstraintViolation<Organisation> violation : violations) {
					logger.error(violation.getMessage());
				}
			}
		}
	}

	/**
	 * Compares two strings, treating two nulls as equal.
	 *
	 * @param string1 Set the first string
	 * @param string2 Set the second string
	 * @return true if the strings are equal, false otherwise
	 */
	private boolean nullSafeEquals(final String string1, final String string2) {
		if (string1 == null) {
			return string2 == null;
		}
		return string1.equals(string2);
	}

	/**
	 * Publishes the layout of one archive file to the job execution context
	 * and checks that the first data line has at least as many columns as the
	 * archive descriptor declares.
	 *
	 * @param archiveFile
	 *            The archive file to examine
	 * @param prefix
	 *            the prefix to append to the properties
	 * @param identifierTerm
	 *            the name of the identifier property
	 * @param failOnError Whether to fail if there is an error
	 * @throws IOException if the data file cannot be read
	 * @throws RuntimeException if failOnError is set and the first data line
	 *             has fewer columns than the descriptor declares
	 */
	private void getMetadata(final ArchiveFile archiveFile,
			final String prefix, final Term identifierTerm, boolean failOnError) throws IOException {
		logger.info("Processing " + archiveFile.getRowType());
		ExecutionContext executionContext = this.stepExecution
				.getJobExecution().getExecutionContext();
		String keyPrefix = "dwca." + prefix;

		executionContext.put(keyPrefix + ".file", archiveFile
				.getLocationFile().getAbsolutePath());
		executionContext.put(keyPrefix + ".fieldsTerminatedBy",
				archiveFile.getFieldsTerminatedBy());
		executionContext.put(keyPrefix + ".linesTerminatedBy",
				archiveFile.getLinesTerminatedBy());
		if (archiveFile.getFieldsEnclosedBy() != null) {
			executionContext.put(keyPrefix + ".fieldsEnclosedBy", archiveFile.getFieldsEnclosedBy());
		} else {
			// The NUL character stands in for "no enclosing character"
			executionContext.put(keyPrefix + ".fieldsEnclosedBy", '\u0000');
		}
		executionContext.put(keyPrefix + ".encoding",
				archiveFile.getEncoding());

		Integer headerLinesToSkip = 0;
		if (archiveFile.getIgnoreHeaderLines() != null) {
			headerLinesToSkip = archiveFile.getIgnoreHeaderLines();
		}

		executionContext.put(keyPrefix + ".ignoreHeaderLines", headerLinesToSkip);
		ArchiveField idField = archiveFile.getId();
		idField.setTerm(identifierTerm);

		List<ArchiveField> fields = archiveFile.getFieldsSorted();
		/*
		 * Its not clear if you should include the id field twice but if it is
		 * present twice, ignore it.
		 */
		boolean idListed = false;
		for (ArchiveField field : fields) {
			if (field.getIndex() != null
					&& field.getIndex().equals(idField.getIndex())) {
				idListed = true;
				break;
			}
		}
		if (!idListed) {
			// Columns are resolved through ArchiveField.getIndex(), not through the
			// position in this list, so clamp the insert position: the id column
			// index may exceed the number of declared fields
			int position = Math.min(idField.getIndex(), fields.size());
			fields.add(position, idField);
		}

		// Scan the whole file once to capture the first data line and count the lines
		String firstDataLine = null;
		int totalLines;
		File file = archiveFile.getLocationFile();
		FileInputStream fileInputStream = new FileInputStream(file);
		try {
			LineNumberReader lineNumberReader = new LineNumberReader(new InputStreamReader(
					fileInputStream, Charset.forName(archiveFile.getEncoding())));
			String line = null;
			while ((line = lineNumberReader.readLine()) != null) {
				if (lineNumberReader.getLineNumber() == (headerLinesToSkip + 1)) {
					firstDataLine = line;
				}
			}
			totalLines = lineNumberReader.getLineNumber();
		} finally {
			fileInputStream.close();
		}

		Integer totalColumns = 0;
		Integer maxIndex = 0;
		for (ArchiveField field : fields) {
			if (field.getIndex() != null && field.getIndex() > maxIndex) {
				maxIndex = field.getIndex();
			}
		}
		if (firstDataLine != null) {
			// String.split takes a regular expression, so the delimiter is quoted to
			// keep characters such as '|' literal; the limit of -1 keeps trailing
			// empty values. Assumes getFieldsTerminatedBy() returns the literal
			// delimiter with escape sequences already resolved - TODO confirm
			String delimiter = java.util.regex.Pattern.quote(archiveFile.getFieldsTerminatedBy());
			totalColumns = firstDataLine.split(delimiter, -1).length;
		} else {
			// No data lines: trust the descriptor
			totalColumns = maxIndex + 1;
		}

		if ((maxIndex + 1) > totalColumns) {
			String message = "Metadata for " + archiveFile.getRowType()
					+ " indicates that there should be at least " + (maxIndex + 1)
					+ " columns but the first data line in the file has only " + totalColumns + " values";
			if (failOnError) {
				throw new RuntimeException(message);
			}
			logger.error("Error reading metadata", new RuntimeException(message));

			// NOTE(review): unlike every other key this one has no "dwca." prefix -
			// confirm against whatever reads it before changing
			executionContext.put(prefix + ".processing.mode", "SKIP_WITH_ERROR");
			return;
		}

		executionContext.put(keyPrefix + ".totalColumns", totalColumns);

		executionContext.put(keyPrefix + ".totalRecords", totalLines - headerLinesToSkip);

		executionContext.put(keyPrefix + ".fieldNames", toFieldNames(fields, totalColumns));

		executionContext.put(keyPrefix + ".defaultValues",  getDefaultValues(fields));
	}

	/**
	 * Method which extracts the default fields from the archive and exposes
	 * them. Only fields which have a default value and no column index are
	 * included.
	 *
	 * @param fields
	 *            the list of fields
	 * @return a map of property names -> default values
	 */
	private Map<String, String> getDefaultValues(final List<ArchiveField> fields) {
		Map<String, String> defaultValues = new HashMap<String, String>();
		for (ArchiveField field : fields) {
			if (field.getDefaultValue() != null && field.getIndex() == null) {
				defaultValues.put(field.getTerm().qualifiedName(),
						field.getDefaultValue());
			}
		}
		return defaultValues;
	}

	/**
	 * Method which converts from archive fields to string field names. To be
	 * used by fieldSet mapper. Columns with no declared field are given an
	 * empty name; a field which has both a column and a default value is
	 * named "qualifiedName defaultValue".
	 *
	 * @param fields
	 *            the DwC/A fields
	 * @param totalColumns
	 *            the number of columns in the data file, which is the length
	 *            of the returned array
	 * @return an array of string names
	 */
	private String[] toFieldNames(final List<ArchiveField> fields, Integer totalColumns) {

		List<String> names = new ArrayList<String>(totalColumns);
		for (int i = 0; i < totalColumns; i++) {
			names.add("");
		}

		for (ArchiveField field : fields) {
			String qualifiedName = field.getTerm().qualifiedName();
			logger.info("Archive contains field " + qualifiedName);
			if (field.getIndex() != null) {
				if (field.getDefaultValue() != null) {
					names.set(field.getIndex(), qualifiedName + " " + field.getDefaultValue());
				} else {
					names.set(field.getIndex(), qualifiedName);
				}
			}
		}
		logger.info("Archive contains field names " + names);
		return names.toArray(new String[names.size()]);
	}

	/**
	 * Does nothing.
	 *
	 * @param newStepExecution Set the step execution
	 * @return always null
	 */
	public final ExitStatus afterStep(final StepExecution newStepExecution) {
		return null;
	}

	/**
	 * Captures the step execution so that the job execution context can be
	 * reached while reading metadata.
	 *
	 * @param newStepExecution Set the step execution
	 */
	public final void beforeStep(final StepExecution newStepExecution) {
		this.stepExecution = newStepExecution;
	}

}