/*
* This is eMonocot, a global online biodiversity information resource.
*
* Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
*
* eMonocot is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* The complete text of the GNU Affero General Public License is in the source repository as the file
* ‘COPYING’. It is also available from <http://www.gnu.org/licenses/>.
*/
package org.emonocot.harvest.media;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.validation.ConstraintViolation;
import javax.validation.Validator;
import org.apache.commons.lang3.StringUtils;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchema;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.jempbox.xmp.XMPSchemaIptc4xmpCore;
import org.apache.jempbox.xmp.XMPSchemaPhotoshop;
import org.apache.jempbox.xmp.XMPSchemaRightsManagement;
import org.apache.sanselan.ImageReadException;
import org.apache.sanselan.Sanselan;
import org.apache.sanselan.common.IImageMetadata;
import org.apache.sanselan.common.ImageMetadata;
import org.apache.sanselan.formats.jpeg.JpegImageMetadata;
import org.apache.sanselan.formats.tiff.TiffImageMetadata;
import org.apache.sanselan.formats.tiff.constants.TiffConstants;
import org.emonocot.harvest.common.HtmlSanitizer;
import org.emonocot.job.dwc.exception.InvalidValuesException;
import org.emonocot.model.Image;
import org.emonocot.model.constants.AnnotationCode;
import org.emonocot.model.constants.AnnotationType;
import org.emonocot.model.constants.MediaFormat;
import org.emonocot.model.constants.RecordType;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.beans.factory.annotation.Autowired;
import org.xml.sax.InputSource;
/**
*
* @author ben
*
*/
public class ImageMetadataExtractorImpl implements ItemProcessor<Image, Image>, ImageMetadataExtractor {
// Logger for this extractor.
private Logger logger = LoggerFactory.getLogger(ImageMetadataExtractorImpl.class);

// Cleans text read from embedded metadata before it is stored on an image.
private HtmlSanitizer sanitizer;

// Directory in which the local image files are looked up.
private String imageDirectory;

// Records errors and warnings against the image being processed.
private ImageAnnotator imageAnnotator;

// Validates an image once embedded metadata has been applied to it.
private Validator validator;

// Date formats tried, in insertion order, when parsing an embedded creation date.
private List<DateTimeFormatter> dateTimeFormatters = new ArrayList<>();
/**
 * Creates an extractor and registers the date formats that are tried when an embedded
 * creation date has to be parsed. The ISO parser is registered first, followed by the
 * full, short and medium date and date-time styles.
 */
public ImageMetadataExtractorImpl() {
    DateTimeFormatter[] supportedFormats = {
        ISODateTimeFormat.dateTimeParser(),
        DateTimeFormat.fullDate(),
        DateTimeFormat.fullDateTime(),
        DateTimeFormat.shortDate(),
        DateTimeFormat.shortDateTime(),
        DateTimeFormat.mediumDate(),
        DateTimeFormat.mediumDateTime()
    };
    for (DateTimeFormatter supportedFormat : supportedFormats) {
        dateTimeFormatters.add(supportedFormat);
    }
}
/**
 * The XMP schema classes that are looked up in the embedded metadata. The array is
 * iterated in order, so earlier schemas get the first chance to supply a value.
 */
private Class[] schemas = new Class[] {
    XMPSchemaIptc4xmpCore.class,
    XMPSchemaRightsManagement.class,
    XMPSchemaDublinCore.class,
    XMPSchemaPhotoshop.class
};
/**
* Sets the sanitizer that is applied to text read from embedded metadata before it is
* stored on an image. Marked {@code @Autowired} so it can be injected.
*
* @param sanitizer the sanitizer to set
*/
@Autowired
public void setSanitizer(HtmlSanitizer sanitizer) {
this.sanitizer = sanitizer;
}
/**
* Sets the bean validator used to check an image after embedded metadata has been
* applied to it. Marked {@code @Autowired} so it can be injected.
*
* @param validator the validator to set
*/
@Autowired
public void setValidator(Validator validator) {
this.validator = validator;
}
/**
* Sets the directory in which the local image files are looked up. The file for an
* image is expected at {@code <directory>/<image id>.<image format>}.
*
* @param newImageDirectory
*            the image directory to use
*/
public void setImageDirectory(String newImageDirectory) {
this.imageDirectory = newImageDirectory;
}
/**
* Sets the annotator used to record errors and warnings (missing file, unreadable
* metadata, malformed dates) against the image being processed.
*
* @param imageAnnotator the imageAnnotator to set
*/
public void setImageAnnotator(ImageAnnotator imageAnnotator) {
this.imageAnnotator = imageAnnotator;
}
/**
* Replaces the ordered list of XMP schema classes that are looked up in the embedded
* metadata. The array is stored as given; it is not copied.
*
* @param schemas the schemas to set, in the order they should be consulted
*/
public void setSchemas(Class[] schemas) {
this.schemas = schemas;
}
/**
 * Supplements a persisted image with the metadata embedded in its local file.
 * <p>
 * The file is expected at {@code <imageDirectory>/<id>.<format>}. XMP metadata is read
 * first, using the configured schemas in order, followed by whatever metadata Sanselan
 * exposes. The values are collected on a scratch {@link Image} and then copied onto
 * {@code image} where the persisted image has no value of its own; the result is
 * validated before it is returned.
 * <p>
 * A failure to read or parse either kind of metadata is logged and annotated as a
 * warning on the image; it does not abort processing of the other kind.
 *
 * @param image the persisted image to supplement
 * @return the updated image, or {@code null} if the local file is missing or no
 *         metadata was applied (the record is then skipped)
 * @throws Exception if the updated image fails validation or an unexpected error occurs
 * @see org.emonocot.harvest.media.ImageMetadataExtractor#process(org.emonocot.model.Image)
 */
@Override
public Image process(Image image) throws Exception {
    String imageFileName = imageDirectory + File.separatorChar + image.getId() + '.' + image.getFormat();
    File file = new File(imageFileName);
    logger.debug("Image File {}", imageFileName);
    if (!file.exists()) {
        logger.error("File {} does not exist in image directory for image ({}), {}, skipping record",
                file.getCanonicalPath(), image.getId(), image);
        imageAnnotator.annotate(image, AnnotationType.Error, AnnotationCode.BadField, "Unable to get embedded metadata as the local file was not found");
        return null;
    }

    boolean metadataFound = false;
    // Scratch image that collects everything found in the file
    Image embeddedMetadata = new Image();

    // Reading the XMP packet can fail on a damaged file. Treat that like any other
    // XMP problem (warn and carry on) rather than letting it abort the whole item.
    String xmpXml = null;
    boolean xmpReadFailed = false;
    try {
        xmpXml = Sanselan.getXmpXml(file);
    } catch (IOException | ImageReadException e) {
        xmpReadFailed = true;
        logger.error("Error reading XMP XML with Sanselan for image (" + image.getId() + ") " + image, e);
        imageAnnotator.annotate(image, AnnotationType.Warn, AnnotationCode.BadField, "There was an issue with the XMP metadata");
    }

    if (xmpXml != null && !xmpXml.isEmpty()) {
        logger.debug("Attempting to extract metadata from xmp-xml:\n{}", xmpXml);
        try {
            XMPMetadata xmp = XMPMetadata.load(new InputSource(new StringReader(xmpXml)));
            for (Class<?> schemaClass : schemas) {
                XMPSchema schema = xmp.getSchemaByClass(schemaClass);
                if (schema == null) {
                    // NOTE(review): presumably null means the schema is absent from this file - confirm against Jempbox
                    logger.debug("No schema of {} found in the XMP metadata", schemaClass);
                    continue;
                }
                boolean added;
                if (schema instanceof XMPSchemaIptc4xmpCore) {
                    added = addIptcProperies((XMPSchemaIptc4xmpCore) schema, embeddedMetadata);
                } else if (schema instanceof XMPSchemaDublinCore) {
                    added = addDcProperies((XMPSchemaDublinCore) schema, embeddedMetadata);
                } else if (schema instanceof XMPSchemaRightsManagement) {
                    added = addRightsProprties((XMPSchemaRightsManagement) schema, embeddedMetadata);
                } else if (schema instanceof XMPSchemaPhotoshop) {
                    added = addPhotoshopProperties((XMPSchemaPhotoshop) schema, embeddedMetadata, image);
                } else {
                    logger.info("Unable to process a schema of: " + schemaClass);
                    continue;
                }
                metadataFound = added || metadataFound;
                // Building this message walks the schema's DOM, so only do it when needed
                if (logger.isDebugEnabled()) {
                    logger.debug("Known schema that will be added:" + schema.toString() + "\n"
                            + schema.getElement().getTextContent());
                }
            }
        } catch (IOException ioe) {
            logger.error("Exception parsing XMP XML for image (" + image.getId() + ") " + image
                    + " The XML was:\n" + xmpXml, ioe);
            imageAnnotator.annotate(image, AnnotationType.Warn, AnnotationCode.BadField, "There was an issue with the XMP metadata");
        }
    } else if (!xmpReadFailed) {
        logger.debug("Image {} does not contain embedded XMP metadata", file);
    }

    try {
        IImageMetadata metadata = Sanselan.getMetadata(file);
        if (metadata != null) {
            if (logger.isDebugEnabled()) {
                logger.debug("The metadata visible to Sanselan is: " + metadata.toString("*"));
            }
            metadataFound = addSanselanProperties(metadata, embeddedMetadata) || metadataFound;
        } else {
            logger.debug("There is no metadata available from Sanselan");
        }
    } catch (IOException | ImageReadException e) {
        logger.error("Error extracting information with Sanselan for image (" + image.getId() + ") " + image, e);
        imageAnnotator.annotate(image, AnnotationType.Warn, AnnotationCode.BadField, "There was an issue with EXIF metadata");
    }

    // Apply any supplementary metadata
    if (metadataFound && update(image, embeddedMetadata)) {
        validate(image);
        return image;
    } else {
        logger.debug("No metadata was updated, skipping");
        return null;
    }
}
/**
 * Copies supplementary values from the embedded metadata onto the persisted image.
 * <p>
 * Every property except the subject is only copied when the persisted image has no
 * value for it. Subjects are merged: each comma separated embedded subject that does
 * not already occur in the persisted subject is appended to it.
 *
 * @param image The persisted image to update
 * @param embeddedMetadata The image containing supplementary values
 * @return Whether any metadata was updated on the persisted image
 */
private boolean update(Image image, Image embeddedMetadata) {
    boolean updated = false;
    if (image.getTitle() == null && embeddedMetadata.getTitle() != null) {
        image.setTitle(embeddedMetadata.getTitle());
        updated = true;
    }
    if (image.getDescription() == null && embeddedMetadata.getDescription() != null) {
        image.setDescription(embeddedMetadata.getDescription());
        updated = true;
    }
    if (embeddedMetadata.getSubject() != null) {
        if (image.getSubject() == null) {
            image.setSubject(embeddedMetadata.getSubject());
            updated = true;
        } else {
            StringBuilder newSubject = new StringBuilder(image.getSubject());
            boolean subjectExtended = false;
            for (String subject : embeddedMetadata.getSubject().split(",")) {
                String candidate = subject.trim();
                // Substring match, so a subject already mentioned anywhere is not repeated
                if (newSubject.indexOf(candidate) < 0) {
                    newSubject.append(", ").append(candidate);
                    subjectExtended = true;
                }
            }
            // Only report an update when a subject was actually appended
            if (subjectExtended) {
                image.setSubject(newSubject.toString());
                updated = true;
            }
        }
    }
    if (image.getCreator() == null && embeddedMetadata.getCreator() != null) {
        image.setCreator(embeddedMetadata.getCreator());
        updated = true;
    }
    if (image.getFormat() == null && embeddedMetadata.getFormat() != null) {
        image.setFormat(embeddedMetadata.getFormat());
        updated = true;
    }
    if (image.getSpatial() == null && embeddedMetadata.getSpatial() != null) {
        image.setSpatial(embeddedMetadata.getSpatial());
        updated = true;
    }
    if (image.getCreated() == null && embeddedMetadata.getCreated() != null) {
        image.setCreated(embeddedMetadata.getCreated());
        updated = true;
    }
    if (image.getRights() == null && embeddedMetadata.getRights() != null) {
        image.setRights(embeddedMetadata.getRights());
        updated = true;
    }
    if (image.getRightsHolder() == null && embeddedMetadata.getRightsHolder() != null) {
        image.setRightsHolder(embeddedMetadata.getRightsHolder());
        updated = true;
    }
    if (image.getLicense() == null && embeddedMetadata.getLicense() != null) {
        image.setLicense(embeddedMetadata.getLicense());
        updated = true;
    }
    if (image.getLocation() == null && embeddedMetadata.getLocation() != null) {
        image.setLocation(embeddedMetadata.getLocation());
        updated = true;
    }
    return updated;
}
/**
 * Validates the image with the configured validator and rejects it if any constraint
 * is violated.
 *
 * @param image the image to validate
 * @throws InvalidValuesException if there is at least one constraint violation; the
 *         message lists the number of violations followed by the property path and
 *         message of each one
 */
protected void validate(Image image) {
    Set<ConstraintViolation<Image>> violations = validator.validate(image);
    if (violations.isEmpty()) {
        return;
    }
    StringBuilder message = new StringBuilder();
    message.append(violations.size()).append(" constraint violations:");
    // Separate the individual violations so that the message stays readable
    String separator = " ";
    for (ConstraintViolation<Image> violation : violations) {
        message.append(separator)
                .append(violation.getPropertyPath())
                .append(' ')
                .append(violation.getMessage());
        separator = "; ";
    }
    throw new InvalidValuesException(message.toString(), RecordType.Image, -1);
}
/**
 * Copies Dublin Core properties (title, description, subjects, creators and format)
 * onto the image. Title, description, creator and format are only set when the image
 * has no value yet; subjects are merged into any existing subject. Text values are
 * sanitized before they are stored.
 *
 * @param dcSchema the Dublin Core schema read from the XMP metadata
 * @param image the image collecting the embedded metadata
 * @return Whether any properties has been updated
 */
private boolean addDcProperies(XMPSchemaDublinCore dcSchema, Image image) {
    boolean isSomethingDifferent = false;
    if (image.getTitle() == null && StringUtils.isNotBlank(dcSchema.getTitle())) {
        image.setTitle(sanitizer.sanitize(dcSchema.getTitle()));
        isSomethingDifferent = true;
    }
    if (image.getDescription() == null && StringUtils.isNotBlank(dcSchema.getDescription())) {
        image.setDescription(sanitizer.sanitize(dcSchema.getDescription()));
        isSomethingDifferent = true;
    }

    // N.B. Additional subjects are currently added rather than being ignored or overwriting
    List<String> subjects = dcSchema.getSubjects();
    if (subjects != null && !subjects.isEmpty()) {
        String existingSubject = image.getSubject();
        StringBuilder mergedSubject = new StringBuilder();
        if (existingSubject != null) {
            mergedSubject.append(existingSubject);
        }
        for (String rawSubject : subjects) {
            // The check has to be made on the sanitized string, as that is what gets stored
            String subject = sanitizer.sanitize(rawSubject);
            if (StringUtils.isBlank(subject) || mergedSubject.indexOf(subject) >= 0) {
                continue;
            }
            // Every subject, including the first, is checked, so the value never starts with a separator
            if (mergedSubject.length() > 0) {
                mergedSubject.append(", ");
            }
            mergedSubject.append(subject);
        }
        int existingLength = existingSubject == null ? 0 : existingSubject.length();
        if (mergedSubject.length() > existingLength) {
            image.setSubject(mergedSubject.toString()); // Sanitized earlier
            isSomethingDifferent = true;
        }
    }

    List<String> creators = dcSchema.getCreators();
    if (image.getCreator() == null && creators != null && !creators.isEmpty()) {
        StringBuilder uncleanCreator = new StringBuilder();
        uncleanCreator.append(creators.get(0));
        for (int i = 1; i < creators.size(); i++) {
            uncleanCreator.append(", ").append(creators.get(i));
        }
        image.setCreator(sanitizer.sanitize(uncleanCreator.toString()));
        isSomethingDifferent = true;
    }

    if (image.getFormat() == null && StringUtils.isNotBlank(dcSchema.getFormat())) {
        String format = dcSchema.getFormat();
        // Matched case-insensitively; both the long and the short spellings are accepted
        MediaFormat mediaFormat = null;
        if (StringUtils.containsIgnoreCase(format, "gif")) {
            mediaFormat = MediaFormat.gif;
        } else if (StringUtils.containsIgnoreCase(format, "jpeg")
                || StringUtils.containsIgnoreCase(format, "jpg")) {
            mediaFormat = MediaFormat.jpg;
        } else if (StringUtils.containsIgnoreCase(format, "png")) {
            mediaFormat = MediaFormat.png;
        } else if (StringUtils.containsIgnoreCase(format, "tif")) {
            mediaFormat = MediaFormat.tif;
        }
        if (mediaFormat != null) {
            image.setFormat(mediaFormat);
            isSomethingDifferent = true;
        }
    }
    return isSomethingDifferent;
}
/**
 * Takes the location from the IPTC core schema as the spatial value of the image,
 * provided the image has no spatial value yet and the location is not blank. The
 * location is sanitized before it is stored.
 *
 * @param iptcSchema the IPTC core schema read from the XMP metadata
 * @param image the image collecting the embedded metadata
 * @return Whether any properties has been updated
 */
private boolean addIptcProperies(XMPSchemaIptc4xmpCore iptcSchema, Image image) {
    if (image.getSpatial() != null || StringUtils.isBlank(iptcSchema.getLocation())) {
        return false;
    }
    image.setSpatial(sanitizer.sanitize(iptcSchema.getLocation()));
    return true;
}
/**
 * Copies Photoshop schema properties onto the embedded metadata: state and country are
 * appended to the spatial value, and the creation date is parsed with the registered
 * date formats. A creation date that none of the formats can parse is annotated as a
 * warning on the persisted image.
 *
 * @param photoshopSchema the Photoshop schema read from the XMP metadata
 * @param embeddedMetadata the image collecting the embedded metadata
 * @param image the persisted image, used for annotations and log messages
 * @return Whether any properties has been updated
 */
private boolean addPhotoshopProperties(XMPSchemaPhotoshop photoshopSchema, Image embeddedMetadata, Image image) {
    boolean isSomethingDifferent = false;

    StringBuilder newSpatial = new StringBuilder();
    if (StringUtils.isNotBlank(embeddedMetadata.getSpatial())) {
        newSpatial.append(embeddedMetadata.getSpatial());
    }
    String[] spatialParts = {photoshopSchema.getState(), photoshopSchema.getCountry()};
    for (String spatialPart : spatialParts) {
        if (StringUtils.isBlank(spatialPart)) {
            continue;
        }
        String cleanPart = sanitizer.sanitize(spatialPart);
        if (StringUtils.isBlank(cleanPart)) {
            continue;
        }
        if (newSpatial.length() > 0) {
            newSpatial.append(", ");
        }
        newSpatial.append(cleanPart);
    }
    // Never store an empty spatial value: it would count as a change and be copied onto the image
    if (newSpatial.length() > 0 && !newSpatial.toString().equals(embeddedMetadata.getSpatial())) {
        embeddedMetadata.setSpatial(newSpatial.toString());
        isSomethingDifferent = true;
    }

    if (StringUtils.isNotBlank(photoshopSchema.getInstructions())) {
        //N.B. We could try and use the taxon matcher to associate an additional taxon (or multiple taxa if we are clear about the separator)
        logger.info("Photoshop instruction found: " + photoshopSchema.getInstructions());
        //TODO Match Taxon?
    }

    String rawDateCreated = photoshopSchema.getDateCreated();
    if (embeddedMetadata.getCreated() == null && StringUtils.isNotBlank(rawDateCreated)) {
        IllegalArgumentException lastFailure = null;
        DateTime dateCreated = null;
        for (DateTimeFormatter dateTimeFormatter : dateTimeFormatters) {
            try {
                dateCreated = dateTimeFormatter.parseDateTime(rawDateCreated);
                // The formats are ordered by preference, so the first one that parses wins
                break;
            } catch (IllegalArgumentException e) {
                lastFailure = e;
            }
        }
        if (dateCreated == null) {
            imageAnnotator.annotate(image, AnnotationType.Warn, AnnotationCode.BadField, rawDateCreated + " is not a well-formed date");
            // Identify the persisted image; the scratch image has no id or identifier
            logger.warn("Unable to set the Date Created for image " + image.getId() + " identifier: " + image.getIdentifier(), lastFailure);
        } else {
            embeddedMetadata.setCreated(dateCreated);
            // A parsed date is new metadata, so it has to be reported to the caller
            isSomethingDifferent = true;
        }
    }
    return isSomethingDifferent;
}
/**
 * Copies rights management properties (copyright, owners and license) onto the image.
 * Each property is only set when the image has no value for it yet, and every value is
 * sanitized before it is stored.
 * <p>
 * The license is the usage terms if they parse as a URI; otherwise it is the web
 * statement and the usage terms, joined with a '#' when both are present.
 *
 * @param rightsSchema the rights management schema read from the XMP metadata
 * @param image the image collecting the embedded metadata
 * @return Whether any properties has been updated
 */
private boolean addRightsProprties(XMPSchemaRightsManagement rightsSchema, Image image) {
    boolean isSomethingDifferent = false;

    String copyright = sanitizer.sanitize(rightsSchema.getCopyright());
    if (image.getRights() == null && StringUtils.isNotBlank(copyright)) {
        image.setRights(copyright);
        isSomethingDifferent = true;
    }

    List<String> owners = rightsSchema.getOwners();
    if (image.getRightsHolder() == null && owners != null && !owners.isEmpty()) {
        StringBuilder ownerList = new StringBuilder();
        ownerList.append(owners.get(0));
        for (int i = 1; i < owners.size(); i++) {
            ownerList.append(", ").append(owners.get(i));
        }
        image.setRightsHolder(sanitizer.sanitize(ownerList.toString()));
        isSomethingDifferent = true;
    }

    String webStatement = rightsSchema.getWebStatement();
    String usageTerms = rightsSchema.getUsageTerms();
    logger.debug("URL: {}for Usage terms/License: {}", webStatement, usageTerms);

    if (image.getLicense() == null) {
        // Blank usage terms are not a license URI. An empty string would otherwise parse
        // as a valid (empty) URI and cause the web statement to be ignored.
        boolean usageTermsAreUri = false;
        if (StringUtils.isNotBlank(usageTerms)) {
            try {
                new URI(usageTerms);
                usageTermsAreUri = true;
            } catch (URISyntaxException e) {
                logger.debug(usageTerms + " is not a valid URI");
            }
        }

        StringBuilder uncleanLicense = new StringBuilder();
        if (usageTermsAreUri) {
            uncleanLicense.append(usageTerms);
        } else {
            if (StringUtils.isNotBlank(webStatement)) {
                uncleanLicense.append(webStatement);
            }
            if (StringUtils.isNotBlank(usageTerms)) {
                if (uncleanLicense.length() > 0) {
                    uncleanLicense.append("#");
                }
                uncleanLicense.append(usageTerms);
            }
        }

        String license = sanitizer.sanitize(uncleanLicense.toString());
        if (StringUtils.isNotBlank(license)) {
            image.setLicense(license);
            isSomethingDifferent = true;
        }
    }
    return isSomethingDifferent;
}
/**
 * Copies the metadata that Sanselan exposes for a JPEG onto the image: the "Object Name"
 * item as title, the "Keywords" items as subject, the sublocation, province/state and
 * country items as spatial value, the EXIF artist, copyright and image description
 * tags, and the GPS position. A property is only set when the image has no value for
 * it yet. Metadata that is not JPEG metadata is ignored.
 *
 * @param metadata the metadata read by Sanselan
 * @param image the image collecting the embedded metadata
 * @return Whether any properties has been updated
 */
private boolean addSanselanProperties(IImageMetadata metadata, Image image) throws Exception {
    if (!(metadata instanceof JpegImageMetadata)) {
        return false;
    }
    JpegImageMetadata jpeg = (JpegImageMetadata) metadata;
    boolean changed = false;

    // Items that may occur several times are collected first and applied after the loop
    StringBuilder subjectText = null;
    StringBuilder placeText = null;
    for (Object entry : jpeg.getItems()) {
        if (!(entry instanceof ImageMetadata.Item)) {
            continue;
        }
        ImageMetadata.Item item = (ImageMetadata.Item) entry;
        switch (item.getKeyword()) {
        case "Object Name":
            if (image.getTitle() == null) {
                image.setTitle(sanitizer.sanitize(item.getText()));
                changed = true;
            }
            break;
        case "Keywords":
            if (subjectText == null) {
                subjectText = new StringBuilder().append(item.getText());
            } else {
                subjectText.append(", ").append(item.getText());
            }
            break;
        case "Sublocation":
        case "Province/State":
        case "Country/Primary Location Name":
            if (placeText == null) {
                placeText = new StringBuilder().append(item.getText());
            } else {
                placeText.append(", ").append(item.getText());
            }
            break;
        default:
            break;
        }
    }

    if (placeText != null && image.getSpatial() == null) {
        image.setSpatial(sanitizer.sanitize(placeText.toString()));
        changed = true;
    }
    if (subjectText != null && image.getSubject() == null) {
        image.setSubject(sanitizer.sanitize(subjectText.toString()));
        changed = true;
    }

    if (jpeg.findEXIFValue(TiffConstants.TIFF_TAG_ARTIST) != null && image.getCreator() == null) {
        String artist = jpeg.findEXIFValue(TiffConstants.TIFF_TAG_ARTIST).getStringValue();
        image.setCreator(sanitizer.sanitize(artist));
        changed = true;
    }
    if (jpeg.findEXIFValue(TiffConstants.TIFF_TAG_COPYRIGHT) != null && image.getRights() == null) {
        String copyright = jpeg.findEXIFValue(TiffConstants.TIFF_TAG_COPYRIGHT).getStringValue();
        image.setRights(sanitizer.sanitize(copyright));
        changed = true;
    }
    if (jpeg.findEXIFValue(TiffConstants.TIFF_TAG_IMAGE_DESCRIPTION) != null && image.getDescription() == null) {
        String description = jpeg.findEXIFValue(TiffConstants.TIFF_TAG_IMAGE_DESCRIPTION).getStringValue();
        image.setDescription(sanitizer.sanitize(description));
        changed = true;
    }

    TiffImageMetadata exif = jpeg.getExif();
    if (exif != null) {
        TiffImageMetadata.GPSInfo gps = exif.getGPS();
        if (gps != null && image.getLocation() == null) {
            image.setLongitude(gps.getLongitudeAsDegreesEast());
            image.setLatitude(gps.getLatitudeAsDegreesNorth());
            changed = true;
        }
    }
    return changed;
}
/**
 * Checks the configuration once all properties have been set. The image directory is
 * asserted to be present (only checked when assertions are enabled), and a default
 * sanitizer is created and initialised if none has been set.
 *
 * @throws Exception if the default sanitizer cannot be initialised
 */
public void afterPropertiesSet() throws Exception {
    assert imageDirectory != null;
    if (sanitizer != null) {
        return;
    }
    sanitizer = new HtmlSanitizer();
    sanitizer.afterPropertiesSet();
}
}