/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.sequencer.pdf;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.stream.Collectors;
import javax.jcr.Binary;
import javax.jcr.NamespaceRegistry;
import javax.jcr.Node;
import javax.jcr.Property;
import javax.jcr.RepositoryException;
import javax.jcr.Value;
import javax.jcr.ValueFactory;
import org.modeshape.common.util.CheckArg;
import org.modeshape.common.util.StringUtil;
import org.modeshape.jcr.api.JcrConstants;
import org.modeshape.jcr.api.nodetype.NodeTypeManager;
import org.modeshape.jcr.api.sequencer.Sequencer;
/**
* A sequencer that processes the binary content of a PDF file, extracts the metadata, and then writes that
* metadata to the repository.
* <p>
* This sequencer produces data that corresponds to the following structure:
* <ul>
* <li><strong>pdf:metadata</strong> node of type <code>pdf:metadata</code>
* <ul>
* <li><strong>jcr:mimeType</strong> - optional string property for the mime type of the document</li>
* <li><strong>pdf:pageCount</strong> - mandatory long property specifying number of pages</li>
* <li><strong>pdf:encrypted</strong> - mandatory boolean property specifying whether the document is encrypted</li>
* <li><strong>pdf:version</strong> - mandatory string property for the version of the PDF format</li>
* <li><strong>pdf:orientation</strong> - mandatory string property specifying the orientation of the paper (landscape, portrait, reverse landscape)</li>
* <li><strong>pdf:author</strong> - optional string property for the author of the document</li>
* <li><strong>pdf:creationDate</strong> - optional date property for the creation date of the document</li>
* <li><strong>pdf:creator</strong> - optional string property for the creator of the document</li>
* <li><strong>pdf:keywords</strong> - optional string property for the keywords of the document (comma delimited)</li>
* <li><strong>pdf:modificationDate</strong> - optional date property for the modification date of document</li>
* <li><strong>pdf:producer</strong> - optional string property for the producer of the document</li>
* <li><strong>pdf:subject</strong> - optional string property for the subject of the document</li>
* <li><strong>pdf:title</strong> - optional string property for the title of the document</li>
* <li><strong>pdf:xmp</strong> - optional child node for the metadata fields from XMP block
* <ul>
* <li><strong>xmp:baseURL</strong> - optional string property for the baseURL</li>
* <li><strong>xmp:createDate</strong> - optional date property for creation date of this object</li>
* <li><strong>xmp:creatorTool</strong> - optional string property specifying the creator tool used to make this document</li>
* <li><strong>xmp:identifier</strong> - optional multi-valued string property for the identifiers of the object</li>
* <li><strong>xmp:label</strong> - optional string property for the label of the object</li>
* <li><strong>xmp:metadataDate</strong> - optional date property for creation date of this metadata</li>
* <li><strong>xmp:modifyDate</strong> - optional date property for modification date of this object</li>
* <li><strong>xmp:nickname</strong> - optional string property for the nickname</li>
* <li><strong>xmp:rating</strong> - optional string property for the rating</li>
* <li><strong>xmp:label</strong> - optional string property for the label</li>
* </ul>
* </li>
* <li><strong>pdf:page</strong> - optional child node for the metadata fields related to individual pages
* <ul>
* <li><strong>pdf:pageNumber</strong> - mandatory long property for the number of this page</li>
* <li><strong>pdf:attachement</strong> - optional child node for the metadata fields related to attachment
* <ul>
* <li><strong>pdf:creationDate</strong> - optional date property for creation date of this attachment</li>
* <li><strong>pdf:modificationDate</strong> - optional date property for modification date of this attachment</li>
* <li><strong>pdf:subject</strong> - optional string property for the subject of this attachment</li>
* <li><strong>pdf:name</strong> - optional string property for the name of this attachment</li>
* <li><strong>jcr:mimeType</strong> - optional string property for the mime type of this attachment</li>
* <li><strong>jcr:data</strong> - optional binary property for the content of this attachment</li>
* </ul>
* </li>
* </ul>
* </li>
* </ul>
* </p>
*
* @since 5.1
*/
public class PdfMetadataSequencer extends Sequencer {

    /**
     * Registers the node types declared in <code>pdf.cnd</code> and the PDF mime type as the default mime type
     * handled by this sequencer.
     *
     * @param registry the namespace registry (not used by this implementation)
     * @param nodeTypeManager the node type manager the CND file is registered with
     * @throws RepositoryException if the node types cannot be registered
     * @throws IOException if the CND file cannot be read
     */
    @Override
    public void initialize( NamespaceRegistry registry,
                            NodeTypeManager nodeTypeManager ) throws RepositoryException, IOException {
        super.registerNodeTypes("pdf.cnd", nodeTypeManager, true);
        registerDefaultMimeTypes(PdfBasicMetadata.MIME_TYPE_STRING);
    }

    /**
     * Extracts the basic and, when available, the XMP metadata from the binary value of the input property and
     * writes it under the output node.
     *
     * @param inputProperty the property holding the binary PDF content
     * @param outputNode the node under (or onto) which the metadata is written
     * @param context the sequencing context (not used by this implementation)
     * @return <code>true</code> if the basic metadata was extracted; <code>false</code> if the document was ignored
     * @throws Exception if the input cannot be read or the output node cannot be prepared
     */
    @Override
    public boolean execute( Property inputProperty,
                            Node outputNode,
                            Context context ) throws Exception {
        Binary binaryValue = inputProperty.getBinary();
        CheckArg.isNotNull(binaryValue, "binary");
        Node sequencedNode = getPdfMetadataNode(outputNode);
        try {
            if (processBasicMetadata(sequencedNode, binaryValue)) {
                // The XMP block is optional, so its outcome does not influence the overall result.
                processXMPMetadata(sequencedNode, binaryValue);
                return true;
            }
            getLogger().warn("Ignoring pdf from node {0} because basic metadata cannot be extracted",
                             inputProperty.getParent().getPath());
            return false;
        } catch (NoClassDefFoundError ncdfe) {
            // The message may legitimately be null; guard against it so the original error is never
            // masked by a NullPointerException.
            String message = ncdfe.getMessage();
            if (message != null && message.toLowerCase(Locale.ROOT).contains("bouncycastle")) {
                getLogger().warn("Ignoring pdf from node {0} because it's encrypted and encrypted PDFs are not supported",
                                 inputProperty.getParent().getPath());
                return false;
            }
            throw ncdfe;
        }
    }

    /**
     * Reads the basic document metadata and stores it on the sequenced node, adding one child node per page and
     * one grandchild node per attachment.
     *
     * @param sequencedNode the node receiving the metadata
     * @param binaryValue the binary PDF content
     * @return <code>true</code> if the metadata check passed and everything was written; <code>false</code> if the
     *         check failed or any exception occurred (the exception is logged, not propagated)
     */
    private boolean processBasicMetadata( Node sequencedNode,
                                          Binary binaryValue ) {
        try (InputStream stream = binaryValue.getStream()) {
            PdfBasicMetadata metadata = new PdfBasicMetadata(stream);
            if (!metadata.check()) {
                return false;
            }
            setPropertyIfMetadataPresent(sequencedNode, JcrConstants.JCR_MIME_TYPE, PdfBasicMetadata.MIME_TYPE_STRING);
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PAGE_COUNT, metadata.getPageCount());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ORIENTATION, metadata.getOrientation());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.ENCRYPTED, metadata.isEncrypted());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.VERSION, metadata.getVersion());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.AUTHOR, metadata.getAuthor());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATION_DATE, metadata.getCreationDate());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.CREATOR, metadata.getCreator());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.KEYWORDS, metadata.getKeywords());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.MODIFICATION_DATE, metadata.getModificationDate());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.PRODUCER, metadata.getProducer());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.SUBJECT, metadata.getSubject());
            setPropertyIfMetadataPresent(sequencedNode, PdfMetadataLexicon.TITLE, metadata.getTitle());
            for (PdfPageMetadata pageMetadata : metadata.getPages()) {
                Node pageNode = sequencedNode.addNode(PdfMetadataLexicon.PAGE_NODE, PdfMetadataLexicon.PAGE_NODE);
                setPropertyIfMetadataPresent(pageNode, PdfMetadataLexicon.PAGE_NUMBER, pageMetadata.getPageNumber());
                for (PdfAttachmentMetadata attachmentMetadata : pageMetadata.getAttachments()) {
                    Node attachmentNode = pageNode.addNode(PdfMetadataLexicon.ATTACHMENT_NODE, PdfMetadataLexicon.ATTACHMENT_NODE);
                    setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_MIME_TYPE, attachmentMetadata.getMimeType());
                    setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.CREATION_DATE, attachmentMetadata.getCreationDate());
                    setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.MODIFICATION_DATE, attachmentMetadata.getModificationDate());
                    setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.SUBJECT, attachmentMetadata.getSubject());
                    setPropertyIfMetadataPresent(attachmentNode, PdfMetadataLexicon.NAME, attachmentMetadata.getName());
                    setPropertyIfMetadataPresent(attachmentNode, JcrConstants.JCR_DATA, attachmentMetadata.getData());
                }
            }
            return true;
        } catch (Exception e) {
            getLogger().error(e, "Couldn't process stream.");
        }
        return false;
    }

    /**
     * Reads the XMP metadata block and, if its check passes, stores it on a new XMP child node of the sequenced
     * node.
     *
     * @param sequencedNode the node under which the XMP child node is created
     * @param binaryValue the binary PDF content
     * @return <code>true</code> if the XMP metadata was written; <code>false</code> if the check failed or any
     *         exception occurred (the exception is logged, not propagated)
     */
    private boolean processXMPMetadata( Node sequencedNode,
                                        Binary binaryValue ) {
        try (InputStream stream = binaryValue.getStream()) {
            PdfXmpMetadata metadata = new PdfXmpMetadata(stream);
            if (!metadata.check()) {
                return false;
            }
            Node xmpNode = sequencedNode.addNode(PdfMetadataLexicon.XMP_NODE, PdfMetadataLexicon.XMP_NODE);
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.BASE_URL, metadata.getBaseURL());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATE_DATE, metadata.getCreateDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.CREATOR_TOOL, metadata.getCreatorTool());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.IDENTIFIER, metadata.getIdentifier());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.METADATA_DATE, metadata.getMetadataDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.MODIFY_DATE, metadata.getModifyDate());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.NICKNAME, metadata.getNickname());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.RATING, metadata.getRating());
            setPropertyIfMetadataPresent(xmpNode, XmpMetadataLexicon.LABEL, metadata.getLabel());
            return true;
        } catch (Exception e) {
            getLogger().error(e, "Couldn't process stream.");
        }
        return false;
    }

    /**
     * Determines the node the metadata is written to: a new output node is retyped and used directly, otherwise
     * a metadata child node is added to it.
     *
     * @param outputNode the output node handed to the sequencer
     * @return the node that receives the PDF metadata
     * @throws RepositoryException if the node cannot be retyped or the child cannot be added
     */
    private Node getPdfMetadataNode( Node outputNode ) throws RepositoryException {
        if (outputNode.isNew()) {
            outputNode.setPrimaryType(PdfMetadataLexicon.METADATA_NODE);
            return outputNode;
        }
        return outputNode.addNode(PdfMetadataLexicon.METADATA_NODE, PdfMetadataLexicon.METADATA_NODE);
    }

    /**
     * Sets the property on the node when the value carries usable content. <code>null</code> values, blank
     * strings and lists without any string element are skipped silently.
     *
     * @param node the node to set the property on
     * @param propertyName the name of the property
     * @param value the value; one of String, Boolean, Long, Integer, Calendar, byte[] or List
     * @throws RepositoryException if the property cannot be set
     * @throws IllegalArgumentException if the value is of any other type
     */
    private void setPropertyIfMetadataPresent( Node node,
                                               String propertyName,
                                               Object value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        if (value instanceof String) {
            // A blank string means "no metadata"; it must be skipped here rather than fall through to the
            // unknown-type error at the end of the chain.
            String stringValue = (String) value;
            if (!StringUtil.isBlank(stringValue)) {
                node.setProperty(propertyName, stringValue);
            }
        } else if (value instanceof Boolean) {
            node.setProperty(propertyName, (Boolean) value);
        } else if (value instanceof Long) {
            node.setProperty(propertyName, (Long) value);
        } else if (value instanceof Integer) {
            node.setProperty(propertyName, ((Integer) value).longValue());
        } else if (value instanceof Calendar) {
            // pdfbox 1.8.x doesn't parse the timezones correctly...
            // see PDFBOX-3352
            Calendar calendarValue = (Calendar) value;
            if ("unknown".equalsIgnoreCase(calendarValue.getTimeZone().getID())) {
                calendarValue.setTimeZone(TimeZone.getDefault());
            }
            node.setProperty(propertyName, calendarValue);
        } else if (value instanceof byte[]) {
            InputStream is = new ByteArrayInputStream((byte[]) value);
            Binary binaryProperty = node.getSession().getValueFactory().createBinary(is);
            node.setProperty(propertyName, binaryProperty);
        } else if (value instanceof List<?>) {
            ValueFactory vf = node.getSession().getValueFactory();
            // Only the string elements are kept; anything else in the list is dropped.
            List<Value> values = ((List<?>) value).stream()
                                                  .filter(val -> val instanceof String)
                                                  .map(val -> vf.createValue((String) val))
                                                  .collect(Collectors.toList());
            if (!values.isEmpty()) {
                node.setProperty(propertyName, values.toArray(new Value[0]));
            }
        } else {
            throw new IllegalArgumentException(String.format("The value of the property %s has unknown type and couldn't be saved.", propertyName));
        }
    }
}