/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.sequencer.msoffice;
import static org.modeshape.jcr.api.JcrConstants.JCR_MIME_TYPE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.AUTHOR;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CHARACTERS;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.COMMENT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CREATED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CREATING_APPLICATION;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.EXCEL_SHEET;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.EXCEL_SHEET_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.FULL_CONTENT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_LEVEL;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_NAME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.KEYWORDS;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.LAST_PRINTED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.METADATA_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.NOTES;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.PAGES;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.REVISION;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SAVED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SHEET_NAME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SLIDE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SLIDE_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SUBJECT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TEMPLATE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TEXT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.THUMBNAIL;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TITLE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TOTAL_EDITING_TIME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.WORDS;
import java.io.IOException;
import java.io.InputStream;
import javax.jcr.NamespaceRegistry;
import javax.jcr.Node;
import javax.jcr.Property;
import javax.jcr.RepositoryException;
import javax.jcr.Value;
import org.modeshape.common.util.CheckArg;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.nodetype.NodeTypeManager;
import org.modeshape.jcr.api.sequencer.Sequencer;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadata;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadataReader;
import org.modeshape.sequencer.msoffice.excel.ExcelSheetMetadata;
import org.modeshape.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.PowerpointMetadata;
import org.modeshape.sequencer.msoffice.powerpoint.SlideMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadataReader;
/**
* A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
* metadata to the repository.
* <p>
* This sequencer produces data that corresponds to the following structure:
* <ul>
* <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
* <ul>
* <li><strong>msoffice:title</strong> optional string property for the title of the document</li>
* <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
* <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
* <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
* <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
* <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
* <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
* <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
* <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
* <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
* <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
* <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
* <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
* <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
* <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
* <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
* <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of this document</li>
* <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
* <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
* </ul>
* </li>
* <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
* <ul>
* <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
* <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
* <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
* <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
* </ul>
* </li>
* </ul>
* </p>
*/
public class MSOfficeMetadataSequencer extends Sequencer {

    /**
     * The MIME types that this sequencer registers as its defaults in {@link #initialize} and matches against
     * (ignoring case) when deciding how to sequence a binary value.
     */
    public static final class MimeTypeConstants {
        /** One of the two MIME types treated as a Word document. */
        public static final String MICROSOFT_APPLICATION_MS_WORD = "application/msword";
        /** The other MIME type treated as a Word document. */
        public static final String MICROSOFT_WORD = "application/vnd.ms-word";
        /** The MIME type treated as an Excel workbook. */
        public static final String MICROSOFT_EXCEL = "application/vnd.ms-excel";
        /** The MIME type treated as a PowerPoint presentation. */
        public static final String MICROSOFT_POWERPOINT = "application/vnd.ms-powerpoint";
    }

    /**
     * Registers the node types from the {@code msoffice.cnd} resource and declares the Excel, PowerPoint and Word MIME
     * types as the default MIME types of this sequencer.
     *
     * @param registry the namespace registry (not used by this implementation)
     * @param nodeTypeManager the manager with which the node types are registered
     * @throws RepositoryException if the registration fails
     * @throws IOException if the registration fails while reading
     */
    @Override
    public void initialize( NamespaceRegistry registry,
                            NodeTypeManager nodeTypeManager ) throws RepositoryException, IOException {
        registerNodeTypes("msoffice.cnd", nodeTypeManager, true);
        registerDefaultMimeTypes(MimeTypeConstants.MICROSOFT_EXCEL,
                                 MimeTypeConstants.MICROSOFT_POWERPOINT,
                                 MimeTypeConstants.MICROSOFT_WORD,
                                 MimeTypeConstants.MICROSOFT_APPLICATION_MS_WORD);
    }

    /**
     * Sequences the binary value of the input property as a PowerPoint, Word or Excel document, depending on the MIME
     * type reported by the binary value.
     * <p>
     * The metadata is written directly onto the output node when that node is new (its primary type is then set to the
     * metadata node type); otherwise a child node named and typed as the metadata node is added and used instead. The
     * target node is prepared, and the MIME type recorded on it, before the MIME type is matched.
     * </p>
     *
     * @param inputProperty the property holding the binary content to sequence
     * @param outputNode the node that receives the output
     * @param context the context supplying the value factory
     * @return {@code true} if the MIME type was recognized and the content was sequenced, or {@code false} (after logging
     *         a warning) if the MIME type is not one of the recognized types
     * @throws Exception if reading the content or writing to the repository fails
     */
    @Override
    public boolean execute( Property inputProperty,
                            Node outputNode,
                            Context context ) throws Exception {
        Binary content = (Binary)inputProperty.getBinary();
        CheckArg.isNotNull(content, "binary");
        String mimeType = content.getMimeType(getInputFileName(inputProperty));

        Node target;
        if (outputNode.isNew()) {
            outputNode.setPrimaryType(METADATA_NODE);
            target = outputNode;
        } else {
            target = outputNode.addNode(METADATA_NODE, METADATA_NODE);
        }

        // The String overload of setProperty ignores null, so an undetermined MIME type is simply not recorded.
        setProperty(target, JCR_MIME_TYPE, mimeType);

        // Same precedence as a PowerPoint -> Word -> Excel chain of checks.
        boolean powerpoint = isPowerpoint(mimeType);
        boolean word = !powerpoint && isWord(mimeType);
        boolean excel = !powerpoint && !word && isExcel(mimeType);

        if (!powerpoint && !word && !excel) {
            getLogger().warn("Unknown mimetype: {0} for microsoft office", mimeType);
            return false;
        }

        // The stream is only opened once the document kind is known to be supported.
        try (InputStream stream = content.getStream()) {
            if (powerpoint) {
                sequencePowerpoint(target, context.valueFactory(), stream);
            } else if (word) {
                sequenceWord(target, context.valueFactory(), stream);
            } else {
                sequenceExcel(target, context.valueFactory(), stream);
            }
        }
        return true;
    }

    /**
     * Returns the name of the node two levels above the input property, i.e. the parent of the node that owns the
     * property. The result is handed to the binary value as the name used for MIME type detection.
     * NOTE(review): presumably this is the file node above the content node holding the data - confirm against callers.
     *
     * @param inputProperty the property being sequenced
     * @return the name of the grandparent node
     * @throws RepositoryException if the hierarchy cannot be navigated
     */
    private String getInputFileName( Property inputProperty ) throws RepositoryException {
        Node owner = inputProperty.getParent();
        Node container = owner.getParent();
        return container.getName();
    }

    /**
     * @param mimeType the MIME type to test; may be null
     * @return {@code true} if the MIME type equals the Excel MIME type, ignoring case
     */
    private boolean isExcel( String mimeType ) {
        final String expected = MimeTypeConstants.MICROSOFT_EXCEL;
        return expected.equalsIgnoreCase(mimeType);
    }

    /**
     * Reads the workbook from the stream and records the general document metadata and the full text on the given node,
     * followed by one child node per sheet carrying the sheet's name and text.
     *
     * @param sequencedNode the node that receives the metadata
     * @param valueFactory the factory used to create repository values
     * @param stream the content of the workbook
     * @throws IOException if the content cannot be read
     * @throws RepositoryException if the output cannot be written
     */
    private void sequenceExcel( Node sequencedNode,
                                org.modeshape.jcr.api.ValueFactory valueFactory,
                                InputStream stream ) throws IOException, RepositoryException {
        ExcelMetadata workbook = ExcelMetadataReader.instance(stream);
        recordMetadata(sequencedNode, valueFactory, workbook.getMetadata());
        setProperty(sequencedNode, FULL_CONTENT, workbook.getText());

        for (ExcelSheetMetadata sheet : workbook.getSheets()) {
            Node sheetNode = sequencedNode.addNode(EXCEL_SHEET, EXCEL_SHEET_NODE);
            setProperty(sheetNode, SHEET_NAME, sheet.getName());
            setProperty(sheetNode, TEXT, sheet.getText());
        }
    }

    /**
     * See http://blogs.msdn.com/b/vsofficedeveloper/archive/2008/05/08/office-2007-open-xml-mime-types.aspx
     *
     * @param mimeType the MIME type to test; may be null
     * @return {@code true} if the MIME type equals either of the two Word MIME types, ignoring case
     */
    private boolean isWord( String mimeType ) {
        if (MimeTypeConstants.MICROSOFT_WORD.equalsIgnoreCase(mimeType)) {
            return true;
        }
        return MimeTypeConstants.MICROSOFT_APPLICATION_MS_WORD.equalsIgnoreCase(mimeType);
    }

    /**
     * Reads the Word document from the stream and records the general document metadata on the given node, followed by
     * one child node per heading carrying the heading's text and level. The table of contents is not sequenced yet.
     *
     * @param rootNode the node that receives the metadata
     * @param valueFactory the factory used to create repository values
     * @param stream the content of the document
     * @throws RepositoryException if the output cannot be written
     * @throws IOException if the content cannot be read
     */
    private void sequenceWord( Node rootNode,
                               org.modeshape.jcr.api.ValueFactory valueFactory,
                               InputStream stream ) throws RepositoryException, IOException {
        WordMetadata document = WordMetadataReader.instance(stream);
        recordMetadata(rootNode, valueFactory, document.getMetadata());

        for (WordMetadata.WordHeading heading : document.getHeadings()) {
            // The heading node uses the same constant for its name and for its node type.
            Node headingNode = rootNode.addNode(HEADING_NODE, HEADING_NODE);
            setProperty(headingNode, HEADING_NAME, heading.getText());
            setProperty(headingNode, HEADING_LEVEL, heading.getHeaderLevel());
        }
    }

    /**
     * @param mimeType the MIME type to test; may be null
     * @return {@code true} if the MIME type equals the PowerPoint MIME type, ignoring case
     */
    private boolean isPowerpoint( String mimeType ) {
        final String expected = MimeTypeConstants.MICROSOFT_POWERPOINT;
        return expected.equalsIgnoreCase(mimeType);
    }

    /**
     * Reads the presentation from the stream and records the general document metadata on the given node, followed by
     * one child node per slide carrying the slide's title, text, notes and thumbnail.
     *
     * @param rootNode the node that receives the metadata
     * @param valueFactory the factory used to create repository values
     * @param stream the content of the presentation
     * @throws IOException if the content cannot be read
     * @throws RepositoryException if the output cannot be written
     */
    private void sequencePowerpoint( Node rootNode,
                                     org.modeshape.jcr.api.ValueFactory valueFactory,
                                     InputStream stream ) throws IOException, RepositoryException {
        PowerpointMetadata presentation = PowerPointMetadataReader.instance(stream);
        recordMetadata(rootNode, valueFactory, presentation.getMetadata());

        for (SlideMetadata slide : presentation.getSlides()) {
            Node slideNode = rootNode.addNode(SLIDE, SLIDE_NODE);
            setProperty(slideNode, TITLE, slide.getTitle());
            setProperty(slideNode, TEXT, slide.getText());
            setProperty(slideNode, NOTES, slide.getNotes());
            setProperty(slideNode, THUMBNAIL, valueFactory.createBinary(slide.getThumbnail()));
        }
    }

    /**
     * Copies the metadata shared by all document kinds onto the given node. Every value goes through one of the
     * {@code setProperty} helpers of this class, each of which leaves the property unset when the value is null.
     *
     * @param rootNode the node that receives the properties
     * @param valueFactory the factory used to create the date and binary values
     * @param metadata the metadata extracted from the document
     * @throws RepositoryException if a property cannot be written
     */
    private void recordMetadata( Node rootNode,
                                 org.modeshape.jcr.api.ValueFactory valueFactory,
                                 MSOfficeMetadata metadata ) throws RepositoryException {
        // Descriptive text
        setProperty(rootNode, TITLE, metadata.getTitle());
        setProperty(rootNode, SUBJECT, metadata.getSubject());
        setProperty(rootNode, AUTHOR, metadata.getAuthor());
        setProperty(rootNode, KEYWORDS, metadata.getKeywords());
        setProperty(rootNode, COMMENT, metadata.getComment());
        setProperty(rootNode, TEMPLATE, metadata.getTemplate());

        // Editing history
        setProperty(rootNode, SAVED, valueFactory.createValue(metadata.getLastSaved()));
        setProperty(rootNode, REVISION, metadata.getRevision());
        setProperty(rootNode, TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
        setProperty(rootNode, LAST_PRINTED, valueFactory.createValue(metadata.getLastPrinted()));
        setProperty(rootNode, CREATED, valueFactory.createValue(metadata.getCreated()));

        // Document statistics
        setProperty(rootNode, PAGES, metadata.getPages());
        setProperty(rootNode, WORDS, metadata.getWords());
        setProperty(rootNode, CHARACTERS, metadata.getCharacters());

        // Origin and preview
        setProperty(rootNode, CREATING_APPLICATION, metadata.getCreatingApplication());
        setProperty(rootNode, THUMBNAIL, valueFactory.createBinary(metadata.getThumbnail()));
    }

    /**
     * Sets a string property, doing nothing when the value is null.
     *
     * @param node the node to update
     * @param propertyName the name of the property
     * @param value the value; may be null
     * @throws RepositoryException if the property cannot be written
     */
    private void setProperty( Node node,
                              String propertyName,
                              String value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        node.setProperty(propertyName, value);
    }

    /**
     * Sets a property from a repository value, doing nothing when the value is null.
     *
     * @param node the node to update
     * @param propertyName the name of the property
     * @param value the value; may be null
     * @throws RepositoryException if the property cannot be written
     */
    private void setProperty( Node node,
                              String propertyName,
                              Value value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        node.setProperty(propertyName, value);
    }

    /**
     * Sets a binary property, doing nothing when the value is null.
     *
     * @param node the node to update
     * @param propertyName the name of the property
     * @param value the value; may be null
     * @throws RepositoryException if the property cannot be written
     */
    private void setProperty( Node node,
                              String propertyName,
                              Binary value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        node.setProperty(propertyName, value);
    }

    /**
     * Sets a long property, doing nothing when the value is null. The boxed {@link Long} is accepted on purpose so that
     * getters returning a null reference for an optional value can be passed straight through.
     *
     * @param node the node to update
     * @param propertyName the name of the property
     * @param value the value; may be null
     * @throws RepositoryException if the property cannot be written
     */
    private void setProperty( Node node,
                              String propertyName,
                              Long value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        long primitive = value.longValue();
        node.setProperty(propertyName, primitive);
    }

    /**
     * Sets a property from an integer, stored as a long, doing nothing when the value is null. The boxed
     * {@link Integer} is accepted on purpose so that getters returning a null reference for an optional value can be
     * passed straight through.
     *
     * @param node the node to update
     * @param propertyName the name of the property
     * @param value the value; may be null
     * @throws RepositoryException if the property cannot be written
     */
    private void setProperty( Node node,
                              String propertyName,
                              Integer value ) throws RepositoryException {
        if (value == null) {
            return;
        }
        long widened = value.longValue();
        node.setProperty(propertyName, widened);
    }
}