/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.util.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.Date;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.catcode.odf.ODFMetaFileAnalyzer;
import com.catcode.odf.OpenDocumentMetadata;
import com.openkm.core.Config;
/**
 * Static helpers that read descriptive metadata (title, author, dates, counters...)
 * from PDF, MS Office and OpenDocument input streams.
 */
public class MetadataExtractor {
    private static final Logger log = LoggerFactory.getLogger(MetadataExtractor.class);

    /**
     * Extract metadata from PDF.
     *
     * @param is stream with the PDF content; this method does not close it explicitly.
     * @return a {@link PdfMetadata} filled with the page count and the values exposed by
     *         the document information object.
     * @throws IOException if the document cannot be loaded, read or closed.
     */
    public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
        PDDocument doc = PDDocument.load(is);

        try {
            PDDocumentInformation info = doc.getDocumentInformation();
            PdfMetadata md = new PdfMetadata();
            md.setNumberOfPages(doc.getNumberOfPages());
            md.setTitle(info.getTitle());
            md.setAuthor(info.getAuthor());
            md.setSubject(info.getSubject());
            md.setKeywords(info.getKeywords());
            md.setCreator(info.getCreator());
            md.setProducer(info.getProducer());
            md.setTrapped(info.getTrapped());
            md.setCreationDate(info.getCreationDate());
            md.setModificationDate(info.getModificationDate());
            log.info("pdfExtractor: {}", md);
            return md;
        } finally {
            // The loaded document holds parser resources; always release them,
            // also when reading one of the properties fails.
            doc.close();
        }
    }

    /**
     * Extract metadata from MS Office documents (Word, Excel or PowerPoint).
     *
     * @param is stream with the OLE2 document content.
     * @param mimeType MIME type used to select the extractor; it is compared against
     *        {@code Config.MIME_MS_WORD}, {@code Config.MIME_MS_EXCEL} and
     *        {@code Config.MIME_MS_POWERPOINT}.
     * @return an {@link OfficeMetadata} filled from the summary information. When the
     *         MIME type matches none of the supported ones, or no summary information
     *         is available, the returned object is left unpopulated.
     * @throws IOException if the stream cannot be read as a POIFS file system.
     */
    public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
        POIFSFileSystem fs = new POIFSFileSystem(is);
        OfficeMetadata md = new OfficeMetadata();
        SummaryInformation si = null;

        if (Config.MIME_MS_WORD.equals(mimeType)) {
            si = new WordExtractor(fs).getSummaryInformation();
        } else if (Config.MIME_MS_EXCEL.equals(mimeType)) {
            si = new ExcelExtractor(fs).getSummaryInformation();
        } else if (Config.MIME_MS_POWERPOINT.equals(mimeType)) {
            si = new PowerPointExtractor(fs).getSummaryInformation();
        }

        if (si != null) {
            md.setTitle(si.getTitle());
            md.setSubject(si.getSubject());
            md.setAuthor(si.getAuthor());
            md.setLastAuthor(si.getLastAuthor());
            md.setKeywords(si.getKeywords());
            md.setComments(si.getComments());
            md.setTemplate(si.getTemplate());
            md.setRevNumber(si.getRevNumber());
            md.setApplicationName(si.getApplicationName());
            md.setEditTime(si.getEditTime());
            md.setPageCount(si.getPageCount());
            md.setWordCount(si.getWordCount());
            md.setCharCount(si.getCharCount());
            md.setSecurity(si.getSecurity());
            // Date properties are optional (e.g. a document that was never printed),
            // so convert them null-safely instead of failing with a NullPointerException.
            md.setCreateDateTime(toCalendar(si.getCreateDateTime()));
            md.setLastSaveDateTime(toCalendar(si.getLastSaveDateTime()));
            md.setLastPrinted(toCalendar(si.getLastPrinted()));
        }

        log.info("officeExtractor: {}", md);
        return md;
    }

    /**
     * Extract metadata from an OpenDocument (OpenOffice) file.
     *
     * NOTE(review): the populated {@link OpenOfficeMetadata} is neither returned nor
     * stored, so callers currently only get the log line below. The signature is kept
     * as is for compatibility; returning the metadata object looks like the intended
     * behavior - confirm with the callers before changing it.
     *
     * @param is stream with the zipped OpenDocument content.
     */
    public static void OpenOfficeExtractor(InputStream is) {
        ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
        OpenDocumentMetadata odmt = analyzer.analyzeZip(is);
        OpenOfficeMetadata md = new OpenOfficeMetadata();

        if (odmt != null) {
            md.setTitle(odmt.getTitle());
            md.setSubject(odmt.getSubject());
            md.setCreator(odmt.getCreator());
            md.setInitialCreator(odmt.getInitialCreator());
            md.setKeyword(odmt.getKeyword());
            md.setDescription(odmt.getDescription());
            md.setEditingCycles(odmt.getEditingCycles());

            // The editing duration may be missing from the document; only flatten it
            // to a number of seconds when it is present.
            if (odmt.getEditingDuration() != null) {
                md.setEditingDuration((long) odmt.getEditingDuration().getSeconds() +
                        odmt.getEditingDuration().getMinutes() * 60 +
                        odmt.getEditingDuration().getHours() * 3600 +
                        odmt.getEditingDuration().getDays() * 86400);
            }

            md.setPageCount(odmt.getPageCount());
            md.setWordCount(odmt.getWordCount());
            md.setCharacterCount(odmt.getCharacterCount());
            // Dates are optional in the document metadata; convert them null-safely.
            md.setCreationDate(toCalendar(odmt.getCreationDate()));
            md.setDate(toCalendar(odmt.getDate()));
            md.setPrintDate(toCalendar(odmt.getPrintDate()));
        }

        log.info("OpenOfficeExtractor: {}", odmt);
    }

    /**
     * Convert a date to a calendar in the default time zone and locale.
     *
     * @param date the date to convert, may be {@code null}.
     * @return a calendar set to the given date, or {@code null} when the date is {@code null}.
     */
    private static Calendar toCalendar(Date date) {
        if (date == null) {
            return null;
        }

        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        return cal;
    }
}