/**
 *  OpenKM, Open Document Management System (http://www.openkm.com)
 *  Copyright (c) 2006-2011  Paco Avila & Josep Llort
 *
 *  No bytes were intentionally harmed during the development of this application.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.metadata;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.ontoware.rdf2go.RDF2Go;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.node.Node;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.semanticdesktop.aperture.extractor.Extractor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.extractor.ExtractorFactory;
import org.semanticdesktop.aperture.extractor.ExtractorRegistry;
import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.vocabulary.NCO;
import org.semanticdesktop.aperture.vocabulary.NFO;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.openkm.bean.kea.MetadataDTO;

/**
 * Extracts document metadata from a file into a {@link MetadataDTO}.
 *
 * <p>The file's MIME type is identified from its leading bytes, an Aperture
 * extractor registered for that MIME type fills an RDF container, and selected
 * properties of that container are copied into the DTO. Optionally the plain
 * text content is handed to a {@link SubjectExtractor} whose results are added
 * to the DTO as subjects.</p>
 *
 * <p>One {@link MetadataDTO} is created per instance and returned by every call
 * to {@link #extract(File)}; this class never replaces or clears it between
 * calls.</p>
 *
 * @author jllort
 *
 */
public class MetadataExtractor {
    private static final Logger log = LoggerFactory.getLogger(MetadataExtractor.class);

    /** Result object; created once in the constructor and filled by {@link #extract(File)}. */
    private MetadataDTO mdDTO;
    /** File passed to the most recent {@link #extract(File)} call. */
    private File tempFile;
    /** RDF container of the extraction in progress; {@code null} outside of {@link #extract(File)}. */
    private RDFContainer rdf;
    private SubjectExtractor subjectExtractor;
    /** Whether {@link #extract(File)} also adds suggested subjects. */
    private boolean se = true;

    /**
     * Creates an extractor.
     *
     * @param se {@code true} to create a {@link SubjectExtractor} and add its
     *        suggested subjects during {@link #extract(File)}; {@code false} to
     *        skip subject suggestion entirely.
     * @throws MetadataExtractionException declared for the {@link SubjectExtractor}
     *         construction.
     */
    public MetadataExtractor(boolean se) throws MetadataExtractionException {
        mdDTO = new MetadataDTO();
        this.se = se;

        if (se) {
            subjectExtractor = new SubjectExtractor();
        }
    }

    /**
     * Creates an extractor with subject suggestion enabled.
     *
     * @param subjectLimit value handed to the {@link SubjectExtractor} constructor
     *        (presumably the maximum number of suggested subjects -- confirm
     *        against SubjectExtractor).
     * @throws MetadataExtractionException declared for the {@link SubjectExtractor}
     *         construction.
     */
    public MetadataExtractor(int subjectLimit) throws MetadataExtractionException {
        mdDTO = new MetadataDTO();
        subjectExtractor = new SubjectExtractor(subjectLimit);
    }

    /**
     * Returns the file passed to the most recent {@link #extract(File)} call,
     * or {@code null} if none has been made yet.
     */
    public File getTempFile() {
        return tempFile;
    }

    /**
     * Returns the file name stored in the metadata DTO. This class never sets
     * that value itself.
     */
    public String getOriginalFileName() {
        return mdDTO.getFileName();
    }

    /**
     * Returns the metadata DTO owned by this extractor.
     */
    public MetadataDTO getMdDTO() {
        return mdDTO;
    }

    /**
     * Extracts the metadata of the given file into this extractor's DTO.
     *
     * <p>If the file cannot be read or its MIME type cannot be identified, a
     * {@link MetadataExtractionException} is thrown. If no Aperture extractor
     * exists for the MIME type, or the extractor itself fails, the problem is
     * logged and the DTO is filled from whatever the RDF container holds.</p>
     *
     * @param tempFile file to analyse.
     * @return the DTO owned by this extractor (same instance on every call).
     * @throws MetadataExtractionException if the file cannot be read, its MIME
     *         type cannot be identified, or subject suggestion fails.
     */
    public MetadataDTO extract(File tempFile) throws MetadataExtractionException {
        this.tempFile = tempFile;
        // Never let a container left over from a previous call be reused.
        rdf = null;

        try {
            loadRDF();
            extractMetadataFromRDF();

            if (se) {
                extractSuggestedSubjects();
            }

            return mdDTO;
        } catch (MetadataExtractionException e) {
            log.error("Metadata Extraction error: ");
            log.error(e.getMessage(), e);
            throw e;
        } finally {
            // Release the container on failure as well as on success.
            disposeRDF();
        }
    }

    /**
     * Disposes the current RDF container, if any, and clears the field so a
     * disposed container can never be read again.
     */
    private void disposeRDF() {
        if (rdf != null) {
            try {
                rdf.dispose();
            } finally {
                rdf = null;
            }
        }
    }

    /**
     * Identifies the MIME type of {@link #tempFile}, creates the RDF container
     * for it and runs the matching Aperture extractor on the file.
     *
     * @throws MetadataExtractionException if the file cannot be read or its
     *         MIME type cannot be identified; in that case no container is
     *         created.
     */
    private void loadRDF() throws MetadataExtractionException {
        String mimeType = identifyMimeType();
        mdDTO.setMimeType(mimeType);

        // create RDF metadata model
        URI uri = new URIImpl(tempFile.toURI().toString());
        Model rdfModel = RDF2Go.getModelFactory().createModel();
        rdfModel.open();
        rdf = new RDFContainerImpl(rdfModel, uri);

        runExtractor(uri, mimeType);
    }

    /**
     * Reads the leading bytes of {@link #tempFile} and asks the magic MIME type
     * identifier for the file's type.
     *
     * @return the identified MIME type, never {@code null}.
     * @throws MetadataExtractionException if the file is missing or unreadable,
     *         or the identifier returns no MIME type.
     */
    private String identifyMimeType() throws MetadataExtractionException {
        MimeTypeIdentifier identifier = new MagicMimeTypeIdentifier();
        byte[] bytes;

        // The underlying exception is logged here with its stack trace because
        // only the message constructor of MetadataExtractionException is known
        // to exist from this file.
        try {
            FileInputStream fis = new FileInputStream(tempFile);

            try {
                bytes = IOUtil.readBytes(new BufferedInputStream(fis), identifier.getMinArrayLength());
            } finally {
                fis.close();
            }
        } catch (FileNotFoundException e) {
            String msg = "Unable to locate the workspace file for: " + mdDTO.getFileName();
            log.error(msg, e);
            throw new MetadataExtractionException(msg);
        } catch (IOException e) {
            String msg = "Unable to read workspace file for: " + mdDTO.getFileName();
            log.error(msg, e);
            throw new MetadataExtractionException(msg);
        }

        String mimeType = identifier.identify(bytes, tempFile.getPath(), null);

        if (mimeType == null) {
            throw new MetadataExtractionException("Unable to extract MimeType for: " + mdDTO.getFileName());
        }

        return mimeType;
    }

    /**
     * Runs the first Aperture extractor registered for the MIME type on
     * {@link #tempFile}, writing into {@link #rdf}. This step is best effort:
     * a missing extractor or a failure while extracting is logged and the
     * container is left with whatever was written so far.
     *
     * @param uri URI identifying the file inside the RDF container.
     * @param mimeType MIME type used to look up the extractor.
     */
    @SuppressWarnings("unchecked")
    private void runExtractor(URI uri, String mimeType) {
        ExtractorRegistry extractorRegistry = new DefaultExtractorRegistry();
        Set<ExtractorFactory> factories = extractorRegistry.getExtractorFactories(mimeType);

        if (factories == null || factories.isEmpty()) {
            log.error("Unable to find extractor factory for: " + mimeType);
            return;
        }

        Extractor extractor = factories.iterator().next().get();

        try {
            FileInputStream fis = new FileInputStream(tempFile);

            try {
                extractor.extract(uri, new BufferedInputStream(fis, 8192), null, mimeType, rdf);
            } finally {
                // Closed here so the file handle is released whether or not extraction succeeds.
                fis.close();
            }
        } catch (FileNotFoundException e) {
            log.error("Unable to locate the workspace file for: " + mdDTO.getFileName(), e);
        } catch (IOException e) {
            log.error("Unable to read workspace file for: " + mdDTO.getFileName(), e);
        } catch (ExtractorException e) {
            log.error("Aperture extraction error: " + e.getMessage(), e);
        }
    }

    /**
     * Copies title, creator, subject, generator, creation and modification
     * dates, page count and keyword from the RDF container into the DTO.
     *
     * <p>The creator is the first non-empty {@code NCO.fullname} found among the
     * {@code NCO.creator} nodes, or the empty string when there is none.</p>
     */
    @SuppressWarnings("unchecked")
    private void extractMetadataFromRDF() {
        String creator = "";
        Collection<Node> creators = rdf.getAll(NCO.creator);

        for (Node node : creators) {
            // asURI() is only valid for URI nodes; skip anything else.
            if (!(node instanceof URI)) {
                continue;
            }

            // secondary RDF container rooted at the creator node
            RDFContainer container = new RDFContainerImpl(rdf.getModel(), node.asURI());
            String fullName = container.getString(NCO.fullname);

            if (fullName != null && !fullName.equals("")) {
                creator = fullName;
                break;
            }
        }

        // copy values to metadataDTO
        mdDTO.setTitle(rdf.getString(NIE.title));
        mdDTO.setCreator(creator);
        mdDTO.addSubject(rdf.getString(NIE.subject));
        mdDTO.setGenerator(rdf.getString(NIE.generator));
        mdDTO.setContentCreated(rdf.getDate(NIE.contentCreated));
        mdDTO.setContentLastModified(rdf.getDate(NIE.contentLastModified));
        mdDTO.setPageCount(rdf.getInteger(NFO.pageCount));
        mdDTO.setKeyword(rdf.getString(NIE.keyword));
    }

    /**
     * Passes the extracted plain text content to the subject extractor and adds
     * every suggested subject to the DTO.
     *
     * @throws MetadataExtractionException if the subject extractor fails.
     */
    private void extractSuggestedSubjects() throws MetadataExtractionException {
        List<String> sugSubjects = subjectExtractor.extractSuggestedSubjects(rdf.getString(NIE.plainTextContent));

        for (String subject : sugSubjects) {
            mdDTO.addSubject(subject);
        }
    }
}