/** * villemos solutions [space^] (http://www.villemos.com) * Probe. Send. Act. Emergent solution. * Copyright 2011 Gert Villemos * All Rights Reserved. * * Released under the Apache license, version 2.0 (do what ever * you want, just dont claim ownership). * * NOTICE: All information contained herein is, and remains * the property of villemos solutions, and its suppliers * if any. The intellectual and technical concepts contained * herein are proprietary to villemos solutions * and its suppliers and may be covered by European and Foreign Patents, * patents in process, and are protected by trade secret or copyright law. * * Dissemination of this information or reproduction of this material * is strictly forbidden unless prior written permission is obtained * from villemos solutions. * * And it wouldn't be nice either. * */ package com.villemos.ispace.aperture; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Set; import org.apache.camel.Message; import org.apache.camel.impl.DefaultMessage; import org.ontoware.rdf2go.RDF2Go; import org.ontoware.rdf2go.model.Model; import org.ontoware.rdf2go.model.node.URI; import org.semanticdesktop.aperture.accessor.DataObject; import org.semanticdesktop.aperture.accessor.FileDataObject; import org.semanticdesktop.aperture.accessor.RDFContainerFactory; import org.semanticdesktop.aperture.crawler.Crawler; import org.semanticdesktop.aperture.crawler.CrawlerHandler; import org.semanticdesktop.aperture.crawler.ExitCode; import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; import org.semanticdesktop.aperture.extractor.ExtractorRegistry; import org.semanticdesktop.aperture.extractor.FileExtractor; import org.semanticdesktop.aperture.extractor.FileExtractorFactory; import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry; import org.semanticdesktop.aperture.extractor.util.ThreadedExtractorWrapper; import org.semanticdesktop.aperture.extractor.xmp.XMPExtractorFactory; import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier; import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl; import org.semanticdesktop.aperture.util.IOUtil; import org.semanticdesktop.aperture.vocabulary.NFO; import org.semanticdesktop.aperture.vocabulary.NID3; import org.semanticdesktop.aperture.vocabulary.NIE; import org.semanticdesktop.aperture.vocabulary.NMO; import org.semanticdesktop.aperture.subcrawler.SubCrawler; import org.semanticdesktop.aperture.subcrawler.SubCrawlerException; import org.semanticdesktop.aperture.subcrawler.SubCrawlerFactory; import org.semanticdesktop.aperture.subcrawler.SubCrawlerRegistry; import org.semanticdesktop.aperture.subcrawler.impl.DefaultSubCrawlerRegistry; import com.villemos.ispace.aperture.processor.IProcessor; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; public class ExtendedCrawlerHandler implements CrawlerHandler, RDFContainerFactory { private static org.apache.log4j.Logger Logger = org.apache.log4j.Logger.getLogger(ExtendedCrawlerHandler.class); protected List<Message> messages = null; protected SubCrawlerRegistry subCrawlerRegistry = new DefaultSubCrawlerRegistry(); protected ExtractorRegistry extractorRegistry = new DefaultExtractorRegistry(); protected MimeTypeIdentifier mimeTypeIdentifier = new MagicMimeTypeIdentifier(); protected XMPExtractorFactory xmpExtractorFactory = new XMPExtractorFactory(); // Max allowed file size in bytes protected long maxSize = 50000000; /** List of specific processors. */ private List<IProcessor> processors = new ArrayList<IProcessor>(); public ExtendedCrawlerHandler(List<Message> messages) { this.messages = messages; } public void accessingObject(Crawler crawler, String url) { // TODO Auto-generated method stub } public void clearFinished(Crawler crawler, ExitCode exitCode) { // TODO Auto-generated method stub } public void clearStarted(Crawler crawler) { // TODO Auto-generated method stub } public void clearingObject(Crawler crawler, String url) { // TODO Auto-generated method stub } public void crawlStarted(Crawler crawler) { // TODO Auto-generated method stub } public void crawlStopped(Crawler crawler, ExitCode exitCode) { // TODO Auto-generated method stub } public RDFContainerFactory getRDFContainerFactory(Crawler crawler, String url) { return this; } public void objectChanged(Crawler crawler, DataObject object) { if (object instanceof FileDataObject) { process(crawler, (FileDataObject) object); } object.getMetadata().dispose(); object.dispose(); } public void objectNew(Crawler crawler, DataObject object) { String mimetype = process(crawler, (FileDataObject) object); String fullText = ""; Collection fullTexts = object.getMetadata().getAll(NIE.plainTextContent); fullTexts.addAll(object.getMetadata().getAll(NMO.plainTextMessageContent)); fullTexts.addAll(object.getMetadata().getAll(NID3.unsynchronizedTextContent)); if (!fullTexts.isEmpty()) { for (Object fullTextObject : fullTexts) { fullText += fullTextObject.toString(); } } InformationObject io = new InformationObject(); /** Build a message and set it on the message list returned from the processor. */ if (((FileDataObject) object).getFile() != null) { io.hasTitle = ((FileDataObject) object).getFile().getName(); io.hasUri = ((FileDataObject) object).getFile().getAbsolutePath(); } else { io.hasTitle = ((FileDataObject) object).getID().toString(); io.hasUri = ((FileDataObject) object).getID().toString(); } io.ofMimeType = mimetype; io.fromSource = "File System"; io.withRawText = fullText; Message message = new DefaultMessage(); message.setBody(io); messages.add(message); /** Add all messages that the specific processors may be able to extract from this file. */ for (IProcessor processor : processors ) { messages.addAll(processor.process(message, ((FileDataObject) object).getFile())); } /** Clean up. */ object.getMetadata().dispose(); object.dispose(); } private boolean applyExtractor(URI id, InputStream contentStream, String mimeType, RDFContainer metadata) throws ExtractorException, IOException { Set extractors = extractorRegistry.getExtractorFactories(mimeType); boolean supportedByXmp = xmpExtractorFactory.getSupportedMimeTypes().contains(mimeType); boolean result = false; byte [] buffer = null; if (!extractors.isEmpty() && supportedByXmp) { buffer = IOUtil.readBytes(contentStream); } if (!extractors.isEmpty()) { ExtractorFactory factory = (ExtractorFactory) extractors.iterator().next(); Extractor extractor = factory.get(); ThreadedExtractorWrapper wrapper = new ThreadedExtractorWrapper(extractor); if (buffer != null) { contentStream = new BufferedInputStream(new ByteArrayInputStream(buffer)); } try { wrapper.extract(id, contentStream, null, mimeType, metadata); result = true; } catch (Exception e) { e.printStackTrace(); } } if (supportedByXmp) { Extractor extractor = xmpExtractorFactory.get(); ThreadedExtractorWrapper wrapper = new ThreadedExtractorWrapper(extractor); if (buffer != null) { contentStream = new BufferedInputStream(new ByteArrayInputStream(buffer)); } try { wrapper.extract(id, contentStream, null, mimeType, metadata); result = true; } catch (Exception e) { e.printStackTrace(); } } return result; } public String process(Crawler crawler, FileDataObject object) { String mimetype = null; if (object.getFile() != null && object.getFile().length() > maxSize) { Logger.info("Ignoring file " + object.getFile().getAbsolutePath() + " as it is above the configured maxSize (" + maxSize + "). File size is " + object.getFile().length()); } else { // String mimeType = identifyMimeType(crawler, object); try { URI id = object.getID(); // Create a buffer around the object's stream large enough to be able to reset the stream // after MIME type identification has taken place. Add some extra to the minimum array // length required by the MimeTypeIdentifier for safety. int minimumArrayLength = mimeTypeIdentifier.getMinArrayLength(); // we don't specify our own buffer size anymore, I commented this out (Antoni Mylka) //int bufferSize = Math.max(minimumArrayLength, 8192); InputStream contentStream = object.getContent(); contentStream.mark(minimumArrayLength + 10); // add some for safety // apply the MimeTypeIdentifier byte[] bytes = IOUtil.readBytes(contentStream, minimumArrayLength); mimetype = mimeTypeIdentifier.identify(bytes, object.getMetadata().getString(NFO.fileName), id); if (mimetype != null) { // add the MIME type to the metadata RDFContainer metadata = object.getMetadata(); metadata.add(NIE.mimeType, mimetype); contentStream.reset(); // apply an Extractor if available boolean done = applyExtractor(id, contentStream, mimetype, metadata); if (done) { return mimetype; } // else try to apply a FileExtractor done = applyFileExtractor(object, id, mimetype, metadata); if (done) { return mimetype; } // or maybe apply a SubCrawler done = applySubCrawler(id, contentStream, mimetype, object, crawler); } } catch (Exception e) { Logger.error("Caurght exception"); e.printStackTrace(); } } return mimetype; } @SuppressWarnings("unchecked") private boolean applyFileExtractor(FileDataObject object, URI id, String mimeType, RDFContainer metadata) throws ExtractorException, IOException { Set fileextractors = extractorRegistry.getFileExtractorFactories(mimeType); if (!fileextractors.isEmpty()) { FileExtractorFactory factory = (FileExtractorFactory) fileextractors.iterator().next(); FileExtractor extractor = factory.get(); File originalFile = object.getFile(); if (originalFile != null) { System.out.print("|fex:" + extractor.getClass().getName()); extractor.extract(id, originalFile, null, mimeType, metadata); return true; } else { File tempFile = object.downloadContent(); try { System.out.print("|fexd:" + extractor.getClass().getName()); extractor.extract(id, tempFile, null, mimeType, metadata); return true; } finally { if (tempFile != null) { tempFile.delete(); } } } } else { return false; } } @SuppressWarnings("unchecked") private boolean applySubCrawler(URI id, InputStream contentStream, String mimeType, DataObject object, Crawler crawler) throws SubCrawlerException { Set subCrawlers = subCrawlerRegistry.get(mimeType); if (!subCrawlers.isEmpty()) { SubCrawlerFactory factory = (SubCrawlerFactory) subCrawlers.iterator().next(); SubCrawler subCrawler = factory.get(); System.out.print("|sc:" + subCrawler.getClass().getName()); try { crawler.runSubCrawler(subCrawler, object, contentStream, null, mimeType); } catch (Exception e) { e.printStackTrace(); } return true; } else { return false; } } public void objectNotModified(Crawler crawler, String url) { // DO NULL. } public void objectRemoved(Crawler crawler, String url) { // TODO // Create delete message to the repository. } public RDFContainer getRDFContainer(URI uri) { Logger.trace("Creating RDF container for Information Element " + uri); Model model = RDF2Go.getModelFactory().createModel(uri); model.open(); return new RDFContainerImpl(model, uri); } public long getMaxSize() { return maxSize; } public void setMaxSize(long maxSize) { this.maxSize = maxSize; } public List<IProcessor> getProcessors() { return processors; } public void setProcessors(List<IProcessor> processors) { this.processors = processors; } }